html.go (view raw)
1//
2// Copyright (c) 2019 Ted Unangst <tedu@tedunangst.com>
3//
4// Permission to use, copy, modify, and distribute this software for any
5// purpose with or without fee is hereby granted, provided that the above
6// copyright notice and this permission notice appear in all copies.
7//
8// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16package main
17
18import (
19 "fmt"
20 "html/template"
21 "io"
22 "log"
23 "net/url"
24 "regexp"
25 "sort"
26 "strings"
27
28 "golang.org/x/net/html"
29)
30
// Allow and deny lists consulted by the HTML filter. init sorts each slice
// in place so that contains can binary-search it.
var (
	// permittedtags lists the elements render re-emits through writetag.
	permittedtags = []string{
		// block containers and headings
		"div", "h1", "h2", "h3", "h4", "h5", "h6",
		// tables
		"table", "thead", "tbody", "th", "tr", "td", "colgroup", "col",
		// paragraphs, breaks and quotations
		"p", "br", "pre", "code", "blockquote", "q",
		// semantic inline elements
		"samp", "mark", "ins", "dfn", "cite", "abbr", "address",
		// presentational inline elements
		"strong", "em", "b", "i", "s", "u", "sub", "sup", "del", "tt", "small",
		// lists
		"ol", "ul", "li", "dl", "dt", "dd",
	}

	// permittedattr lists the only attributes writetag copies to its output.
	permittedattr = []string{"colspan", "rowspan"}

	// bannedtags lists the elements render and textonly drop entirely,
	// without descending into their children.
	bannedtags = []string{"script", "style"}
)
41
42func init() {
43 sort.Strings(permittedtags)
44 sort.Strings(permittedattr)
45 sort.Strings(bannedtags)
46}
47
// contains reports whether tag is an element of array. It uses a binary
// search, so array must already be sorted in increasing order; on an
// unsorted slice the answer is unreliable.
func contains(array []string, tag string) bool {
	pos := sort.SearchStrings(array, tag)
	if pos >= len(array) {
		// tag would sort after every element, so it cannot be present.
		return false
	}
	return array[pos] == tag
}
52
53func getattr(node *html.Node, attr string) string {
54 for _, a := range node.Attr {
55 if a.Key == attr {
56 return a.Val
57 }
58 }
59 return ""
60}
61
62func hasclass(node *html.Node, class string) bool {
63 return strings.Contains(" "+getattr(node, "class")+" ", " "+class+" ")
64}
65
66func writetag(w io.Writer, node *html.Node) {
67 io.WriteString(w, "<")
68 io.WriteString(w, node.Data)
69 for _, attr := range node.Attr {
70 if contains(permittedattr, attr.Key) {
71 fmt.Fprintf(w, ` %s="%s"`, attr.Key, html.EscapeString(attr.Val))
72 }
73 }
74 io.WriteString(w, ">")
75}
76
77func render(w io.Writer, node *html.Node) {
78 if node.Type == html.ElementNode {
79 tag := node.Data
80 switch {
81 case tag == "a":
82 href := getattr(node, "href")
83 hrefurl, err := url.Parse(href)
84 if err != nil {
85 href = "#BROKEN-" + href
86 } else {
87 href = hrefurl.String()
88 }
89 fmt.Fprintf(w, `<a href="%s" rel=noreferrer>`, html.EscapeString(href))
90 case tag == "img":
91 div := replaceimg(node)
92 if div != "skip" {
93 io.WriteString(w, div)
94 }
95 case tag == "span":
96 case tag == "iframe":
97 src := html.EscapeString(getattr(node, "src"))
98 fmt.Fprintf(w, `<iframe src="<a href="%s">%s</a>">`, src, src)
99 case contains(permittedtags, tag):
100 writetag(w, node)
101 case contains(bannedtags, tag):
102 return
103 }
104 } else if node.Type == html.TextNode {
105 io.WriteString(w, html.EscapeString(node.Data))
106 }
107
108 for c := node.FirstChild; c != nil; c = c.NextSibling {
109 render(w, c)
110 }
111
112 if node.Type == html.ElementNode {
113 tag := node.Data
114 if tag == "a" || (contains(permittedtags, tag) && tag != "br") {
115 fmt.Fprintf(w, "</%s>", tag)
116 }
117 if tag == "p" || tag == "div" {
118 io.WriteString(w, "\n")
119 }
120 }
121}
122
123func replaceimg(node *html.Node) string {
124 src := getattr(node, "src")
125 alt := getattr(node, "alt")
126 //title := getattr(node, "title")
127 if hasclass(node, "Emoji") && alt != "" {
128 return html.EscapeString(alt)
129 }
130 return html.EscapeString(fmt.Sprintf(`<img src="%s">`, src))
131}
132
133func cleannode(node *html.Node) template.HTML {
134 var buf strings.Builder
135 render(&buf, node)
136 return template.HTML(buf.String())
137}
138
139func cleanstring(shtml string) template.HTML {
140 reader := strings.NewReader(shtml)
141 body, err := html.Parse(reader)
142 if err != nil {
143 log.Printf("error parsing html: %s", err)
144 return ""
145 }
146 return cleannode(body)
147}
148
149func textonly(w io.Writer, node *html.Node) {
150 switch node.Type {
151 case html.ElementNode:
152 tag := node.Data
153 switch {
154 case tag == "a":
155 href := getattr(node, "href")
156 fmt.Fprintf(w, `<a href="%s">`, href)
157 case tag == "img":
158 io.WriteString(w, "<img>")
159 case contains(bannedtags, tag):
160 return
161 }
162 case html.TextNode:
163 io.WriteString(w, node.Data)
164 }
165 for c := node.FirstChild; c != nil; c = c.NextSibling {
166 textonly(w, c)
167 }
168 if node.Type == html.ElementNode {
169 tag := node.Data
170 if tag == "a" {
171 fmt.Fprintf(w, "</%s>", tag)
172 }
173 if tag == "p" || tag == "div" {
174 io.WriteString(w, "\n")
175 }
176 }
177}
178
// Patterns htmltotext uses to tidy whitespace in the extracted text.
var (
	// re_whitespaceeater matches a newline together with any spaces, tabs
	// and carriage returns directly before and after it.
	re_whitespaceeater = regexp.MustCompile("[ \t\r]*\n[ \t\r]*")

	// re_blanklineeater matches a run of two or more newlines.
	re_blanklineeater = regexp.MustCompile("\n\n+")

	// re_tabeater matches a run of one or more spaces and tabs.
	re_tabeater = regexp.MustCompile("[ \t]+")
)
182
183func htmltotext(shtml template.HTML) string {
184 reader := strings.NewReader(string(shtml))
185 body, _ := html.Parse(reader)
186 var buf strings.Builder
187 textonly(&buf, body)
188 rv := buf.String()
189 rv = re_whitespaceeater.ReplaceAllLiteralString(rv, "\n")
190 rv = re_blanklineeater.ReplaceAllLiteralString(rv, "\n\n")
191 rv = re_tabeater.ReplaceAllLiteralString(rv, " ")
192 for len(rv) > 0 && rv[0] == '\n' {
193 rv = rv[1:]
194 }
195 return rv
196}