html.go (view raw)
1//
2// Copyright (c) 2019 Ted Unangst <tedu@tedunangst.com>
3//
4// Permission to use, copy, modify, and distribute this software for any
5// purpose with or without fee is hereby granted, provided that the above
6// copyright notice and this permission notice appear in all copies.
7//
8// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
16package main
17
18import (
19 "fmt"
20 "html/template"
21 "io"
22 "log"
23 "net/url"
24 "regexp"
25 "sort"
26 "strings"
27
28 "golang.org/x/net/html"
29)
30
// Policy tables consulted while filtering HTML. They are looked up with
// contains, which binary searches, so init sorts each of them at startup.
var (
	// permittedtags lists the elements whose tags are copied to the output.
	permittedtags = []string{
		"div", "h1", "h2", "h3", "h4", "h5", "h6",
		"table", "thead", "tbody", "th", "tr", "td",
		"p", "br", "pre", "code", "blockquote",
		"strong", "em", "b", "i", "s", "u", "sub", "sup", "del",
		"ol", "ul", "li",
	}

	// permittedattr lists the attributes kept on permitted tags.
	permittedattr = []string{"colspan", "rowspan"}

	// bannedtags lists the elements that are dropped together with all of
	// their children.
	bannedtags = []string{"script", "style"}
)
38
39func init() {
40 sort.Strings(permittedtags)
41 sort.Strings(permittedattr)
42 sort.Strings(bannedtags)
43}
44
// contains reports whether tag is an element of array. The lookup is a
// binary search, so array must already be sorted in increasing order.
func contains(array []string, tag string) bool {
	pos := sort.SearchStrings(array, tag)
	if pos >= len(array) {
		// tag would sort after every element.
		return false
	}
	return array[pos] == tag
}
49
50func getattr(node *html.Node, attr string) string {
51 for _, a := range node.Attr {
52 if a.Key == attr {
53 return a.Val
54 }
55 }
56 return ""
57}
58
59func hasclass(node *html.Node, class string) bool {
60 return strings.Contains(" "+getattr(node, "class")+" ", " "+class+" ")
61}
62
63func writetag(w io.Writer, node *html.Node) {
64 io.WriteString(w, "<")
65 io.WriteString(w, node.Data)
66 for _, attr := range node.Attr {
67 if contains(permittedattr, attr.Key) {
68 fmt.Fprintf(w, ` %s="%s"`, attr.Key, html.EscapeString(attr.Val))
69 }
70 }
71 io.WriteString(w, ">")
72}
73
74func render(w io.Writer, node *html.Node) {
75 switch node.Type {
76 case html.ElementNode:
77 tag := node.Data
78 switch {
79 case tag == "a":
80 href := getattr(node, "href")
81 hrefurl, err := url.Parse(href)
82 if err != nil {
83 href = "#BROKEN-" + href
84 } else {
85 href = hrefurl.String()
86 }
87 fmt.Fprintf(w, `<a href="%s" rel=noreferrer>`, html.EscapeString(href))
88 case tag == "img":
89 div := replaceimg(node)
90 if div != "skip" {
91 io.WriteString(w, div)
92 }
93 case tag == "span":
94 case tag == "iframe":
95 src := html.EscapeString(getattr(node, "src"))
96 fmt.Fprintf(w, `<iframe src="<a href="%s">%s</a>">`, src, src)
97 case contains(permittedtags, tag):
98 writetag(w, node)
99 case contains(bannedtags, tag):
100 return
101 }
102 case html.TextNode:
103 io.WriteString(w, html.EscapeString(node.Data))
104 }
105 for c := node.FirstChild; c != nil; c = c.NextSibling {
106 render(w, c)
107 }
108 if node.Type == html.ElementNode {
109 tag := node.Data
110 if tag == "a" || (contains(permittedtags, tag) && tag != "br") {
111 fmt.Fprintf(w, "</%s>", tag)
112 }
113 if tag == "p" || tag == "div" {
114 io.WriteString(w, "\n")
115 }
116 }
117}
118
119func replaceimg(node *html.Node) string {
120 src := getattr(node, "src")
121 alt := getattr(node, "alt")
122 //title := getattr(node, "title")
123 if hasclass(node, "Emoji") && alt != "" {
124 return html.EscapeString(alt)
125 }
126 return html.EscapeString(fmt.Sprintf(`<img src="%s">`, src))
127}
128
129func cleannode(node *html.Node) template.HTML {
130 var buf strings.Builder
131 render(&buf, node)
132 return template.HTML(buf.String())
133}
134
135func cleanstring(shtml string) template.HTML {
136 reader := strings.NewReader(shtml)
137 body, err := html.Parse(reader)
138 if err != nil {
139 log.Printf("error parsing html: %s", err)
140 return ""
141 }
142 return cleannode(body)
143}
144
145func textonly(w io.Writer, node *html.Node) {
146 switch node.Type {
147 case html.ElementNode:
148 tag := node.Data
149 switch {
150 case tag == "a":
151 href := getattr(node, "href")
152 fmt.Fprintf(w, `<a href="%s">`, href)
153 case tag == "img":
154 io.WriteString(w, "<img>")
155 case contains(bannedtags, tag):
156 return
157 }
158 case html.TextNode:
159 io.WriteString(w, node.Data)
160 }
161 for c := node.FirstChild; c != nil; c = c.NextSibling {
162 textonly(w, c)
163 }
164 if node.Type == html.ElementNode {
165 tag := node.Data
166 if tag == "a" {
167 fmt.Fprintf(w, "</%s>", tag)
168 }
169 if tag == "p" || tag == "div" {
170 io.WriteString(w, "\n")
171 }
172 }
173}
174
// Patterns used by htmltotext to tidy up whitespace in its output.
var (
	// re_whitespaceeater matches a newline plus any spaces, tabs and
	// carriage returns directly before and after it.
	re_whitespaceeater = regexp.MustCompile("[ \t\r]*\n[ \t\r]*")

	// re_blanklineeater matches a run of two or more newlines.
	re_blanklineeater = regexp.MustCompile("\n\n+")

	// re_tabeater matches a run of one or more spaces and tabs.
	re_tabeater = regexp.MustCompile("[ \t]+")
)
178
179func htmltotext(shtml template.HTML) string {
180 reader := strings.NewReader(string(shtml))
181 body, _ := html.Parse(reader)
182 var buf strings.Builder
183 textonly(&buf, body)
184 rv := buf.String()
185 rv = re_whitespaceeater.ReplaceAllLiteralString(rv, "\n")
186 rv = re_blanklineeater.ReplaceAllLiteralString(rv, "\n\n")
187 rv = re_tabeater.ReplaceAllLiteralString(rv, " ")
188 for len(rv) > 0 && rv[0] == '\n' {
189 rv = rv[1:]
190 }
191 return rv
192}