all repos — honk @ a4d92f2e3fc9ae6692393f6bd22a0b6e609bf30f

my fork of honk

html.go (view raw)

  1//
  2// Copyright (c) 2019 Ted Unangst <tedu@tedunangst.com>
  3//
  4// Permission to use, copy, modify, and distribute this software for any
  5// purpose with or without fee is hereby granted, provided that the above
  6// copyright notice and this permission notice appear in all copies.
  7//
  8// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15
 16package main
 17
 18import (
 19	"fmt"
 20	"html/template"
 21	"io"
 22	"log"
 23	"net/url"
 24	"regexp"
 25	"sort"
 26	"strings"
 27
 28	"golang.org/x/net/html"
 29)
 30
 31var permittedtags = []string{
 32	"div", "h1", "h2", "h3", "h4", "h5", "h6",
 33	"table", "thead", "tbody", "th", "tr", "td", "colgroup", "col",
 34	"p", "br", "pre", "code", "blockquote", "q",
 35	"samp", "mark", "ins", "dfn", "cite", "abbr", "address",
 36	"strong", "em", "b", "i", "s", "u", "sub", "sup", "del", "tt", "small",
 37	"ol", "ul", "li", "dl", "dt", "dd",
 38}
 39var permittedattr = []string{"colspan", "rowspan"}
 40var bannedtags = []string{"script", "style"}
 41
 42func init() {
 43	sort.Strings(permittedtags)
 44	sort.Strings(permittedattr)
 45	sort.Strings(bannedtags)
 46}
 47
 48func contains(array []string, tag string) bool {
 49	idx := sort.SearchStrings(array, tag)
 50	return idx < len(array) && array[idx] == tag
 51}
 52
 53func getattr(node *html.Node, attr string) string {
 54	for _, a := range node.Attr {
 55		if a.Key == attr {
 56			return a.Val
 57		}
 58	}
 59	return ""
 60}
 61
 62func hasclass(node *html.Node, class string) bool {
 63	return strings.Contains(" "+getattr(node, "class")+" ", " "+class+" ")
 64}
 65
 66func writetag(w io.Writer, node *html.Node) {
 67	io.WriteString(w, "<")
 68	io.WriteString(w, node.Data)
 69	for _, attr := range node.Attr {
 70		if contains(permittedattr, attr.Key) {
 71			fmt.Fprintf(w, ` %s="%s"`, attr.Key, html.EscapeString(attr.Val))
 72		}
 73	}
 74	io.WriteString(w, ">")
 75}
 76
 77func render(w io.Writer, node *html.Node) {
 78	if node.Type == html.ElementNode {
 79		tag := node.Data
 80		switch {
 81		case tag == "a":
 82			href := getattr(node, "href")
 83			hrefurl, err := url.Parse(href)
 84			if err != nil {
 85				href = "#BROKEN-" + href
 86			} else {
 87				href = hrefurl.String()
 88			}
 89			fmt.Fprintf(w, `<a href="%s" rel=noreferrer>`, html.EscapeString(href))
 90		case tag == "img":
 91			div := replaceimg(node)
 92			if div != "skip" {
 93				io.WriteString(w, div)
 94			}
 95		case tag == "span":
 96		case tag == "iframe":
 97			src := html.EscapeString(getattr(node, "src"))
 98			fmt.Fprintf(w, `&lt;iframe src="<a href="%s">%s</a>"&gt;`, src, src)
 99		case contains(permittedtags, tag):
100			writetag(w, node)
101		case contains(bannedtags, tag):
102			return
103		}
104	} else if node.Type == html.TextNode {
105		io.WriteString(w, html.EscapeString(node.Data))
106	}
107
108	for c := node.FirstChild; c != nil; c = c.NextSibling {
109		render(w, c)
110	}
111
112	if node.Type == html.ElementNode {
113		tag := node.Data
114		if tag == "a" || (contains(permittedtags, tag) && tag != "br") {
115			fmt.Fprintf(w, "</%s>", tag)
116		}
117		if tag == "p" || tag == "div" {
118			io.WriteString(w, "\n")
119		}
120	}
121}
122
123func replaceimg(node *html.Node) string {
124	src := getattr(node, "src")
125	alt := getattr(node, "alt")
126	//title := getattr(node, "title")
127	if hasclass(node, "Emoji") && alt != "" {
128		return html.EscapeString(alt)
129	}
130	return html.EscapeString(fmt.Sprintf(`<img src="%s">`, src))
131}
132
133func cleannode(node *html.Node) template.HTML {
134	var buf strings.Builder
135	render(&buf, node)
136	return template.HTML(buf.String())
137}
138
139func cleanstring(shtml string) template.HTML {
140	reader := strings.NewReader(shtml)
141	body, err := html.Parse(reader)
142	if err != nil {
143		log.Printf("error parsing html: %s", err)
144		return ""
145	}
146	return cleannode(body)
147}
148
149func textonly(w io.Writer, node *html.Node) {
150	switch node.Type {
151	case html.ElementNode:
152		tag := node.Data
153		switch {
154		case tag == "a":
155			href := getattr(node, "href")
156			fmt.Fprintf(w, `<a href="%s">`, href)
157		case tag == "img":
158			io.WriteString(w, "<img>")
159		case contains(bannedtags, tag):
160			return
161		}
162	case html.TextNode:
163		io.WriteString(w, node.Data)
164	}
165	for c := node.FirstChild; c != nil; c = c.NextSibling {
166		textonly(w, c)
167	}
168	if node.Type == html.ElementNode {
169		tag := node.Data
170		if tag == "a" {
171			fmt.Fprintf(w, "</%s>", tag)
172		}
173		if tag == "p" || tag == "div" {
174			io.WriteString(w, "\n")
175		}
176	}
177}
178
179var re_whitespaceeater = regexp.MustCompile("[ \t\r]*\n[ \t\r]*")
180var re_blanklineeater = regexp.MustCompile("\n\n+")
181var re_tabeater = regexp.MustCompile("[ \t]+")
182
183func htmltotext(shtml template.HTML) string {
184	reader := strings.NewReader(string(shtml))
185	body, _ := html.Parse(reader)
186	var buf strings.Builder
187	textonly(&buf, body)
188	rv := buf.String()
189	rv = re_whitespaceeater.ReplaceAllLiteralString(rv, "\n")
190	rv = re_blanklineeater.ReplaceAllLiteralString(rv, "\n\n")
191	rv = re_tabeater.ReplaceAllLiteralString(rv, " ")
192	for len(rv) > 0 && rv[0] == '\n' {
193		rv = rv[1:]
194	}
195	return rv
196}