all repos — honk @ 844cdfc15e932cfd026bff6104411bba73efb5be

my fork of honk

html.go (view raw)

  1//
  2// Copyright (c) 2019 Ted Unangst <tedu@tedunangst.com>
  3//
  4// Permission to use, copy, modify, and distribute this software for any
  5// purpose with or without fee is hereby granted, provided that the above
  6// copyright notice and this permission notice appear in all copies.
  7//
  8// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  9// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 10// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 11// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 12// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 13// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 14// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 15
 16package main
 17
 18import (
 19	"fmt"
 20	"html/template"
 21	"io"
 22	"log"
 23	"net/url"
 24	"regexp"
 25	"sort"
 26	"strings"
 27
 28	"golang.org/x/net/html"
 29)
 30
 31var permittedtags = []string{"div", "h1", "h2", "h3", "h4", "h5", "h6",
 32	"table", "thead", "tbody", "th", "tr", "td",
 33	"p", "br", "pre", "code", "blockquote",
 34	"strong", "em", "b", "i", "s", "u", "sup", "del",
 35	"ol", "ul", "li"}
 36var permittedattr = []string{"colspan", "rowspan"}
 37var bannedtags = []string{"script", "style"}
 38
 39func init() {
 40	sort.Strings(permittedtags)
 41	sort.Strings(permittedattr)
 42	sort.Strings(bannedtags)
 43}
 44
 45func contains(array []string, tag string) bool {
 46	idx := sort.SearchStrings(array, tag)
 47	return idx < len(array) && array[idx] == tag
 48}
 49
 50func getattr(node *html.Node, attr string) string {
 51	for _, a := range node.Attr {
 52		if a.Key == attr {
 53			return a.Val
 54		}
 55	}
 56	return ""
 57}
 58
 59func hasclass(node *html.Node, class string) bool {
 60	return strings.Contains(" "+getattr(node, "class")+" ", " "+class+" ")
 61}
 62
 63func writetag(w io.Writer, node *html.Node) {
 64	io.WriteString(w, "<")
 65	io.WriteString(w, node.Data)
 66	for _, attr := range node.Attr {
 67		if contains(permittedattr, attr.Key) {
 68			fmt.Fprintf(w, ` %s="%s"`, attr.Key, html.EscapeString(attr.Val))
 69		}
 70	}
 71	io.WriteString(w, ">")
 72}
 73
 74func render(w io.Writer, node *html.Node) {
 75	switch node.Type {
 76	case html.ElementNode:
 77		tag := node.Data
 78		switch {
 79		case tag == "a":
 80			href := getattr(node, "href")
 81			hrefurl, err := url.Parse(href)
 82			if err != nil {
 83				href = "#BROKEN-" + href
 84			} else {
 85				href = hrefurl.String()
 86			}
 87			fmt.Fprintf(w, `<a href="%s" rel=noreferrer>`, html.EscapeString(href))
 88		case tag == "img":
 89			div := replaceimg(node)
 90			if div != "skip" {
 91				io.WriteString(w, div)
 92			}
 93		case tag == "span":
 94		case tag == "iframe":
 95			src := html.EscapeString(getattr(node, "src"))
 96			fmt.Fprintf(w, `&lt;iframe src="<a href="%s">%s</a>"&gt;`, src, src)
 97		case contains(permittedtags, tag):
 98			writetag(w, node)
 99		case contains(bannedtags, tag):
100			return
101		}
102	case html.TextNode:
103		io.WriteString(w, html.EscapeString(node.Data))
104	}
105	for c := node.FirstChild; c != nil; c = c.NextSibling {
106		render(w, c)
107	}
108	if node.Type == html.ElementNode {
109		tag := node.Data
110		if tag == "a" || (contains(permittedtags, tag) && tag != "br") {
111			fmt.Fprintf(w, "</%s>", tag)
112		}
113		if tag == "p" || tag == "div" {
114			io.WriteString(w, "\n")
115		}
116	}
117}
118
119func replaceimg(node *html.Node) string {
120	src := getattr(node, "src")
121	alt := getattr(node, "alt")
122	//title := getattr(node, "title")
123	if hasclass(node, "Emoji") && alt != "" {
124		return html.EscapeString(alt)
125	}
126	return html.EscapeString(fmt.Sprintf(`<img src="%s">`, src))
127}
128
129func cleannode(node *html.Node) template.HTML {
130	var buf strings.Builder
131	render(&buf, node)
132	return template.HTML(buf.String())
133}
134
135func cleanstring(shtml string) template.HTML {
136	reader := strings.NewReader(shtml)
137	body, err := html.Parse(reader)
138	if err != nil {
139		log.Printf("error parsing html: %s", err)
140		return ""
141	}
142	return cleannode(body)
143}
144
145func textonly(w io.Writer, node *html.Node) {
146	switch node.Type {
147	case html.ElementNode:
148		tag := node.Data
149		switch {
150		case tag == "a":
151			href := getattr(node, "href")
152			fmt.Fprintf(w, `<a href="%s">`, href)
153		case tag == "img":
154			io.WriteString(w, "<img>")
155		case contains(bannedtags, tag):
156			return
157		}
158	case html.TextNode:
159		io.WriteString(w, node.Data)
160	}
161	for c := node.FirstChild; c != nil; c = c.NextSibling {
162		textonly(w, c)
163	}
164	if node.Type == html.ElementNode {
165		tag := node.Data
166		if tag == "a" {
167			fmt.Fprintf(w, "</%s>", tag)
168		}
169		if tag == "p" || tag == "div" {
170			io.WriteString(w, "\n")
171		}
172	}
173}
174
// Whitespace clean-up patterns used by htmltotext.
var (
	// re_whitespaceeater matches a newline together with any spaces, tabs
	// and carriage returns directly before or after it.
	re_whitespaceeater = regexp.MustCompile("[ \t\r]*\n[ \t\r]*")

	// re_blanklineeater matches a run of two or more newlines.
	re_blanklineeater = regexp.MustCompile("\n\n+")

	// re_tabeater matches a run of one or more spaces and tabs.
	re_tabeater = regexp.MustCompile("[ \t]+")
)
178
179func htmltotext(shtml template.HTML) string {
180	reader := strings.NewReader(string(shtml))
181	body, _ := html.Parse(reader)
182	var buf strings.Builder
183	textonly(&buf, body)
184	rv := buf.String()
185	rv = re_whitespaceeater.ReplaceAllLiteralString(rv, "\n")
186	rv = re_blanklineeater.ReplaceAllLiteralString(rv, "\n\n")
187	rv = re_tabeater.ReplaceAllLiteralString(rv, " ")
188	for len(rv) > 0 && rv[0] == '\n' {
189		rv = rv[1:]
190	}
191	return rv
192}