all repos — grayfriday @ 52f7a2a7b02d11db19411c28e6c67fc351f20aaf

blackfriday fork with a few changes

sanitize.go (view raw)

  1package blackfriday
  2
  3import (
  4	"bufio"
  5	"bytes"
  6	"code.google.com/p/go.net/html"
  7	"fmt"
  8	"io"
  9)
 10
 11// Whitelisted element tags, attributes on particular tags, attributes that are
 12// interpreted as protocols (again on particular tags), and allowed protocols.
 13var (
 14	whitelistTags      map[string]bool
 15	whitelistAttrs     map[string]map[string]bool
 16	protocolAttrs      map[string]map[string]bool
 17	whitelistProtocols [][]byte
 18)
 19
 20func init() {
 21	whitelistTags = toSet([]string{
 22		// Headings
 23		"h1", "h2", "h3", "h4", "h5", "h6",
 24		// Block elements
 25		"p", "pre", "blockquote", "hr", "div", "header", "article", "aside", "footer",
 26		"section", "main", "mark", "figure", "figcaption",
 27		// Inline elements
 28		"a", "br", "cite", "code", "img",
 29		// Lists
 30		"ol", "ul", "li",
 31		// Tables
 32		"table", "tbody", "td", "tfoot", "th", "thead", "tr", "colgroup", "col", "caption",
 33		// Formatting
 34		"u", "i", "em", "small", "strike", "b", "strong", "sub", "sup", "q",
 35		// Definition lists
 36		"dd", "dl", "dt",
 37	})
 38	whitelistAttrs = map[string]map[string]bool{
 39		"a":   toSet([]string{"href", "title", "rel"}),
 40		"img": toSet([]string{"src", "alt", "title"}),
 41		"td":  toSet([]string{"align"}),
 42		"th":  toSet([]string{"align"}),
 43	}
 44	protocolAttrs = map[string]map[string]bool{
 45		"a":   toSet([]string{"href"}),
 46		"img": toSet([]string{"src"}),
 47	}
 48	whitelistProtocols = [][]byte{
 49		[]byte("http://"),
 50		[]byte("https://"),
 51		[]byte("ftp://"),
 52		[]byte("mailto:"),
 53	}
 54}
 55
 56func toSet(keys []string) map[string]bool {
 57	m := make(map[string]bool, len(keys))
 58	for _, k := range keys {
 59		m[k] = true
 60	}
 61	return m
 62}
 63
 64// Sanitizes the given input by parsing it as HTML5, then whitelisting known to
 65// be safe elements and attributes. All other HTML is escaped, unsafe attributes
 66// are stripped.
 67func sanitizeHtmlSafe(input []byte) []byte {
 68	r := bytes.NewReader(input)
 69	var w bytes.Buffer
 70	tokenizer := html.NewTokenizer(r)
 71	wr := bufio.NewWriter(&w)
 72
 73	// Iterate through all tokens in the input stream and sanitize them.
 74	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
 75		switch t {
 76		case html.TextToken:
 77			// Text is written escaped.
 78			wr.WriteString(tokenizer.Token().String())
 79		case html.SelfClosingTagToken, html.StartTagToken:
 80			// HTML tags are escaped unless whitelisted.
 81			tag, hasAttributes := tokenizer.TagName()
 82			tagName := string(tag)
 83			if whitelistTags[tagName] {
 84				wr.WriteString("<")
 85				wr.Write(tag)
 86				for hasAttributes {
 87					var key, val []byte
 88					key, val, hasAttributes = tokenizer.TagAttr()
 89					attrName := string(key)
 90					// Only include whitelisted attributes for the given tagName.
 91					tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
 92					if ok && tagWhitelistedAttrs[attrName] {
 93						// For whitelisted attributes, if it's an attribute that requires
 94						// protocol checking, do so and strip it if it's not known to be safe.
 95						tagProtocolAttrs, ok := protocolAttrs[tagName]
 96						if ok && tagProtocolAttrs[attrName] {
 97							if !isRelativeLink(val) && !protocolAllowed(val) {
 98								continue
 99							}
100						}
101						wr.WriteByte(' ')
102						wr.Write(key)
103						wr.WriteString(`="`)
104						wr.WriteString(html.EscapeString(string(val)))
105						wr.WriteByte('"')
106					}
107				}
108				if t == html.SelfClosingTagToken {
109					wr.WriteString("/>")
110				} else {
111					wr.WriteString(">")
112				}
113			} else {
114				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
115			}
116			// Make sure that tags like <script> that switch the parser into raw mode
117			// do not destroy the parse mode for following HTML text (the point is to
118			// escape them anyway). For that, switch off raw mode in the tokenizer.
119			tokenizer.NextIsNotRawText()
120		case html.EndTagToken:
121			// Whitelisted tokens can be written in raw.
122			tag, _ := tokenizer.TagName()
123			if whitelistTags[string(tag)] {
124				wr.Write(tokenizer.Raw())
125			} else {
126				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
127			}
128		case html.CommentToken:
129			// Comments are not really expected, but harmless.
130			wr.Write(tokenizer.Raw())
131		case html.DoctypeToken:
132			// Escape DOCTYPES, entities etc can be dangerous
133			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
134		default:
135			tokenizer.Token()
136			panic(fmt.Errorf("Unexpected token type %v", t))
137		}
138	}
139	err := tokenizer.Err()
140	if err != nil && err != io.EOF {
141		panic(tokenizer.Err())
142	}
143	wr.Flush()
144	return w.Bytes()
145}
146
147func protocolAllowed(attr []byte) bool {
148	for _, prefix := range whitelistProtocols {
149		if bytes.HasPrefix(attr, prefix) {
150			return true
151		}
152	}
153	return false
154}