grayfriday @ 41251715adafaa3a6a5b0e624d92fc53fee525fa

blackfriday fork with a few changes

sanitize.go

package blackfriday

import (
	"bytes"
	"fmt"
	"io"

	"golang.org/x/net/html"
)

// Whitelisted element tags, attributes on particular tags, attributes that
// are interpreted as protocols (again on particular tags), and allowed
// protocols.
var (
	whitelistTags      map[string]bool
	whitelistAttrs     map[string]map[string]bool
	protocolAttrs      map[string]map[string]bool
	whitelistProtocols [][]byte
)

func init() {
	whitelistTags = toSet([]string{
		"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
		"colgroup", "dd", "div", "dl", "dt", "em",
		"h1", "h2", "h3", "h4", "h5", "h6",
		"i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
		"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
		"ul"})
	whitelistAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href", "title"}),
		"img": toSet([]string{"src", "alt", "title"}),
	}
	protocolAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href"}),
		"img": toSet([]string{"src"}),
	}
	whitelistProtocols = [][]byte{
		[]byte("http://"),
		[]byte("https://"),
		[]byte("ftp://"),
		[]byte("mailto:"),
	}
}

// toSet converts a list of keys into a set (a map from key to true) for
// O(1) membership tests.
func toSet(keys []string) map[string]bool {
	m := make(map[string]bool, len(keys))
	for _, k := range keys {
		m[k] = true
	}
	return m
}

// sanitizeHtmlSafe sanitizes the given input by parsing it as HTML5, keeping
// only elements and attributes known to be safe. All other HTML is escaped,
// and unsafe attribute values are stripped.
func sanitizeHtmlSafe(input []byte) []byte {
	tokenizer := html.NewTokenizer(bytes.NewReader(input))
	var w bytes.Buffer

	// Iterate through all tokens in the input stream and sanitize them.
	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
		switch t {
		case html.TextToken:
			// Text is written escaped.
			w.WriteString(tokenizer.Token().String())
		case html.StartTagToken, html.SelfClosingTagToken:
			// HTML tags are escaped unless whitelisted.
			tag, hasAttributes := tokenizer.TagName()
			tagName := string(tag)
			if whitelistTags[tagName] {
				w.WriteByte('<')
				w.Write(tag)
				for hasAttributes {
					var key, val []byte
					key, val, hasAttributes = tokenizer.TagAttr()
					attrName := string(key)
					// Only include whitelisted attributes for the given tagName.
					tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
					if ok && tagWhitelistedAttrs[attrName] {
						// For attributes that require protocol checking, strip
						// the attribute unless its value is known to be safe.
						tagProtocolAttrs, ok := protocolAttrs[tagName]
						if ok && tagProtocolAttrs[attrName] {
							if !protocolAllowed(val) {
								continue
							}
						}
						w.WriteByte(' ')
						w.Write(key)
						w.WriteString(`="`)
						w.WriteString(html.EscapeString(string(val)))
						w.WriteByte('"')
					}
				}
				if t == html.SelfClosingTagToken {
					w.WriteString("/>")
				} else {
					w.WriteByte('>')
				}
			} else {
				w.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.EndTagToken:
			// Whitelisted closing tags can be written out verbatim.
			tag, _ := tokenizer.TagName()
			if whitelistTags[string(tag)] {
				w.Write(tokenizer.Raw())
			} else {
				w.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.CommentToken, html.DoctypeToken:
			// Comments and doctypes are escaped so they cannot smuggle
			// markup through.
			w.WriteString(html.EscapeString(string(tokenizer.Raw())))
		default:
			panic(fmt.Errorf("unexpected token type %v", t))
		}
	}
	if err := tokenizer.Err(); err != nil && err != io.EOF {
		panic(err)
	}
	return w.Bytes()
}

// protocolAllowed reports whether the attribute value begins with one of the
// whitelisted protocols.
func protocolAllowed(attr []byte) bool {
	for _, prefix := range whitelistProtocols {
		if bytes.HasPrefix(attr, prefix) {
			return true
		}
	}
	return false
}
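
For illustration, a minimal, hypothetical usage sketch (not part of the commit above; it assumes a same-package test file, and TestSanitizeSketch and the expected output are mine): whitelisted tags and attributes pass through, an unsafe javascript: href is removed by the protocol check, and non-whitelisted markup is escaped rather than dropped.

package blackfriday

import "testing"

// Hypothetical sketch, not in the repo: exercises the tag whitelist, the
// protocol check, and the escaping of non-whitelisted elements.
func TestSanitizeSketch(t *testing.T) {
	in := []byte(`<a href="javascript:alert(1)" title="x">hi</a><script>evil()</script>`)
	got := string(sanitizeHtmlSafe(in))
	// The javascript: href fails protocolAllowed and is stripped; the <a>
	// tag and its title survive; <script> is not whitelisted, so its tags
	// are escaped (the element is neutralized, not removed).
	want := `<a title="x">hi</a>&lt;script&gt;evil()&lt;/script&gt;`
	if got != want {
		t.Errorf("got %q, want %q", got, want)
	}
}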