sanitize.go
package blackfriday

import (
	"bytes"
	"fmt"
	"io"

	"golang.org/x/net/html"
)

// Whitelisted element tags, attributes on particular tags, attributes that are
// interpreted as protocols (again on particular tags), and allowed protocols.
var (
	whitelistTags      map[string]bool
	whitelistAttrs     map[string]map[string]bool
	protocolAttrs      map[string]map[string]bool
	whitelistProtocols [][]byte
)

func init() {
	whitelistTags = toSet([]string{
		"a", "b", "blockquote", "br", "caption", "cite", "code", "col",
		"colgroup", "dd", "div", "dl", "dt", "em",
		"h1", "h2", "h3", "h4", "h5", "h6",
		"i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong",
		"sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u",
		"ul"})
	whitelistAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href", "title"}),
		"img": toSet([]string{"src", "alt", "title"}),
	}
	protocolAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href"}),
		"img": toSet([]string{"src"}),
	}
	whitelistProtocols = [][]byte{
		[]byte("http://"),
		[]byte("https://"),
		[]byte("ftp://"),
		[]byte("mailto:"),
	}
}
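
// For illustration (these lookups are not in the original file, but follow
// directly from the init tables above); this is how the maps compose when
// sanitizeHtmlSafe consults them:
//
//	whitelistTags["a"]          // true: <a> elements survive sanitization
//	whitelistAttrs["a"]["href"] // true: href is kept on <a> tags
//	protocolAttrs["a"]["href"]  // true: href values must pass protocolAllowed
//	whitelistTags["script"]     // false: <script> tags are escaped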

// toSet converts a slice of strings into a set represented as a
// map from string to bool.
func toSet(keys []string) map[string]bool {
	m := make(map[string]bool, len(keys))
	for _, k := range keys {
		m[k] = true
	}
	return m
}

// sanitizeHtmlSafe sanitizes the given input by parsing it as HTML5, then
// whitelisting elements and attributes known to be safe. All other HTML is
// escaped, and unsafe attributes are stripped.
func sanitizeHtmlSafe(input []byte) []byte {
	r := bytes.NewReader(input)
	var w bytes.Buffer
	// bytes.Buffer already implements every write method used below, so no
	// bufio layer (and no final Flush) is needed.
	wr := &w
	tokenizer := html.NewTokenizer(r)

	// Iterate through all tokens in the input stream and sanitize them.
	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
		switch t {
		case html.TextToken:
			// Text is written escaped.
			wr.WriteString(tokenizer.Token().String())
		case html.StartTagToken, html.SelfClosingTagToken:
			// HTML tags are escaped unless whitelisted.
			tag, hasAttributes := tokenizer.TagName()
			tagName := string(tag)
			if whitelistTags[tagName] {
				wr.WriteString("<")
				wr.Write(tag)
				for hasAttributes {
					var key, val []byte
					key, val, hasAttributes = tokenizer.TagAttr()
					attrName := string(key)
					// Only include whitelisted attributes for the given tagName.
					tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
					if ok && tagWhitelistedAttrs[attrName] {
						// For whitelisted attributes, if it's an attribute that requires
						// protocol checking, do so and strip it if it's not known to be safe.
						tagProtocolAttrs, ok := protocolAttrs[tagName]
						if ok && tagProtocolAttrs[attrName] {
							if !protocolAllowed(val) {
								continue
							}
						}
						wr.WriteByte(' ')
						wr.Write(key)
						wr.WriteString(`="`)
						wr.WriteString(html.EscapeString(string(val)))
						wr.WriteByte('"')
					}
				}
				// Close the tag, preserving self-closing syntax where present.
				if t == html.SelfClosingTagToken {
					wr.WriteString("/>")
				} else {
					wr.WriteString(">")
				}
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.EndTagToken:
			// Whitelisted end tags can be written out raw.
			tag, _ := tokenizer.TagName()
			if whitelistTags[string(tag)] {
				wr.Write(tokenizer.Raw())
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.CommentToken, html.DoctypeToken:
			// Comments and doctypes carry no whitelisted content; escape them
			// rather than panicking on input that happens to contain them.
			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
		default:
			panic(fmt.Errorf("unexpected token type %v", t))
		}
	}
	err := tokenizer.Err()
	if err != nil && err != io.EOF {
		panic(err)
	}
	return w.Bytes()
}
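
// A minimal sketch of the expected behavior (illustrative, not part of the
// original file; the test name is hypothetical). Since sanitizeHtmlSafe is
// unexported, this would live in an in-package _test.go file:
//
//	func TestSanitizeHtmlSafe(t *testing.T) {
//	    in := []byte(`<a href="javascript:alert(1)" title="x">hi</a><script>bad()</script>`)
//	    got := string(sanitizeHtmlSafe(in))
//	    // The javascript: href is stripped; the unwhitelisted <script> is escaped.
//	    want := `<a title="x">hi</a>&lt;script&gt;bad()&lt;/script&gt;`
//	    if got != want {
//	        t.Errorf("got %q, want %q", got, want)
//	    }
//	}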

// protocolAllowed reports whether attr begins with one of the
// whitelisted protocol prefixes.
func protocolAllowed(attr []byte) bool {
	for _, prefix := range whitelistProtocols {
		if bytes.HasPrefix(attr, prefix) {
			return true
		}
	}
	return false
}
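
// For example (illustrative values, assuming the whitelist from init above):
//
//	protocolAllowed([]byte("https://example.com/")) // true
//	protocolAllowed([]byte("javascript:alert(1)"))  // false: stripped by the sanitizer
//	protocolAllowed([]byte("/relative/path"))       // false: relative URLs match no prefix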