sanitize.go
package blackfriday

import (
	"bufio"
	"bytes"
	"fmt"
	"io"

	// The go.net repository has moved; this is the successor import path of
	// code.google.com/p/go.net/html.
	"golang.org/x/net/html"
)

// Whitelisted element tags, attributes on particular tags, attributes that are
// interpreted as protocols (again on particular tags), and allowed protocols.
var (
	whitelistTags      map[string]bool
	whitelistAttrs     map[string]map[string]bool
	protocolAttrs      map[string]map[string]bool
	whitelistProtocols [][]byte
)

func init() {
	whitelistTags = toSet([]string{
		// Headings
		"h1", "h2", "h3", "h4", "h5", "h6",
		// Block elements
		"p", "pre", "blockquote", "hr", "div", "header", "article", "aside", "footer",
		"section", "main", "mark", "figure", "figcaption",
		// Inline elements
		"a", "br", "cite", "code", "img",
		// Lists
		"ol", "ul", "li",
		// Tables
		"table", "tbody", "td", "tfoot", "th", "thead", "tr", "colgroup", "col", "caption",
		// Formatting
		"u", "i", "em", "small", "strike", "b", "strong", "sub", "sup", "q",
		// Definition lists
		"dd", "dl", "dt",
	})
	whitelistAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href", "title", "rel"}),
		"img": toSet([]string{"src", "alt", "title"}),
		"td":  toSet([]string{"align"}),
		"th":  toSet([]string{"align"}),
	}
	protocolAttrs = map[string]map[string]bool{
		"a":   toSet([]string{"href"}),
		"img": toSet([]string{"src"}),
	}
	whitelistProtocols = [][]byte{
		[]byte("http://"),
		[]byte("https://"),
		[]byte("ftp://"),
		[]byte("mailto:"),
	}
}
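
// With the tables above, a tag such as <a href="https://example.com" onclick="f()">
// keeps href (a whitelisted attribute whose value uses an allowed protocol),
// while onclick, which is not whitelisted for <a>, is dropped.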

// toSet builds a membership set from the given keys.
func toSet(keys []string) map[string]bool {
	m := make(map[string]bool, len(keys))
	for _, k := range keys {
		m[k] = true
	}
	return m
}

// sanitizeHtmlSafe sanitizes the given input by parsing it as HTML5 and then
// whitelisting elements and attributes known to be safe. All other HTML is
// escaped, and unsafe attributes are stripped.
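//
// A rough illustration (hypothetical input and output, assuming the tokenizer
// behavior described in the loop below):
//
//	in := []byte(`<p onclick="evil()">hi</p><script>alert(1)</script>`)
//	out := sanitizeHtmlSafe(in)
//	// out: <p>hi</p>&lt;script&gt;alert(1)&lt;/script&gt;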
func sanitizeHtmlSafe(input []byte) []byte {
	r := bytes.NewReader(input)
	var w bytes.Buffer
	tokenizer := html.NewTokenizer(r)
	wr := bufio.NewWriter(&w)

	// Iterate through all tokens in the input stream and sanitize them.
	for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() {
		switch t {
		case html.TextToken:
			// Text is written escaped.
			wr.WriteString(tokenizer.Token().String())
		case html.SelfClosingTagToken, html.StartTagToken:
			// HTML tags are escaped unless whitelisted.
			tag, hasAttributes := tokenizer.TagName()
			tagName := string(tag)
			if whitelistTags[tagName] {
				wr.WriteString("<")
				wr.Write(tag)
				for hasAttributes {
					var key, val []byte
					key, val, hasAttributes = tokenizer.TagAttr()
					attrName := string(key)
					// Only include attributes whitelisted for this tag.
					tagWhitelistedAttrs, ok := whitelistAttrs[tagName]
					if ok && tagWhitelistedAttrs[attrName] {
						// If the whitelisted attribute requires protocol
						// checking, do so and strip the attribute unless it
						// is known to be safe.
						tagProtocolAttrs, ok := protocolAttrs[tagName]
						if ok && tagProtocolAttrs[attrName] {
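							// isRelativeLink is defined elsewhere in this
							// package; relative URLs pass without a
							// protocol check.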
							if !isRelativeLink(val) && !protocolAllowed(val) {
								continue
							}
						}
						wr.WriteByte(' ')
						wr.Write(key)
						wr.WriteString(`="`)
						wr.WriteString(html.EscapeString(string(val)))
						wr.WriteByte('"')
					}
				}
				if t == html.SelfClosingTagToken {
					wr.WriteString("/>")
				} else {
					wr.WriteString(">")
				}
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
			// Tags like <script> would normally switch the tokenizer into raw
			// mode; since such tags are escaped rather than honored, switch raw
			// mode back off so the following HTML text is parsed normally.
			tokenizer.NextIsNotRawText()
		case html.EndTagToken:
			// Whitelisted end tags can be written out raw.
			tag, _ := tokenizer.TagName()
			if whitelistTags[string(tag)] {
				wr.Write(tokenizer.Raw())
			} else {
				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
			}
		case html.CommentToken:
			// Comments are not really expected, but harmless.
			wr.Write(tokenizer.Raw())
		case html.DoctypeToken:
			// Escape doctype tokens; the entities they can define are dangerous.
			wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
		default:
			panic(fmt.Errorf("unexpected token type %v", t))
		}
	}
	if err := tokenizer.Err(); err != nil && err != io.EOF {
		panic(err)
	}
	wr.Flush()
	return w.Bytes()
}

// protocolAllowed reports whether attr starts with one of the whitelisted
// protocol prefixes.
func protocolAllowed(attr []byte) bool {
	for _, prefix := range whitelistProtocols {
		if bytes.HasPrefix(attr, prefix) {
			return true
		}
	}
	return false
}
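
// Illustrative results (hypothetical values, not from the original source):
//
//	protocolAllowed([]byte("https://example.com/")) // true: https:// is whitelisted
//	protocolAllowed([]byte("javascript:alert(1)"))  // false: the attribute is stripped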