Merge pull request #69 from mprobst/master Use go.net/html's parser to sanitize HTML.
Vytautas Šaltenis vytas@rtfb.lt
Thu, 01 May 2014 20:47:17 +0300
5 files changed,
181 insertions(+),
124 deletions(-)
M
html.go
→
html.go
@@ -43,52 +43,12 @@ HTML_SMARTYPANTS_LATEX_DASHES // enable LaTeX-style dashes (with HTML_USE_SMARTYPANTS)
) var ( - tags = []string{ - "b", - "blockquote", - "code", - "del", - "dd", - "dl", - "dt", - "em", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "i", - "kbd", - "li", - "ol", - "p", - "pre", - "s", - "sup", - "sub", - "strong", - "strike", - "ul", - "table", - "tr", - "td", - "th", - "thead", - "tbody", - - } - alignments = []string{ "left", "right", "center", } - urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` - tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`) - anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`) - imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) // TODO: improve this regexp to catch all possible entities: htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`) )@@ -820,43 +780,12 @@
return false, -1 } -func sanitizeHtml(html []byte) []byte { - var result []byte - for string(html) != "" { - skip, tag, rest := findHtmlTag(html) - html = rest - result = append(result, skip...) - result = append(result, sanitizeTag(tag)...) - } - return append(result, []byte("\n")...) -} - -func sanitizeTag(tag []byte) []byte { - if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) { - return tag - } - return []byte("") -} - func skipUntilChar(text []byte, start int, char byte) int { i := start for i < len(text) && text[i] != char { i++ } return i -} - -func findHtmlTag(html []byte) (skip, tag, rest []byte) { - start := skipUntilChar(html, 0, '<') - rightAngle := skipUntilCharIgnoreQuotes(html, start, '>') - if rightAngle > start { - skip = html[0:start] - tag = html[start : rightAngle+1] - rest = html[rightAngle+1:] - return - } - - return []byte(""), []byte(""), []byte("") } func skipSpace(tag []byte, i int) int {
M
inline_test.go
→
inline_test.go
@@ -72,135 +72,135 @@
func TestRawHtmlTag(t *testing.T) { tests := []string{ "zz <style>p {}</style>\n", - "<p>zz p {}</p>\n", + "<p>zz <style>p {}</style></p>\n", "zz <STYLE>p {}</STYLE>\n", - "<p>zz p {}</p>\n", + "<p>zz <style>p {}</style></p>\n", "<SCRIPT>alert()</SCRIPT>\n", - "<p>alert()</p>\n", + "<p><script>alert()</script></p>\n", "zz <SCRIPT>alert()</SCRIPT>\n", - "<p>zz alert()</p>\n", + "<p>zz <script>alert()</script></p>\n", "zz <script>alert()</script>\n", - "<p>zz alert()</p>\n", + "<p>zz <script>alert()</script></p>\n", " <script>alert()</script>\n", - "<p>alert()</p>\n", + "<p><script>alert()</script></p>\n", "<script>alert()</script>\n", - "alert()\n", + "<script>alert()</script>\n", "<script src='foo'></script>\n", - "\n", + "<script src='foo'></script>\n", "<script src='a>b'></script>\n", - "\n", + "<script src='a>b'></script>\n", "zz <script src='foo'></script>\n", - "<p>zz </p>\n", + "<p>zz <script src='foo'></script></p>\n", "zz <script src=foo></script>\n", - "<p>zz </p>\n", + "<p>zz <script src=foo></script></p>\n", `<script><script src="http://example.com/exploit.js"></SCRIPT></script>`, - "\n", + "<script><script src="http://example.com/exploit.js"></script></script>\n", `'';!--"<XSS>=&{()}`, - "<p>'';!--"=&{()}</p>\n", + "<p>'';!--"<xss>=&{()}</p>\n", "<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p></p>\n", + "<p><script SRC=http://ha.ckers.org/xss.js></script></p>\n", "<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p></p>\n", + "<p><script \nSRC=http://ha.ckers.org/xss.js></script></p>\n", `<IMG SRC="javascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=javascript:alert('XSS')>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=JaVaScRiPt:alert('XSS')>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>", - "<p></p>\n", + "<p><img></p>\n", `<a onmouseover="alert(document.cookie)">xss link</a>`, - "<p>xss link</a></p>\n", + "<p><a>xss link</a></p>\n", "<a 
onmouseover=alert(document.cookie)>xss link</a>", - "<p>xss link</a></p>\n", + "<p><a>xss link</a></p>\n", - // XXX: this doesn't pass yet - //`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, - //"<p></p>\n", + `<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, + "<p><img><script>alert(&quot;XSS&quot;)</script>"></p>\n", "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC=# onmouseover="alert('xxs')">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC= onmouseover="alert('xxs')">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG onmouseover="alert('xxs')">`, - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=javascript:alert('XSS')>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=javascript:alert('XSS')>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=javascript:alert('XSS')>", - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="javascriptascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="jav	ascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="jav
ascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="jav
ascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="  javascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p></p>\n", + "<p><script/XSS SRC="http://ha.ckers.org/xss.js"></script></p>\n", - // XXX: this doesn't pass yet - //"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>", - //"\n", + "<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>", + "<p><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></p>\n", `<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p></p>\n", + "<p><script/SRC="http://ha.ckers.org/xss.js"></script></p>\n", - // XXX: this doesn't pass yet - //`<<SCRIPT>alert("XSS");//<</SCRIPT>`, - //"", + // HTML5 interprets the <script> tag contents as raw test, thus the end + // result has double-escaped &quot; + `<<SCRIPT>alert("XSS");//<</SCRIPT>`, + "<p><<script>alert(&quot;XSS&quot;);//&lt;</script></p>\n", + // HTML5 parses the </p> within an unclosed <script> tag as text. + // Same for the following tests. "<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >", - "<p></p>\n", + "<p><script SRC=http://ha.ckers.org/xss.js?< B ></p>\n", "<SCRIPT SRC=//ha.ckers.org/.j>", - "<p></p>\n", + "<p><script SRC=//ha.ckers.org/.j></p>\n", - // XXX: this doesn't pass yet - //`<IMG SRC="javascript:alert('XSS')"`, - //"", + `<IMG SRC="javascript:alert('XSS')"`, + "<p><IMG SRC="javascript:alert('XSS')"</p>\n", - // XXX: this doesn't pass yet - //"<iframe src=http://ha.ckers.org/scriptlet.html <", - //"", + "<iframe src=http://ha.ckers.org/scriptlet.html <", + // The hyperlink gets linkified, the <iframe> gets escaped + "<p><iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> <</p>\n", } doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT) }
M
markdown.go
→
markdown.go
@@ -298,7 +298,7 @@ first := firstPass(p, input)
second := secondPass(p, first) if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 { - second = sanitizeHtml(second) + second = sanitizeHtmlSafe(second) } return second
A
sanitize.go
@@ -0,0 +1,127 @@
+package blackfriday + +import ( + "bufio" + "bytes" + "code.google.com/p/go.net/html" + "fmt" + "io" +) + +// Whitelisted element tags, attributes on particular tags, attributes that are +// interpreted as protocols (again on particular tags), and allowed protocols. +var ( + whitelistTags map[string]bool + whitelistAttrs map[string]map[string]bool + protocolAttrs map[string]map[string]bool + whitelistProtocols [][]byte +) + +func init() { + whitelistTags = toSet([]string{ + "a", "b", "blockquote", "br", "caption", "cite", "code", "col", + "colgroup", "dd", "div", "dl", "dt", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", + "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", + "ul"}) + whitelistAttrs = map[string]map[string]bool{ + "a": toSet([]string{"href", "title"}), + "img": toSet([]string{"src", "alt", "title"}), + } + protocolAttrs = map[string]map[string]bool{ + "a": toSet([]string{"href"}), + "img": toSet([]string{"src"}), + } + whitelistProtocols = [][]byte{ + []byte("http://"), + []byte("https://"), + []byte("ftp://"), + []byte("mailto:"), + } +} + +func toSet(keys []string) map[string]bool { + m := make(map[string]bool, len(keys)) + for _, k := range keys { + m[k] = true + } + return m +} + +// Sanitizes the given input by parsing it as HTML5, then whitelisting known to +// be safe elements and attributes. All other HTML is escaped, unsafe attributes +// are stripped. +func sanitizeHtmlSafe(input []byte) []byte { + r := bytes.NewReader(input) + var w bytes.Buffer + tokenizer := html.NewTokenizer(r) + wr := bufio.NewWriter(&w) + + // Iterate through all tokens in the input stream and sanitize them. + for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() { + switch t { + case html.TextToken: + // Text is written escaped. + wr.WriteString(tokenizer.Token().String()) + case html.StartTagToken: + // HTML tags are escaped unless whitelisted. 
+ tag, hasAttributes := tokenizer.TagName() + tagName := string(tag) + if whitelistTags[tagName] { + wr.WriteString("<") + wr.Write(tag) + for hasAttributes { + var key, val []byte + key, val, hasAttributes = tokenizer.TagAttr() + attrName := string(key) + // Only include whitelisted attributes for the given tagName. + tagWhitelistedAttrs, ok := whitelistAttrs[tagName] + if ok && tagWhitelistedAttrs[attrName] { + // For whitelisted attributes, if it's an attribute that requires + // protocol checking, do so and strip it if it's not known to be safe. + tagProtocolAttrs, ok := protocolAttrs[tagName] + if ok && tagProtocolAttrs[attrName] { + if !protocolAllowed(val) { + continue + } + } + wr.WriteByte(' ') + wr.Write(key) + wr.WriteString(`="`) + wr.WriteString(html.EscapeString(string(val))) + wr.WriteByte('"') + } + } + wr.WriteString(">") + } else { + wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) + } + case html.EndTagToken: + // Whitelisted tokens can be written in raw. + tag, _ := tokenizer.TagName() + if whitelistTags[string(tag)] { + wr.Write(tokenizer.Raw()) + } else { + wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) + } + default: + panic(fmt.Errorf("Unexpected token type %v", t)) + } + } + err := tokenizer.Err() + if err != nil && err != io.EOF { + panic(tokenizer.Err()) + } + wr.Flush() + return w.Bytes() +} + +func protocolAllowed(attr []byte) bool { + for _, prefix := range whitelistProtocols { + if bytes.HasPrefix(attr, prefix) { + return true + } + } + return false +}