all repos — grayfriday @ 41251715adafaa3a6a5b0e624d92fc53fee525fa

blackfriday fork with a few changes

Use go.net/html's parser to sanitize HTML.

Use an HTML5-compliant parser, which interprets HTML as a browser would, to parse
the Markdown output, and then sanitize based on the parse result.
Escape unrecognized and disallowed HTML in the result.
Currently works with a hard-coded whitelist of safe HTML tags and attributes.
Martin Probst martin@probst.io
Sun, 27 Apr 2014 23:40:44 +0200
commit

41251715adafaa3a6a5b0e624d92fc53fee525fa

parent

3ca168f879360b439e173ee4feb5916607ccee40

5 files changed, 181 insertions(+), 124 deletions(-)

jump to
M html.go

@@ -43,52 +43,12 @@ HTML_SMARTYPANTS_LATEX_DASHES // enable LaTeX-style dashes (with HTML_USE_SMARTYPANTS)

) var ( - tags = []string{ - "b", - "blockquote", - "code", - "del", - "dd", - "dl", - "dt", - "em", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "i", - "kbd", - "li", - "ol", - "p", - "pre", - "s", - "sup", - "sub", - "strong", - "strike", - "ul", - "table", - "tr", - "td", - "th", - "thead", - "tbody", - - } - alignments = []string{ "left", "right", "center", } - urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` - tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`) - anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`) - imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) // TODO: improve this regexp to catch all possible entities: htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`) )

@@ -820,43 +780,12 @@

return false, -1 } -func sanitizeHtml(html []byte) []byte { - var result []byte - for string(html) != "" { - skip, tag, rest := findHtmlTag(html) - html = rest - result = append(result, skip...) - result = append(result, sanitizeTag(tag)...) - } - return append(result, []byte("\n")...) -} - -func sanitizeTag(tag []byte) []byte { - if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) { - return tag - } - return []byte("") -} - func skipUntilChar(text []byte, start int, char byte) int { i := start for i < len(text) && text[i] != char { i++ } return i -} - -func findHtmlTag(html []byte) (skip, tag, rest []byte) { - start := skipUntilChar(html, 0, '<') - rightAngle := skipUntilCharIgnoreQuotes(html, start, '>') - if rightAngle > start { - skip = html[0:start] - tag = html[start : rightAngle+1] - rest = html[rightAngle+1:] - return - } - - return []byte(""), []byte(""), []byte("") } func skipSpace(tag []byte, i int) int {
M inline.go

@@ -20,6 +20,7 @@ "strconv"

) var ( + urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`) )
M inline_test.go

@@ -72,135 +72,135 @@

func TestRawHtmlTag(t *testing.T) { tests := []string{ "zz <style>p {}</style>\n", - "<p>zz p {}</p>\n", + "<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n", "zz <STYLE>p {}</STYLE>\n", - "<p>zz p {}</p>\n", + "<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n", "<SCRIPT>alert()</SCRIPT>\n", - "<p>alert()</p>\n", + "<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n", "zz <SCRIPT>alert()</SCRIPT>\n", - "<p>zz alert()</p>\n", + "<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n", "zz <script>alert()</script>\n", - "<p>zz alert()</p>\n", + "<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n", " <script>alert()</script>\n", - "<p>alert()</p>\n", + "<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n", "<script>alert()</script>\n", - "alert()\n", + "&lt;script&gt;alert()&lt;/script&gt;\n", "<script src='foo'></script>\n", - "\n", + "&lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;\n", "<script src='a>b'></script>\n", - "\n", + "&lt;script src=&#39;a&gt;b&#39;&gt;&lt;/script&gt;\n", "zz <script src='foo'></script>\n", - "<p>zz </p>\n", + "<p>zz &lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;</p>\n", "zz <script src=foo></script>\n", - "<p>zz </p>\n", + "<p>zz &lt;script src=foo&gt;&lt;/script&gt;</p>\n", `<script><script src="http://example.com/exploit.js"></SCRIPT></script>`, - "\n", + "&lt;script&gt;&lt;script src=&#34;http://example.com/exploit.js&#34;&gt;&lt;/script&gt;&lt;/script&gt;\n", `'';!--"<XSS>=&{()}`, - "<p>'';!--&quot;=&amp;{()}</p>\n", + "<p>&#39;&#39;;!--&#34;&lt;xss&gt;=&amp;{()}</p>\n", "<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p></p>\n", + "<p>&lt;script SRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n", "<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p></p>\n", + "<p>&lt;script \nSRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n", `<IMG SRC="javascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=javascript:alert('XSS')>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=JaVaScRiPt:alert('XSS')>", - "<p></p>\n", + 
"<p><img></p>\n", "<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>", - "<p></p>\n", + "<p><img></p>\n", `<a onmouseover="alert(document.cookie)">xss link</a>`, - "<p>xss link</a></p>\n", + "<p><a>xss link</a></p>\n", "<a onmouseover=alert(document.cookie)>xss link</a>", - "<p>xss link</a></p>\n", + "<p><a>xss link</a></p>\n", - // XXX: this doesn't pass yet - //`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, - //"<p></p>\n", + `<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, + "<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n", "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC=# onmouseover="alert('xxs')">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC= onmouseover="alert('xxs')">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG onmouseover="alert('xxs')">`, - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>", - "<p></p>\n", + "<p><img></p>\n", "<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>", - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="javascriptascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="jav&#x09;ascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="jav&#x0A;ascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC="jav&#x0D;ascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<IMG SRC=" &#14; javascript:alert('XSS');">`, - "<p></p>\n", + "<p><img></p>\n", `<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p></p>\n", + "<p>&lt;script/XSS 
SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n", - // XXX: this doesn't pass yet - //"<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>", - //"\n", + "<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>", + "<p>&lt;body onload!#$%&amp;()*~+-_.,:;?@[/|\\]^`=alert(&#34;XSS&#34;)&gt;</p>\n", `<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p></p>\n", + "<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n", - // XXX: this doesn't pass yet - //`<<SCRIPT>alert("XSS");//<</SCRIPT>`, - //"", + // HTML5 interprets the <script> tag contents as raw test, thus the end + // result has double-escaped &amp;quot; + `<<SCRIPT>alert("XSS");//<</SCRIPT>`, + "<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n", + // HTML5 parses the </p> within an unclosed <script> tag as text. + // Same for the following tests. "<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >", - "<p></p>\n", + "<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n", "<SCRIPT SRC=//ha.ckers.org/.j>", - "<p></p>\n", + "<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n", - // XXX: this doesn't pass yet - //`<IMG SRC="javascript:alert('XSS')"`, - //"", + `<IMG SRC="javascript:alert('XSS')"`, + "<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n", - // XXX: this doesn't pass yet - //"<iframe src=http://ha.ckers.org/scriptlet.html <", - //"", + "<iframe src=http://ha.ckers.org/scriptlet.html <", + // The hyperlink gets linkified, the <iframe> gets escaped + "<p>&lt;iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> &lt;</p>\n", } doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT) }
M markdown.go

@@ -298,7 +298,7 @@ first := firstPass(p, input)

second := secondPass(p, first) if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 { - second = sanitizeHtml(second) + second = sanitizeHtmlSafe(second) } return second
A sanitize.go

@@ -0,0 +1,127 @@

+package blackfriday + +import ( + "bufio" + "bytes" + "code.google.com/p/go.net/html" + "fmt" + "io" +) + +// Whitelisted element tags, attributes on particular tags, attributes that are +// interpreted as protocols (again on particular tags), and allowed protocols. +var ( + whitelistTags map[string]bool + whitelistAttrs map[string]map[string]bool + protocolAttrs map[string]map[string]bool + whitelistProtocols [][]byte +) + +func init() { + whitelistTags = toSet([]string{ + "a", "b", "blockquote", "br", "caption", "cite", "code", "col", + "colgroup", "dd", "div", "dl", "dt", "em", + "h1", "h2", "h3", "h4", "h5", "h6", + "i", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", + "sub", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", + "ul"}) + whitelistAttrs = map[string]map[string]bool{ + "a": toSet([]string{"href", "title"}), + "img": toSet([]string{"src", "alt", "title"}), + } + protocolAttrs = map[string]map[string]bool{ + "a": toSet([]string{"href"}), + "img": toSet([]string{"src"}), + } + whitelistProtocols = [][]byte{ + []byte("http://"), + []byte("https://"), + []byte("ftp://"), + []byte("mailto:"), + } +} + +func toSet(keys []string) map[string]bool { + m := make(map[string]bool, len(keys)) + for _, k := range keys { + m[k] = true + } + return m +} + +// Sanitizes the given input by parsing it as HTML5, then whitelisting known to +// be safe elements and attributes. All other HTML is escaped, unsafe attributes +// are stripped. +func sanitizeHtmlSafe(input []byte) []byte { + r := bytes.NewReader(input) + var w bytes.Buffer + tokenizer := html.NewTokenizer(r) + wr := bufio.NewWriter(&w) + + // Iterate through all tokens in the input stream and sanitize them. + for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() { + switch t { + case html.TextToken: + // Text is written escaped. + wr.WriteString(tokenizer.Token().String()) + case html.StartTagToken: + // HTML tags are escaped unless whitelisted. 
+ tag, hasAttributes := tokenizer.TagName() + tagName := string(tag) + if whitelistTags[tagName] { + wr.WriteString("<") + wr.Write(tag) + for hasAttributes { + var key, val []byte + key, val, hasAttributes = tokenizer.TagAttr() + attrName := string(key) + // Only include whitelisted attributes for the given tagName. + tagWhitelistedAttrs, ok := whitelistAttrs[tagName] + if ok && tagWhitelistedAttrs[attrName] { + // For whitelisted attributes, if it's an attribute that requires + // protocol checking, do so and strip it if it's not known to be safe. + tagProtocolAttrs, ok := protocolAttrs[tagName] + if ok && tagProtocolAttrs[attrName] { + if !protocolAllowed(val) { + continue + } + } + wr.WriteByte(' ') + wr.Write(key) + wr.WriteString(`="`) + wr.WriteString(html.EscapeString(string(val))) + wr.WriteByte('"') + } + } + wr.WriteString(">") + } else { + wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) + } + case html.EndTagToken: + // Whitelisted tokens can be written in raw. + tag, _ := tokenizer.TagName() + if whitelistTags[string(tag)] { + wr.Write(tokenizer.Raw()) + } else { + wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) + } + default: + panic(fmt.Errorf("Unexpected token type %v", t)) + } + } + err := tokenizer.Err() + if err != nil && err != io.EOF { + panic(tokenizer.Err()) + } + wr.Flush() + return w.Bytes() +} + +func protocolAllowed(attr []byte) bool { + for _, prefix := range whitelistProtocols { + if bytes.HasPrefix(attr, prefix) { + return true + } + } + return false +}