Rip out all of blackfriday's HTML sanitization effort. As per the discussion in issue #90.
Vytautas Saltenis vytas@rtfb.lt
Fri, 19 Sep 2014 20:30:00 +0300
5 files changed,
3 insertions(+),
366 deletions(-)
M
html.go
→
html.go
@@ -29,7 +29,6 @@ HTML_SKIP_HTML = 1 << iota // skip preformatted HTML blocks
HTML_SKIP_STYLE // skip embedded <style> elements HTML_SKIP_IMAGES // skip embedded images HTML_SKIP_LINKS // skip all links - HTML_SANITIZE_OUTPUT // strip output of everything that's not known to be safe HTML_SAFELINK // only link to trusted protocols HTML_NOFOLLOW_LINKS // only link with rel="nofollow" HTML_HREF_TARGET_BLANK // add a blank target
M
inline_test.go
→
inline_test.go
@@ -425,15 +425,12 @@ func TestNofollowLink(t *testing.T) {
var tests = []string{ "[foo](http://bar.com/foo/)\n", "<p><a href=\"http://bar.com/foo/\" rel=\"nofollow\">foo</a></p>\n", - } - doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS|HTML_SANITIZE_OUTPUT, - HtmlRendererParameters{}) - // HTML_SANITIZE_OUTPUT won't allow relative links, so test that separately: - tests = []string{ + "[foo](/bar/)\n", "<p><a href=\"/bar/\">foo</a></p>\n", } - doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS, HtmlRendererParameters{}) + doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS, + HtmlRendererParameters{}) } func TestHrefTargetBlank(t *testing.T) {
M
markdown.go
→
markdown.go
@@ -238,7 +238,6 @@ htmlFlags |= HTML_USE_XHTML
htmlFlags |= HTML_USE_SMARTYPANTS htmlFlags |= HTML_SMARTYPANTS_FRACTIONS htmlFlags |= HTML_SMARTYPANTS_LATEX_DASHES - htmlFlags |= HTML_SANITIZE_OUTPUT renderer := HtmlRenderer(htmlFlags, "", "") // set up the parser@@ -298,11 +297,6 @@ }
first := firstPass(p, input) second := secondPass(p, first) - - if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 { - second = sanitizeHtmlSafe(second) - } - return second }
D
sanitize.go
@@ -1,154 +0,0 @@
-package blackfriday - -import ( - "bufio" - "bytes" - "code.google.com/p/go.net/html" - "fmt" - "io" -) - -// Whitelisted element tags, attributes on particular tags, attributes that are -// interpreted as protocols (again on particular tags), and allowed protocols. -var ( - whitelistTags map[string]bool - whitelistAttrs map[string]map[string]bool - protocolAttrs map[string]map[string]bool - whitelistProtocols [][]byte -) - -func init() { - whitelistTags = toSet([]string{ - // Headings - "h1", "h2", "h3", "h4", "h5", "h6", - // Block elements - "p", "pre", "blockquote", "hr", "div", "header", "article", "aside", "footer", - "section", "main", "mark", "figure", "figcaption", - // Inline elements - "a", "br", "cite", "code", "img", - // Lists - "ol", "ul", "li", - // Tables - "table", "tbody", "td", "tfoot", "th", "thead", "tr", "colgroup", "col", "caption", - // Formatting - "u", "i", "em", "small", "strike", "b", "strong", "sub", "sup", "q", - // Definition lists - "dd", "dl", "dt", - }) - whitelistAttrs = map[string]map[string]bool{ - "a": toSet([]string{"href", "title", "rel"}), - "img": toSet([]string{"src", "alt", "title"}), - "td": toSet([]string{"align"}), - "th": toSet([]string{"align"}), - } - protocolAttrs = map[string]map[string]bool{ - "a": toSet([]string{"href"}), - "img": toSet([]string{"src"}), - } - whitelistProtocols = [][]byte{ - []byte("http://"), - []byte("https://"), - []byte("ftp://"), - []byte("mailto:"), - } -} - -func toSet(keys []string) map[string]bool { - m := make(map[string]bool, len(keys)) - for _, k := range keys { - m[k] = true - } - return m -} - -// Sanitizes the given input by parsing it as HTML5, then whitelisting known to -// be safe elements and attributes. All other HTML is escaped, unsafe attributes -// are stripped. 
-func sanitizeHtmlSafe(input []byte) []byte { - r := bytes.NewReader(input) - var w bytes.Buffer - tokenizer := html.NewTokenizer(r) - wr := bufio.NewWriter(&w) - - // Iterate through all tokens in the input stream and sanitize them. - for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() { - switch t { - case html.TextToken: - // Text is written escaped. - wr.WriteString(tokenizer.Token().String()) - case html.SelfClosingTagToken, html.StartTagToken: - // HTML tags are escaped unless whitelisted. - tag, hasAttributes := tokenizer.TagName() - tagName := string(tag) - if whitelistTags[tagName] { - wr.WriteString("<") - wr.Write(tag) - for hasAttributes { - var key, val []byte - key, val, hasAttributes = tokenizer.TagAttr() - attrName := string(key) - // Only include whitelisted attributes for the given tagName. - tagWhitelistedAttrs, ok := whitelistAttrs[tagName] - if ok && tagWhitelistedAttrs[attrName] { - // For whitelisted attributes, if it's an attribute that requires - // protocol checking, do so and strip it if it's not known to be safe. - tagProtocolAttrs, ok := protocolAttrs[tagName] - if ok && tagProtocolAttrs[attrName] { - if !isRelativeLink(val) && !protocolAllowed(val) { - continue - } - } - wr.WriteByte(' ') - wr.Write(key) - wr.WriteString(`="`) - wr.WriteString(html.EscapeString(string(val))) - wr.WriteByte('"') - } - } - if t == html.SelfClosingTagToken { - wr.WriteString("/>") - } else { - wr.WriteString(">") - } - } else { - wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) - } - // Make sure that tags like <script> that switch the parser into raw mode - // do not destroy the parse mode for following HTML text (the point is to - // escape them anyway). For that, switch off raw mode in the tokenizer. - tokenizer.NextIsNotRawText() - case html.EndTagToken: - // Whitelisted tokens can be written in raw. 
- tag, _ := tokenizer.TagName() - if whitelistTags[string(tag)] { - wr.Write(tokenizer.Raw()) - } else { - wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) - } - case html.CommentToken: - // Comments are not really expected, but harmless. - wr.Write(tokenizer.Raw()) - case html.DoctypeToken: - // Escape DOCTYPES, entities etc can be dangerous - wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) - default: - tokenizer.Token() - panic(fmt.Errorf("Unexpected token type %v", t)) - } - } - err := tokenizer.Err() - if err != nil && err != io.EOF { - panic(tokenizer.Err()) - } - wr.Flush() - return w.Bytes() -} - -func protocolAllowed(attr []byte) bool { - for _, prefix := range whitelistProtocols { - if bytes.HasPrefix(attr, prefix) { - return true - } - } - return false -}
D
sanitize_test.go
@@ -1,199 +0,0 @@
-package blackfriday - -import ( - "testing" -) - -func doTestsSanitize(t *testing.T, tests []string) { - doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT, HtmlRendererParameters{}) -} - -func TestSanitizeRawHtmlTag(t *testing.T) { - tests := []string{ - "zz <style>p {}</style>\n", - "<p>zz <style>p {}</style></p>\n", - - "zz <STYLE>p {}</STYLE>\n", - "<p>zz <style>p {}</style></p>\n", - - "<SCRIPT>alert()</SCRIPT>\n", - "<p><script>alert()</script></p>\n", - - "zz <SCRIPT>alert()</SCRIPT>\n", - "<p>zz <script>alert()</script></p>\n", - - "zz <script>alert()</script>\n", - "<p>zz <script>alert()</script></p>\n", - - " <script>alert()</script>\n", - "<p><script>alert()</script></p>\n", - - "<script>alert()</script>\n", - "<script>alert()</script>\n", - - "<script src='foo'></script>\n", - "<script src='foo'></script>\n", - - "<script src='a>b'></script>\n", - "<script src='a>b'></script>\n", - - "zz <script src='foo'></script>\n", - "<p>zz <script src='foo'></script></p>\n", - - "zz <script src=foo></script>\n", - "<p>zz <script src=foo></script></p>\n", - - `<script><script src="http://example.com/exploit.js"></SCRIPT></script>`, - "<script><script src="http://example.com/exploit.js"></script></script>\n", - - `'';!--"<XSS>=&{()}`, - "<p>'';!--"<xss>=&{()}</p>\n", - - "<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p><script SRC=http://ha.ckers.org/xss.js></script></p>\n", - - "<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p><script \nSRC=http://ha.ckers.org/xss.js></script></p>\n", - - `<IMG SRC="javascript:alert('XSS');">`, - "<p><img></p>\n", - - "<IMG SRC=javascript:alert('XSS')>", - "<p><img></p>\n", - - "<IMG SRC=JaVaScRiPt:alert('XSS')>", - "<p><img></p>\n", - - "<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>", - "<p><img></p>\n", - - `<a onmouseover="alert(document.cookie)">xss link</a>`, - "<p><a>xss link</a></p>\n", - - "<a onmouseover=alert(document.cookie)>xss link</a>", - "<p><a>xss link</a></p>\n", - - 
`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, - "<p><img><script>alert("XSS")</script>"></p>\n", - - "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", - "<p><img></p>\n", - - `<IMG SRC=# onmouseover="alert('xxs')">`, - "<p><img src=\"#\"></p>\n", - - `<IMG SRC= onmouseover="alert('xxs')">`, - "<p><img></p>\n", - - `<IMG onmouseover="alert('xxs')">`, - "<p><img></p>\n", - - "<IMG SRC=javascript:alert('XSS')>", - "<p><img></p>\n", - - "<IMG SRC=javascript:alert('XSS')>", - "<p><img></p>\n", - - "<IMG SRC=javascript:alert('XSS')>", - "<p><img></p>\n", - - `<IMG SRC="javascriptascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="jav	ascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="jav
ascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="jav
ascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="  javascript:alert('XSS');">`, - "<p><img></p>\n", - - `<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p><script/XSS SRC="http://ha.ckers.org/xss.js"></script></p>\n", - - "<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>", - "<p><body onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert("XSS")></p>\n", - - `<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p><script/SRC="http://ha.ckers.org/xss.js"></script></p>\n", - - `<<SCRIPT>alert("XSS");//<</SCRIPT>`, - "<p><<script>alert("XSS");//<</script></p>\n", - - "<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >", - "<p><script SRC=http://ha.ckers.org/xss.js?< B ></p>\n", - - "<SCRIPT SRC=//ha.ckers.org/.j>", - "<p><script SRC=//ha.ckers.org/.j></p>\n", - - `<IMG SRC="javascript:alert('XSS')"`, - "<p><IMG SRC="javascript:alert('XSS')"</p>\n", - - "<iframe src=http://ha.ckers.org/scriptlet.html <", - // The hyperlink gets linkified, the <iframe> gets escaped - "<p><iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> <</p>\n", - - // Additonal token types: SelfClosing, Comment, DocType. - "<br/>", - "<p><br/></p>\n", - - "<!-- Comment -->", - "<!-- Comment -->\n", - - "<!DOCTYPE test>", - "<p><!DOCTYPE test></p>\n", - } - doTestsSanitize(t, tests) -} - -func TestSanitizeQuoteEscaping(t *testing.T) { - tests := []string{ - // Make sure quotes are transported correctly (different entities or - // unicode, but correct semantics) - "<p>Here are some "quotes".</p>\n", - "<p>Here are some "quotes".</p>\n", - - "<p>Here are some “quotes”.</p>\n", - "<p>Here are some \u201Cquotes\u201D.</p>\n", - - // Within a <script> tag, content gets parsed by the raw text parsing rules. - // This test makes sure we correctly disable those parsing rules and do not - // escape e.g. the closing </p>. 
- `Here are <script> some "quotes".`, - "<p>Here are <script> some "quotes".</p>\n", - - // Same test for an unknown element that does not switch into raw mode. - `Here are <eviltag> some "quotes".`, - "<p>Here are <eviltag> some "quotes".</p>\n", - } - doTestsSanitize(t, tests) -} - -func TestSanitizeSelfClosingTag(t *testing.T) { - tests := []string{ - "<hr>\n", - "<hr>\n", - - "<hr/>\n", - "<hr/>\n", - - // Make sure that evil attributes are stripped for self closing tags. - "<hr onclick=\"evil()\"/>\n", - "<hr/>\n", - } - doTestsSanitize(t, tests) -} - -func TestSanitizeInlineLink(t *testing.T) { - tests := []string{ - "[link](javascript:evil)", - "<p><a>link</a></p>\n", - "[link](/abc)", - "<p><a href=\"/abc\">link</a></p>\n", - } - doTestsSanitize(t, tests) -}