all repos — grayfriday @ cf6bfc9d6d9f0d0279ff7660e0095b21b7df8c86

blackfriday fork with a few changes

Rip off all blackfriday's html sanitization effort

As per discussion in issue #90.
Vytautas Saltenis vytas@rtfb.lt
Fri, 19 Sep 2014 20:30:00 +0300
commit

cf6bfc9d6d9f0d0279ff7660e0095b21b7df8c86

parent

44a39c16c6b85809f8c2663bdb8ddbced1ffdee8

5 files changed, 3 insertions(+), 366 deletions(-)

jump to
M html.go

@@ -29,7 +29,6 @@ HTML_SKIP_HTML = 1 << iota // skip preformatted HTML blocks

 	HTML_SKIP_STYLE // skip embedded <style> elements
 	HTML_SKIP_IMAGES // skip embedded images
 	HTML_SKIP_LINKS // skip all links
-	HTML_SANITIZE_OUTPUT // strip output of everything that's not known to be safe
 	HTML_SAFELINK // only link to trusted protocols
 	HTML_NOFOLLOW_LINKS // only link with rel="nofollow"
 	HTML_HREF_TARGET_BLANK // add a blank target
M inline_test.go

@@ -425,15 +425,12 @@ func TestNofollowLink(t *testing.T) {

 	var tests = []string{
 		"[foo](http://bar.com/foo/)\n",
 		"<p><a href=\"http://bar.com/foo/\" rel=\"nofollow\">foo</a></p>\n",
-	}
-	doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS|HTML_SANITIZE_OUTPUT,
-		HtmlRendererParameters{})
-	// HTML_SANITIZE_OUTPUT won't allow relative links, so test that separately:
-	tests = []string{
+
 		"[foo](/bar/)\n",
 		"<p><a href=\"/bar/\">foo</a></p>\n",
 	}
-	doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS, HtmlRendererParameters{})
+	doTestsInlineParam(t, tests, 0, HTML_SAFELINK|HTML_NOFOLLOW_LINKS,
+		HtmlRendererParameters{})
 }
 
 func TestHrefTargetBlank(t *testing.T) {
M markdown.go

@@ -238,7 +238,6 @@ htmlFlags |= HTML_USE_XHTML

 	htmlFlags |= HTML_USE_SMARTYPANTS
 	htmlFlags |= HTML_SMARTYPANTS_FRACTIONS
 	htmlFlags |= HTML_SMARTYPANTS_LATEX_DASHES
-	htmlFlags |= HTML_SANITIZE_OUTPUT
 	renderer := HtmlRenderer(htmlFlags, "", "")
 
 	// set up the parser

@@ -298,11 +297,6 @@ }

 
 	first := firstPass(p, input)
 	second := secondPass(p, first)
-
-	if renderer.GetFlags()&HTML_SANITIZE_OUTPUT != 0 {
-		second = sanitizeHtmlSafe(second)
-	}
-
 	return second
 }
 
D sanitize.go

@@ -1,154 +0,0 @@

-package blackfriday - -import ( - "bufio" - "bytes" - "code.google.com/p/go.net/html" - "fmt" - "io" -) - -// Whitelisted element tags, attributes on particular tags, attributes that are -// interpreted as protocols (again on particular tags), and allowed protocols. -var ( - whitelistTags map[string]bool - whitelistAttrs map[string]map[string]bool - protocolAttrs map[string]map[string]bool - whitelistProtocols [][]byte -) - -func init() { - whitelistTags = toSet([]string{ - // Headings - "h1", "h2", "h3", "h4", "h5", "h6", - // Block elements - "p", "pre", "blockquote", "hr", "div", "header", "article", "aside", "footer", - "section", "main", "mark", "figure", "figcaption", - // Inline elements - "a", "br", "cite", "code", "img", - // Lists - "ol", "ul", "li", - // Tables - "table", "tbody", "td", "tfoot", "th", "thead", "tr", "colgroup", "col", "caption", - // Formatting - "u", "i", "em", "small", "strike", "b", "strong", "sub", "sup", "q", - // Definition lists - "dd", "dl", "dt", - }) - whitelistAttrs = map[string]map[string]bool{ - "a": toSet([]string{"href", "title", "rel"}), - "img": toSet([]string{"src", "alt", "title"}), - "td": toSet([]string{"align"}), - "th": toSet([]string{"align"}), - } - protocolAttrs = map[string]map[string]bool{ - "a": toSet([]string{"href"}), - "img": toSet([]string{"src"}), - } - whitelistProtocols = [][]byte{ - []byte("http://"), - []byte("https://"), - []byte("ftp://"), - []byte("mailto:"), - } -} - -func toSet(keys []string) map[string]bool { - m := make(map[string]bool, len(keys)) - for _, k := range keys { - m[k] = true - } - return m -} - -// Sanitizes the given input by parsing it as HTML5, then whitelisting known to -// be safe elements and attributes. All other HTML is escaped, unsafe attributes -// are stripped. 
-func sanitizeHtmlSafe(input []byte) []byte { - r := bytes.NewReader(input) - var w bytes.Buffer - tokenizer := html.NewTokenizer(r) - wr := bufio.NewWriter(&w) - - // Iterate through all tokens in the input stream and sanitize them. - for t := tokenizer.Next(); t != html.ErrorToken; t = tokenizer.Next() { - switch t { - case html.TextToken: - // Text is written escaped. - wr.WriteString(tokenizer.Token().String()) - case html.SelfClosingTagToken, html.StartTagToken: - // HTML tags are escaped unless whitelisted. - tag, hasAttributes := tokenizer.TagName() - tagName := string(tag) - if whitelistTags[tagName] { - wr.WriteString("<") - wr.Write(tag) - for hasAttributes { - var key, val []byte - key, val, hasAttributes = tokenizer.TagAttr() - attrName := string(key) - // Only include whitelisted attributes for the given tagName. - tagWhitelistedAttrs, ok := whitelistAttrs[tagName] - if ok && tagWhitelistedAttrs[attrName] { - // For whitelisted attributes, if it's an attribute that requires - // protocol checking, do so and strip it if it's not known to be safe. - tagProtocolAttrs, ok := protocolAttrs[tagName] - if ok && tagProtocolAttrs[attrName] { - if !isRelativeLink(val) && !protocolAllowed(val) { - continue - } - } - wr.WriteByte(' ') - wr.Write(key) - wr.WriteString(`="`) - wr.WriteString(html.EscapeString(string(val))) - wr.WriteByte('"') - } - } - if t == html.SelfClosingTagToken { - wr.WriteString("/>") - } else { - wr.WriteString(">") - } - } else { - wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) - } - // Make sure that tags like <script> that switch the parser into raw mode - // do not destroy the parse mode for following HTML text (the point is to - // escape them anyway). For that, switch off raw mode in the tokenizer. - tokenizer.NextIsNotRawText() - case html.EndTagToken: - // Whitelisted tokens can be written in raw. 
- tag, _ := tokenizer.TagName() - if whitelistTags[string(tag)] { - wr.Write(tokenizer.Raw()) - } else { - wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) - } - case html.CommentToken: - // Comments are not really expected, but harmless. - wr.Write(tokenizer.Raw()) - case html.DoctypeToken: - // Escape DOCTYPES, entities etc can be dangerous - wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) - default: - tokenizer.Token() - panic(fmt.Errorf("Unexpected token type %v", t)) - } - } - err := tokenizer.Err() - if err != nil && err != io.EOF { - panic(tokenizer.Err()) - } - wr.Flush() - return w.Bytes() -} - -func protocolAllowed(attr []byte) bool { - for _, prefix := range whitelistProtocols { - if bytes.HasPrefix(attr, prefix) { - return true - } - } - return false -}
D sanitize_test.go

@@ -1,199 +0,0 @@

-package blackfriday - -import ( - "testing" -) - -func doTestsSanitize(t *testing.T, tests []string) { - doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT, HtmlRendererParameters{}) -} - -func TestSanitizeRawHtmlTag(t *testing.T) { - tests := []string{ - "zz <style>p {}</style>\n", - "<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n", - - "zz <STYLE>p {}</STYLE>\n", - "<p>zz &lt;style&gt;p {}&lt;/style&gt;</p>\n", - - "<SCRIPT>alert()</SCRIPT>\n", - "<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n", - - "zz <SCRIPT>alert()</SCRIPT>\n", - "<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n", - - "zz <script>alert()</script>\n", - "<p>zz &lt;script&gt;alert()&lt;/script&gt;</p>\n", - - " <script>alert()</script>\n", - "<p>&lt;script&gt;alert()&lt;/script&gt;</p>\n", - - "<script>alert()</script>\n", - "&lt;script&gt;alert()&lt;/script&gt;\n", - - "<script src='foo'></script>\n", - "&lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;\n", - - "<script src='a>b'></script>\n", - "&lt;script src=&#39;a&gt;b&#39;&gt;&lt;/script&gt;\n", - - "zz <script src='foo'></script>\n", - "<p>zz &lt;script src=&#39;foo&#39;&gt;&lt;/script&gt;</p>\n", - - "zz <script src=foo></script>\n", - "<p>zz &lt;script src=foo&gt;&lt;/script&gt;</p>\n", - - `<script><script src="http://example.com/exploit.js"></SCRIPT></script>`, - "&lt;script&gt;&lt;script src=&#34;http://example.com/exploit.js&#34;&gt;&lt;/script&gt;&lt;/script&gt;\n", - - `'';!--"<XSS>=&{()}`, - "<p>&#39;&#39;;!--&#34;&lt;xss&gt;=&amp;{()}</p>\n", - - "<SCRIPT SRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p>&lt;script SRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n", - - "<SCRIPT \nSRC=http://ha.ckers.org/xss.js></SCRIPT>", - "<p>&lt;script \nSRC=http://ha.ckers.org/xss.js&gt;&lt;/script&gt;</p>\n", - - `<IMG SRC="javascript:alert('XSS');">`, - "<p><img></p>\n", - - "<IMG SRC=javascript:alert('XSS')>", - "<p><img></p>\n", - - "<IMG SRC=JaVaScRiPt:alert('XSS')>", - "<p><img></p>\n", - - "<IMG 
SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>", - "<p><img></p>\n", - - `<a onmouseover="alert(document.cookie)">xss link</a>`, - "<p><a>xss link</a></p>\n", - - "<a onmouseover=alert(document.cookie)>xss link</a>", - "<p><a>xss link</a></p>\n", - - `<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, - "<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n", - - "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", - "<p><img></p>\n", - - `<IMG SRC=# onmouseover="alert('xxs')">`, - "<p><img src=\"#\"></p>\n", - - `<IMG SRC= onmouseover="alert('xxs')">`, - "<p><img></p>\n", - - `<IMG onmouseover="alert('xxs')">`, - "<p><img></p>\n", - - "<IMG SRC=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;>", - "<p><img></p>\n", - - "<IMG SRC=&#0000106&#0000097&#0000118&#0000097&#0000115&#0000099&#0000114&#0000105&#0000112&#0000116&#0000058&#0000097&#0000108&#0000101&#0000114&#0000116&#0000040&#0000039&#0000088&#0000083&#0000083&#0000039&#0000041>", - "<p><img></p>\n", - - "<IMG SRC=&#x6A&#x61&#x76&#x61&#x73&#x63&#x72&#x69&#x70&#x74&#x3A&#x61&#x6C&#x65&#x72&#x74&#x28&#x27&#x58&#x53&#x53&#x27&#x29>", - "<p><img></p>\n", - - `<IMG SRC="javascriptascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="jav&#x09;ascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="jav&#x0A;ascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC="jav&#x0D;ascript:alert('XSS');">`, - "<p><img></p>\n", - - `<IMG SRC=" &#14; javascript:alert('XSS');">`, - "<p><img></p>\n", - - `<SCRIPT/XSS SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - "<p>&lt;script/XSS SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n", - - "<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>", - "<p>&lt;body onload!#$%&amp;()*~+-_.,:;?@[/|\\]^`=alert(&#34;XSS&#34;)&gt;</p>\n", - - `<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, - 
"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n", - - `<<SCRIPT>alert("XSS");//<</SCRIPT>`, - "<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n", - - "<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >", - "<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n", - - "<SCRIPT SRC=//ha.ckers.org/.j>", - "<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n", - - `<IMG SRC="javascript:alert('XSS')"`, - "<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n", - - "<iframe src=http://ha.ckers.org/scriptlet.html <", - // The hyperlink gets linkified, the <iframe> gets escaped - "<p>&lt;iframe src=<a href=\"http://ha.ckers.org/scriptlet.html\">http://ha.ckers.org/scriptlet.html</a> &lt;</p>\n", - - // Additonal token types: SelfClosing, Comment, DocType. - "<br/>", - "<p><br/></p>\n", - - "<!-- Comment -->", - "<!-- Comment -->\n", - - "<!DOCTYPE test>", - "<p>&lt;!DOCTYPE test&gt;</p>\n", - } - doTestsSanitize(t, tests) -} - -func TestSanitizeQuoteEscaping(t *testing.T) { - tests := []string{ - // Make sure quotes are transported correctly (different entities or - // unicode, but correct semantics) - "<p>Here are some &quot;quotes&quot;.</p>\n", - "<p>Here are some &#34;quotes&#34;.</p>\n", - - "<p>Here are some &ldquo;quotes&rdquo;.</p>\n", - "<p>Here are some \u201Cquotes\u201D.</p>\n", - - // Within a <script> tag, content gets parsed by the raw text parsing rules. - // This test makes sure we correctly disable those parsing rules and do not - // escape e.g. the closing </p>. - `Here are <script> some "quotes".`, - "<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n", - - // Same test for an unknown element that does not switch into raw mode. 
- `Here are <eviltag> some "quotes".`, - "<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n", - } - doTestsSanitize(t, tests) -} - -func TestSanitizeSelfClosingTag(t *testing.T) { - tests := []string{ - "<hr>\n", - "<hr>\n", - - "<hr/>\n", - "<hr/>\n", - - // Make sure that evil attributes are stripped for self closing tags. - "<hr onclick=\"evil()\"/>\n", - "<hr/>\n", - } - doTestsSanitize(t, tests) -} - -func TestSanitizeInlineLink(t *testing.T) { - tests := []string{ - "[link](javascript:evil)", - "<p><a>link</a></p>\n", - "[link](/abc)", - "<p><a href=\"/abc\">link</a></p>\n", - } - doTestsSanitize(t, tests) -}