all repos — grayfriday @ 11e042f6c12bc52bac1a6ec03a2be6f19b592890

blackfriday fork with a few changes

Avoid raw mode parsing so that raw mode tags like <script> don't cause issues.

Certain tags like <script> but also <title> and others switch an HTML5 parser
into raw mode, which causes the rest of the HTML string to be always parsed as
text, including any elements or entities that we do want to support (e.g. <p>).

As we're going to escape any of the raw text elements anyway (e.g. script,
style, title, xmp, noframes, and a couple of others), we can just switch off raw
text parsing by disabling it after each starting tag.
Martin Probst martin@probst.io
Sat, 03 May 2014 12:58:25 +0200
commit

11e042f6c12bc52bac1a6ec03a2be6f19b592890

parent

50b8e0370b6d767a2df828f8a3481a6a443fdb61

2 files changed, 20 insertions(+), 8 deletions(-)

jump to
M inline_test.goinline_test.go

@@ -135,7 +135,7 @@ "<a onmouseover=alert(document.cookie)>xss link</a>",

"<p><a>xss link</a></p>\n", `<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, - "<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n", + "<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n", "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", "<p><img></p>\n",

@@ -182,18 +182,14 @@

`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, "<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n", - // HTML5 interprets the <script> tag contents as raw test, thus the end - // result has double-escaped &amp;quot; `<<SCRIPT>alert("XSS");//<</SCRIPT>`, - "<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n", + "<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n", - // HTML5 parses the </p> within an unclosed <script> tag as text. - // Same for the following tests. "<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >", - "<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n", + "<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n", "<SCRIPT SRC=//ha.ckers.org/.j>", - "<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n", + "<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n", `<IMG SRC="javascript:alert('XSS')"`, "<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",

@@ -220,11 +216,23 @@ }

func TestQuoteEscaping(t *testing.T) { tests := []string{ + // Make sure quotes are transported correctly (different entities or + // unicode, but correct semantics) "<p>Here are some &quot;quotes&quot;.</p>\n", "<p>Here are some &#34;quotes&#34;.</p>\n", "<p>Here are some &ldquo;quotes&rdquo;.</p>\n", "<p>Here are some \u201Cquotes\u201D.</p>\n", + + // Within a <script> tag, content gets parsed by the raw text parsing rules. + // This test makes sure we correctly disable those parsing rules and do not + // escape e.g. the closing </p>. + `Here are <script> some "quotes".`, + "<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n", + + // Same test for an unknown element that does not switch into raw mode. + `Here are <eviltag> some "quotes".`, + "<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n", } doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT) }
M sanitize.gosanitize.go

@@ -107,6 +107,10 @@ wr.WriteString(">")

} else { wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) } + // Make sure that tags like <script> that switch the parser into raw mode + // do not destroy the parse mode for following HTML text (the point is to + // escape them anyway). For that, switch off raw mode in the tokenizer. + tokenizer.NextIsNotRawText() case html.EndTagToken: // Whitelisted tokens can be written in raw. tag, _ := tokenizer.TagName()