all repos — grayfriday @ 643477a0516ad3565177aa8ac8371e0eb4c824f9

blackfriday fork with a few changes

Merge pull request #75 from mprobst/sanitize_test

Avoid raw mode parsing so that tags like <script> don't cause escaping
Vytautas Šaltenis vytas@rtfb.lt
Sat, 03 May 2014 15:11:41 +0300
commit

643477a0516ad3565177aa8ac8371e0eb4c824f9

parent

50b8e0370b6d767a2df828f8a3481a6a443fdb61

2 files changed, 20 insertions(+), 8 deletions(-)

jump to
M inline_test.go

@@ -135,7 +135,7 @@ "<a onmouseover=alert(document.cookie)>xss link</a>",

"<p><a>xss link</a></p>\n", `<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`, - "<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n", + "<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n", "<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>", "<p><img></p>\n",

@@ -182,18 +182,14 @@

`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`, "<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n", - // HTML5 interprets the <script> tag contents as raw test, thus the end - // result has double-escaped &amp;quot; `<<SCRIPT>alert("XSS");//<</SCRIPT>`, - "<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n", + "<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n", - // HTML5 parses the </p> within an unclosed <script> tag as text. - // Same for the following tests. "<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >", - "<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n", + "<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n", "<SCRIPT SRC=//ha.ckers.org/.j>", - "<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n", + "<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n", `<IMG SRC="javascript:alert('XSS')"`, "<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",

@@ -220,11 +216,23 @@ }

func TestQuoteEscaping(t *testing.T) { tests := []string{ + // Make sure quotes are transported correctly (different entities or + // unicode, but correct semantics) "<p>Here are some &quot;quotes&quot;.</p>\n", "<p>Here are some &#34;quotes&#34;.</p>\n", "<p>Here are some &ldquo;quotes&rdquo;.</p>\n", "<p>Here are some \u201Cquotes\u201D.</p>\n", + + // Within a <script> tag, content gets parsed by the raw text parsing rules. + // This test makes sure we correctly disable those parsing rules and do not + // escape e.g. the closing </p>. + `Here are <script> some "quotes".`, + "<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n", + + // Same test for an unknown element that does not switch into raw mode. + `Here are <eviltag> some "quotes".`, + "<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n", } doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT) }
M sanitize.go

@@ -107,6 +107,10 @@ wr.WriteString(">")

} else { wr.WriteString(html.EscapeString(string(tokenizer.Raw()))) } + // Make sure that tags like <script> that switch the parser into raw mode + // do not destroy the parse mode for following HTML text (the point is to + // escape them anyway). For that, switch off raw mode in the tokenizer. + tokenizer.NextIsNotRawText() case html.EndTagToken: // Whitelisted tokens can be written in raw. tag, _ := tokenizer.TagName()