icy does git — grayfriday: 643477a0516ad3565177aa8ac8371e0eb4c824f9

Merge pull request #75 from mprobst/sanitize_test

Avoid raw mode parsing so that tags like <script> don't cause escaping

Vytautas Šaltenis vytas@rtfb.lt

Sat, 03 May 2014 15:11:41 +0300

commit

643477a0516ad3565177aa8ac8371e0eb4c824f9

parent

50b8e0370b6d767a2df828f8a3481a6a443fdb61

2 files changed, 20 insertions(+), 8 deletions(-)

jump to

inline_test.go

sanitize.go

M inline_test.go → inline_test.go

@@ -135,7 +135,7 @@ "<a onmouseover=alert(document.cookie)>xss link</a>",
 		"<p><a>xss link</a></p>\n",
 
 		`<IMG """><SCRIPT>alert("XSS")</SCRIPT>">`,
-		"<p><img>&lt;script&gt;alert(&amp;quot;XSS&amp;quot;)&lt;/script&gt;&#34;&gt;</p>\n",
+		"<p><img>&lt;script&gt;alert(&#34;XSS&#34;)&lt;/script&gt;&#34;&gt;</p>\n",
 
 		"<IMG SRC=javascript:alert(String.fromCharCode(88,83,83))>",
 		"<p><img></p>\n",
@@ -182,18 +182,14 @@
 		`<SCRIPT/SRC="http://ha.ckers.org/xss.js"></SCRIPT>`,
 		"<p>&lt;script/SRC=&#34;http://ha.ckers.org/xss.js&#34;&gt;&lt;/script&gt;</p>\n",
 
-		// HTML5 interprets the <script> tag contents as raw test, thus the end
-		// result has double-escaped &amp;quot;
 		`<<SCRIPT>alert("XSS");//<</SCRIPT>`,
-		"<p>&lt;&lt;script&gt;alert(&amp;quot;XSS&amp;quot;);//&amp;lt;&lt;/script&gt;</p>\n",
+		"<p>&lt;&lt;script&gt;alert(&#34;XSS&#34;);//&lt;&lt;/script&gt;</p>\n",
 
-		// HTML5 parses the </p> within an unclosed <script> tag as text.
-		// Same for the following tests.
 		"<SCRIPT SRC=http://ha.ckers.org/xss.js?< B >",
-		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;&lt;/p&gt;\n",
+		"<p>&lt;script SRC=http://ha.ckers.org/xss.js?&lt; B &gt;</p>\n",
 
 		"<SCRIPT SRC=//ha.ckers.org/.j>",
-		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;&lt;/p&gt;\n",
+		"<p>&lt;script SRC=//ha.ckers.org/.j&gt;</p>\n",
 
 		`<IMG SRC="javascript:alert('XSS')"`,
 		"<p>&lt;IMG SRC=&#34;javascript:alert(&#39;XSS&#39;)&#34;</p>\n",
@@ -220,11 +216,23 @@ }
 
 func TestQuoteEscaping(t *testing.T) {
 	tests := []string{
+		// Make sure quotes are transported correctly (different entities or
+		// unicode, but correct semantics)
 		"<p>Here are some &quot;quotes&quot;.</p>\n",
 		"<p>Here are some &#34;quotes&#34;.</p>\n",
 
 		"<p>Here are some &ldquo;quotes&rdquo;.</p>\n",
 		"<p>Here are some \u201Cquotes\u201D.</p>\n",
+
+		// Within a <script> tag, content gets parsed by the raw text parsing rules.
+		// This test makes sure we correctly disable those parsing rules and do not
+		// escape e.g. the closing </p>.
+		`Here are <script> some "quotes".`,
+		"<p>Here are &lt;script&gt; some &#34;quotes&#34;.</p>\n",
+
+		// Same test for an unknown element that does not switch into raw mode.
+		`Here are <eviltag> some "quotes".`,
+		"<p>Here are &lt;eviltag&gt; some &#34;quotes&#34;.</p>\n",
 	}
 	doTestsInlineParam(t, tests, 0, HTML_SKIP_STYLE|HTML_SANITIZE_OUTPUT)
 }

M sanitize.go → sanitize.go

@@ -107,6 +107,10 @@ wr.WriteString(">")
 			} else {
 				wr.WriteString(html.EscapeString(string(tokenizer.Raw())))
 			}
+			// Make sure that tags like <script> that switch the parser into raw mode
+			// do not destroy the parse mode for following HTML text (the point is to
+			// escape them anyway). For that, switch off raw mode in the tokenizer.
+			tokenizer.NextIsNotRawText()
 		case html.EndTagToken:
 			// Whitelisted tokens can be written in raw.
 			tag, _ := tokenizer.TagName()

all repos — grayfriday @ 643477a0516ad3565177aa8ac8371e0eb4c824f9

blackfriday fork with a few changes