all repos — grayfriday @ 55bb56bf9b0fd106c6a7b6f89123fbf435bd8e50

blackfriday fork with a few changes

Merge pull request #55 from rtfb/master

Autolink fixes
Vytautas Šaltenis vytas@rtfb.lt
Sun, 30 Mar 2014 19:58:39 +0300
commit

55bb56bf9b0fd106c6a7b6f89123fbf435bd8e50

parent

d643453f1ef4fe5d73e8697dfac24435ae6ad99d

3 files changed, 107 insertions(+), 40 deletions(-)

jump to
M html.go

@@ -43,7 +43,7 @@ HTML_SMARTYPANTS_LATEX_DASHES // enable LaTeX-style dashes (with HTML_USE_SMARTYPANTS)

) var ( - tags = []string{ + tags = []string{ "b", "blockquote", "code",

@@ -71,10 +71,12 @@ "strong",

"strike", "ul", } - urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` + urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+` tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`) - anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`) - imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) + anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`) + imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`) + // TODO: improve this regexp to catch all possible entities: + htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`) ) // Html is a type that implements the Renderer interface for HTML output.

@@ -128,50 +130,51 @@ smartypants: smartypants(flags),

} } +// Using if statements is a bit faster than a switch statement. As the compiler +// improves, this should be unnecessary this is only worthwhile because +// attrEscape is the single largest CPU user in normal use. +// Also tried using map, but that gave a ~3x slowdown. +func escapeSingleChar(char byte) (string, bool) { + if char == '"' { + return "&quot;", true + } + if char == '&' { + return "&amp;", true + } + if char == '<' { + return "&lt;", true + } + if char == '>' { + return "&gt;", true + } + return "", false +} + func attrEscape(out *bytes.Buffer, src []byte) { org := 0 for i, ch := range src { - // using if statements is a bit faster than a switch statement. - // as the compiler improves, this should be unnecessary - // this is only worthwhile because attrEscape is the single - // largest CPU user in normal use - if ch == '"' { + if entity, ok := escapeSingleChar(ch); ok { if i > org { // copy all the normal characters since the last escape out.Write(src[org:i]) } org = i + 1 - out.WriteString("&quot;") - continue - } - if ch == '&' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("&amp;") - continue - } - if ch == '<' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("&lt;") - continue - } - if ch == '>' { - if i > org { - out.Write(src[org:i]) - } - org = i + 1 - out.WriteString("&gt;") - continue + out.WriteString(entity) } } if org < len(src) { out.Write(src[org:]) } +} + +func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) { + end := 0 + for _, rang := range skipRanges { + attrEscape(out, src[end:rang[0]]) + out.Write(src[rang[0]:rang[1]]) + end = rang[1] + } + attrEscape(out, src[end:]) } func (options *Html) GetFlags() int {

@@ -418,10 +421,11 @@ out.WriteString("</p>\n")

} func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) { + skipRanges := htmlEntity.FindAllIndex(link, -1) if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL { // mark it but don't link it if it is not a safe link: no smartypants out.WriteString("<tt>") - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) out.WriteString("</tt>") return }

@@ -430,7 +434,7 @@ out.WriteString("<a href=\"")

if kind == LINK_TYPE_EMAIL { out.WriteString("mailto:") } - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) out.WriteString("\">") // Pretty print: if we get an email address as

@@ -442,7 +446,7 @@ attrEscape(out, link[len("mailto://"):])

case bytes.HasPrefix(link, []byte("mailto:")): attrEscape(out, link[len("mailto:"):]) default: - attrEscape(out, link) + entityEscapeWithSkip(out, link, skipRanges) } out.WriteString("</a>")
M inline.go

@@ -15,7 +15,12 @@ package blackfriday

import ( "bytes" + "regexp" "strconv" +) + +var ( + anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`) ) // Functions to parse text within a block

@@ -612,12 +617,34 @@

return end } +func linkEndsWithEntity(data []byte, linkEnd int) bool { + entityRanges := htmlEntity.FindAllIndex(data[:linkEnd], -1) + if entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd { + return true + } + return false +} + func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int { // quick check to rule out most false hits on ':' if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' { return 0 } + // Now a more expensive check to see if we're not inside an anchor element + anchorStart := offset + offsetFromAnchor := 0 + for anchorStart > 0 && data[anchorStart] != '<' { + anchorStart-- + offsetFromAnchor++ + } + + anchorStr := anchorRe.Find(data[anchorStart:]) + if anchorStr != nil { + out.Write(anchorStr[offsetFromAnchor:]) + return len(anchorStr) - offsetFromAnchor + } + // scan backward for a word boundary rewind := 0 for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {

@@ -635,12 +662,17 @@ return 0

} linkEnd := 0 - for linkEnd < len(data) && !isspace(data[linkEnd]) { + for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) { linkEnd++ } // Skip punctuation at the end of the link - if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' { + if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' { + linkEnd-- + } + + // But don't skip semicolon if it's a part of escaped entity: + if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) { linkEnd-- }

@@ -716,6 +748,10 @@ p.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)

} return linkEnd - rewind +} + +func isEndOfLink(char byte) bool { + return isspace(char) || char == '<' } var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://"), []byte("/")}
M inline_test.go

@@ -682,6 +682,33 @@

"even a > can be escaped <http://new.com?q=\\>&etc>\n", "<p>even a &gt; can be escaped <a href=\"http://new.com?q=&gt;&amp;etc\">" + "http://new.com?q=&gt;&amp;etc</a></p>\n", + + "<a href=\"http://fancy.com\">http://fancy.com</a>\n", + "<p><a href=\"http://fancy.com\">http://fancy.com</a></p>\n", + + "<a href=\"http://fancy.com\">This is a link</a>\n", + "<p><a href=\"http://fancy.com\">This is a link</a></p>\n", + + "<a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a>\n", + "<p><a href=\"http://www.fancy.com/A_B.pdf\">http://www.fancy.com/A_B.pdf</a></p>\n", + + "(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (\n", + "<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (</p>\n", + + "(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).\n", + "<p>(<a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a> (part two: <a href=\"http://www.fancy.com/A_B\">http://www.fancy.com/A_B</a>)).</p>\n", + + "http://www.foo.com<br />\n", + "<p><a href=\"http://www.foo.com\">http://www.foo.com</a><br /></p>\n", + + "http://foo.com/viewtopic.php?f=18&amp;t=297", + "<p><a href=\"http://foo.com/viewtopic.php?f=18&amp;t=297\">http://foo.com/viewtopic.php?f=18&amp;t=297</a></p>\n", + + "http://foo.com/viewtopic.php?param=&quot;18&quot;zz", + "<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;zz\">http://foo.com/viewtopic.php?param=&quot;18&quot;zz</a></p>\n", + + "http://foo.com/viewtopic.php?param=&quot;18&quot;", + "<p><a href=\"http://foo.com/viewtopic.php?param=&quot;18&quot;\">http://foo.com/viewtopic.php?param=&quot;18&quot;</a></p>\n", } doTestsInline(t, tests) }