Get rid of the preprocess stage Yay!!
@@ -35,10 +35,6 @@ // Parse block-level data.
// Note: this function and many that it calls assume that // the input buffer ends with a newline. func (p *parser) block(data []byte) { - if len(data) == 0 || data[len(data)-1] != '\n' { - panic("block input is missing terminating newline") - } - // this is called recursively: enforce a maximum depth if p.nesting >= p.maxNesting { return@@ -130,7 +126,7 @@ // ______
if p.isHRule(data) { p.addBlock(HorizontalRule, nil) var i int - for i = 0; data[i] != '\n'; i++ { + for i = 0; i < len(data) && data[i] != '\n'; i++ { } data = data[i:] continue@@ -215,10 +211,10 @@ }
if p.flags&SpaceHeaders != 0 { level := 0 - for level < 6 && data[level] == '#' { + for level < 6 && level < len(data) && data[level] == '#' { level++ } - if data[level] != ' ' { + if level == len(data) || data[level] != ' ' { return false } }@@ -227,7 +223,7 @@ }
func (p *parser) prefixHeader(data []byte) int { level := 0 - for level < 6 && data[level] == '#' { + for level < 6 && level < len(data) && data[level] == '#' { level++ } i := skipChar(data, level, ' ')@@ -276,7 +272,7 @@ // test of level 1 header
if data[0] == '=' { i := skipChar(data, 1, '=') i = skipChar(data, i, ' ') - if data[i] == '\n' { + if i < len(data) && data[i] == '\n' { return 1 } return 0@@ -286,7 +282,7 @@ // test of level 2 header
if data[0] == '-' { i := skipChar(data, 1, '-') i = skipChar(data, i, ' ') - if data[i] == '\n' { + if i < len(data) && data[i] == '\n' { return 2 } return 0@@ -444,6 +440,9 @@ }
// HR, which is the only self-closing block tag considered func (p *parser) htmlHr(data []byte, doRender bool) int { + if len(data) < 4 { + return 0 + } if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') { return 0 }@@ -451,13 +450,11 @@ if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
// not an <hr> tag after all; at least not a valid one return 0 } - i := 3 - for data[i] != '>' && data[i] != '\n' { + for i < len(data) && data[i] != '>' && data[i] != '\n' { i++ } - - if data[i] == '>' { + if i < len(data) && data[i] == '>' { i++ if j := p.isEmpty(data[i:]); j > 0 { size := i + j@@ -472,13 +469,12 @@ }
return size } } - return 0 } func (p *parser) htmlFindTag(data []byte) (string, bool) { i := 0 - for isalnum(data[i]) { + for i < len(data) && isalnum(data[i]) { i++ } key := string(data[:i])@@ -535,7 +531,10 @@ if data[i] != ' ' && data[i] != '\t' {
return 0 } } - return i + 1 + if i < len(data) && data[i] == '\n' { + i++ + } + return i } func (*parser) isHRule(data []byte) bool {@@ -554,7 +553,7 @@ c := data[i]
// the whole line must be the char or whitespace n := 0 - for data[i] != '\n' { + for i < len(data) && data[i] != '\n' { switch { case data[i] == c: n++@@ -570,8 +569,7 @@
// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data, // and returns the end index if so, or 0 otherwise. It also returns the marker found. // If syntax is not nil, it gets set to the syntax specified in the fence line. -// A final newline is mandatory to recognize the fence line, unless newlineOptional is true. -func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) { +func isFenceLine(data []byte, syntax *string, oldmarker string) (end int, marker string) { i, size := 0, 0 // skip up to three spaces@@ -613,7 +611,7 @@ syn := 0
i = skipChar(data, i, ' ') if i >= len(data) { - if newlineOptional && i == len(data) { + if i == len(data) { return i, marker } return 0, ""@@ -658,12 +656,11 @@ }
i = skipChar(data, i, ' ') if i >= len(data) || data[i] != '\n' { - if newlineOptional && i == len(data) { + if i == len(data) { return i, marker } return 0, "" } - return i + 1, marker // Take newline into account. }@@ -672,7 +669,7 @@ // or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
// If doRender is true, a final newline is mandatory to recognize the fenced code block. func (p *parser) fencedCodeBlock(data []byte, doRender bool) int { var syntax string - beg, marker := isFenceLine(data, &syntax, "", false) + beg, marker := isFenceLine(data, &syntax, "") if beg == 0 || beg >= len(data) { return 0 }@@ -685,8 +682,7 @@ for {
// safe to assume beg < len(data) // check for the end of the code block - newlineOptional := !doRender - fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional) + fenceEnd, _ := isFenceLine(data[beg:], nil, marker) if fenceEnd != 0 { beg += fenceEnd break@@ -756,7 +752,7 @@ p.addBlock(TableBody, nil)
for i < len(data) { pipes, rowStart := 0, i - for ; data[i] != '\n'; i++ { + for ; i < len(data) && data[i] != '\n'; i++ { if data[i] == '|' { pipes++ }@@ -768,7 +764,9 @@ break
} // include the newline in data sent to tableRow - i++ + if i < len(data)-1 && data[i] == '\n' { + i++ + } p.tableRow(data[rowStart:i], columns, false) }@@ -787,7 +785,7 @@
func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) { i := 0 colCount := 1 - for i = 0; data[i] != '\n'; i++ { + for i = 0; i < len(data) && data[i] != '\n'; i++ { if data[i] == '|' && !isBackslashEscaped(data, i) { colCount++ }@@ -799,7 +797,11 @@ return
} // include the newline in the data sent to tableRow - header := data[:i+1] + j := i + if j < len(data) && data[j] == '\n' { + j++ + } + header := data[:j] // column count ignores pipes at beginning or end of line if data[0] == '|' {@@ -825,7 +827,7 @@
// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3 // and trailing | optional on last column col := 0 - for data[i] != '\n' { + for i < len(data) && data[i] != '\n' { dashes := 0 if data[i] == ':' {@@ -833,19 +835,21 @@ i++
columns[col] |= TableAlignmentLeft dashes++ } - for data[i] == '-' { + for i < len(data) && data[i] == '-' { i++ dashes++ } - if data[i] == ':' { + if i < len(data) && data[i] == ':' { i++ columns[col] |= TableAlignmentRight dashes++ } - for data[i] == ' ' { + for i < len(data) && data[i] == ' ' { i++ } - + if i == len(data) { + return + } // end of column test is messy switch { case dashes < 3:@@ -856,12 +860,12 @@ case data[i] == '|' && !isBackslashEscaped(data, i):
// marker found, now skip past trailing whitespace col++ i++ - for data[i] == ' ' { + for i < len(data) && data[i] == ' ' { i++ } // trailing junk found after last column - if col >= colCount && data[i] != '\n' { + if col >= colCount && i < len(data) && data[i] != '\n' { return }@@ -884,7 +888,10 @@ }
p.addBlock(TableHead, nil) p.tableRow(header, columns, true) - size = i + 1 + size = i + if size < len(data) && data[size] == '\n' { + size++ + } return }@@ -897,13 +904,13 @@ i++
} for col = 0; col < len(columns) && i < len(data); col++ { - for data[i] == ' ' { + for i < len(data) && data[i] == ' ' { i++ } cellStart := i - for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' { + for i < len(data) && (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' { i++ }@@ -912,7 +919,7 @@
// skip the end-of-cell marker, possibly taking us past end of buffer i++ - for cellEnd > cellStart && data[cellEnd-1] == ' ' { + for cellEnd > cellStart && cellEnd-1 < len(data) && data[cellEnd-1] == ' ' { cellEnd-- }@@ -934,11 +941,11 @@
// returns blockquote prefix length func (p *parser) quotePrefix(data []byte) int { i := 0 - for i < 3 && data[i] == ' ' { + for i < 3 && i < len(data) && data[i] == ' ' { i++ } - if data[i] == '>' { - if data[i+1] == ' ' { + if i < len(data) && data[i] == '>' { + if i < len(data)-1 && data[i+1] == ' ' { return i + 2 } return i + 1@@ -968,7 +975,7 @@ end = beg
// Step over whole lines, collecting them. While doing that, check for // fenced code and if one's found, incorporate it altogether, // irregardless of any contents inside it - for data[end] != '\n' { + for end < len(data) && data[end] != '\n' { if p.flags&FencedCode != 0 { if i := p.fencedCodeBlock(data[end:], false); i > 0 { // -1 to compensate for the extra end++ after the loop:@@ -978,7 +985,9 @@ }
} end++ } - end++ + if end < len(data) && data[end] == '\n' { + end++ + } if pre := p.quotePrefix(data[beg:]); pre > 0 { // skip the prefix beg += pre@@ -996,7 +1005,10 @@ }
// returns prefix length for block code func (p *parser) codePrefix(data []byte) int { - if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' { + if data[0] == '\t' { + return 1 + } + if len(data) >= 4 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' { return 4 } return 0@@ -1008,10 +1020,12 @@
i := 0 for i < len(data) { beg := i - for data[i] != '\n' { + for i < len(data) && data[i] != '\n' { i++ } - i++ + if i < len(data) && data[i] == '\n' { + i++ + } blankline := p.isEmpty(data[beg:i]) > 0 if pre := p.codePrefix(data[beg:i]); pre > 0 {@@ -1022,7 +1036,7 @@ i = beg
break } - // verbatim copy to the working buffeu + // verbatim copy to the working buffer if blankline { work.WriteByte('\n') } else {@@ -1052,15 +1066,16 @@
// returns unordered list item prefix func (p *parser) uliPrefix(data []byte) int { i := 0 - // start with up to 3 spaces - for i < 3 && data[i] == ' ' { + for i < len(data) && i < 3 && data[i] == ' ' { i++ } - + if i >= len(data)-1 { + return 0 + } // need a *, +, or - followed by a space if (data[i] != '*' && data[i] != '+' && data[i] != '-') || - data[i+1] != ' ' { + (data[i+1] != ' ' && data[i+1] != '\t') { return 0 } return i + 2@@ -1071,18 +1086,21 @@ func (p *parser) oliPrefix(data []byte) int {
i := 0 // start with up to 3 spaces - for i < 3 && data[i] == ' ' { + for i < 3 && i < len(data) && data[i] == ' ' { i++ } // count the digits start := i - for data[i] >= '0' && data[i] <= '9' { + for i < len(data) && data[i] >= '0' && data[i] <= '9' { i++ } + if start == i || i >= len(data)-1 { + return 0 + } // we need >= 1 digits followed by a dot and a space - if start == i || data[i] != '.' || data[i+1] != ' ' { + if data[i] != '.' || !(data[i+1] == ' ' || data[i+1] == '\t') { return 0 } return i + 2@@ -1090,13 +1108,15 @@ }
// returns definition list item prefix func (p *parser) dliPrefix(data []byte) int { + if len(data) < 2 { + return 0 + } i := 0 - // need a : followed by a spaces - if data[i] != ':' || data[i+1] != ' ' { + if data[i] != ':' || !(data[i+1] == ' ' || data[i+1] == '\t') { return 0 } - for data[i] == ' ' { + for i < len(data) && data[i] == ' ' { i++ } return i + 2@@ -1174,6 +1194,9 @@ // Assumes initial prefix is already removed if this is a sublist.
func (p *parser) listItem(data []byte, flags *ListType) int { // keep track of the indentation of the first line itemIndent := 0 + if data[itemIndent] == '\t' { + itemIndent += 4 + } for itemIndent < 3 && data[itemIndent] == ' ' { itemIndent++ }@@ -1202,13 +1225,13 @@ }
} // skip leading whitespace on first line - for data[i] == ' ' { + for i < len(data) && data[i] == ' ' { i++ } // find the end of the line line := i - for i > 0 && data[i-1] != '\n' { + for i > 0 && i < len(data) && data[i-1] != '\n' { i++ }@@ -1228,7 +1251,7 @@ for line < len(data) {
i++ // find the end of this line - for data[i-1] != '\n' { + for i < len(data) && data[i-1] != '\n' { i++ }@@ -1242,11 +1265,18 @@ }
// calculate the indentation indent := 0 - for indent < 4 && line+indent < i && data[line+indent] == ' ' { - indent++ + indentIndex := 0 + if data[line] == '\t' { + indentIndex++ + indent += 4 + } else { + for indent < 4 && line+indent < i && data[line+indent] == ' ' { + indent++ + indentIndex++ + } } - chunk := data[line+indent : i] + chunk := data[line+indentIndex : i] // evaluate how this line fits in switch {@@ -1287,7 +1317,7 @@ case containsBlankLine && indent < 4:
if *flags&ListTypeDefinition != 0 && i < len(data)-1 { // is the next item still a part of this list? next := i - for data[next] != '\n' { + for next < len(data) && data[next] != '\n' { next++ } for next < len(data)-1 && data[next] == '\n' {@@ -1315,7 +1345,7 @@ raw.WriteByte('\n')
} // add the line into the working buffer without prefix - raw.Write(data[line+indent : i]) + raw.Write(data[line+indentIndex : i]) line = i }@@ -1363,8 +1393,11 @@ for data[beg] == ' ' {
beg++ } + end := len(data) // trim trailing newline - end := len(data) - 1 + if data[len(data)-1] == '\n' { + end-- + } // trim trailing spaces for end > beg && data[end-1] == ' ' {@@ -1403,7 +1436,8 @@ if n := p.isEmpty(current); n > 0 {
// did this blank line followed by a definition list item? if p.flags&DefinitionLists != 0 { if i < len(data)-1 && data[i+1] == ':' { - return p.list(data[prev:], ListTypeDefinition) + ret := p.list(data[prev:], ListTypeDefinition) + return ret } }@@ -1436,7 +1470,7 @@ block.Level = level
block.HeaderID = id // find the end of the underline - for data[i] != '\n' { + for i < len(data) && data[i] != '\n' { i++ } return i@@ -1469,7 +1503,8 @@
// if there's a definition list item, prev line is a definition term if p.flags&DefinitionLists != 0 { if p.dliPrefix(current) != 0 { - return p.list(data[prev:], ListTypeDefinition) + ret := p.list(data[prev:], ListTypeDefinition) + return ret } }@@ -1485,7 +1520,12 @@ }
} // otherwise, scan to the beginning of the next line - i += bytes.IndexByte(data[i:], '\n') + 1 + nl := bytes.IndexByte(data[i:], '\n') + if nl >= 0 { + i += nl + 1 + } else { + i += len(data[i:]) + } } p.renderParagraph(data[:i])
@@ -1655,14 +1655,14 @@ func TestIsFenceLine(t *testing.T) {
tests := []struct { data []byte syntaxRequested bool - newlineOptional bool wantEnd int wantMarker string wantSyntax string }{ { - data: []byte("```"), - wantEnd: 0, + data: []byte("```"), + wantEnd: 3, + wantMarker: "```", }, { data: []byte("```\nstuff here\n"),@@ -1681,21 +1681,13 @@ wantEnd: 0,
}, { data: []byte("```"), - newlineOptional: true, - wantEnd: 3, - wantMarker: "```", - }, - { - data: []byte("```"), syntaxRequested: true, - newlineOptional: true, wantEnd: 3, wantMarker: "```", }, { data: []byte("``` go"), syntaxRequested: true, - newlineOptional: true, wantEnd: 6, wantMarker: "```", wantSyntax: "go",@@ -1707,7 +1699,7 @@ var syntax *string
if test.syntaxRequested { syntax = new(string) } - end, marker := isFenceLine(test.data, syntax, "```", test.newlineOptional) + end, marker := isFenceLine(test.data, syntax, "```") if got, want := end, test.wantEnd; got != want { t.Errorf("got end %v, want %v", got, want) }
@@ -1142,7 +1142,7 @@
func TestSkipHTML(t *testing.T) { doTestsParam(t, []string{ "<div class=\"foo\"></div>\n\ntext\n\n<form>the form</form>", - "<p>text</p>\n", + "<p>text</p>\n\n<p>the form</p>\n", "text <em>inline html</em> more text", "<p>text inline html more text</p>\n",
@@ -387,7 +387,7 @@ if extensions&Footnotes != 0 {
p.notes = make([]*reference, 0) } - p.block(preprocess(p, input)) + p.block(input) // Walk the tree and finish up some of unfinished blocks for p.tip != nil { p.finalize(p.tip)@@ -440,63 +440,6 @@ node.content = nil
} return GoToNext }) -} - -// preprocess does a preparatory first pass over the input: -// - normalize newlines -// - expand tabs (outside of fenced code blocks) -// - copy everything else -func preprocess(p *parser, input []byte) []byte { - var out bytes.Buffer - tabSize := TabSizeDefault - if p.flags&TabSizeEight != 0 { - tabSize = TabSizeDouble - } - beg := 0 - lastFencedCodeBlockEnd := 0 - for beg < len(input) { - // Find end of this line, then process the line. - end := beg - for end < len(input) && input[end] != '\n' && input[end] != '\r' { - end++ - } - - if p.flags&FencedCode != 0 { - // track fenced code block boundaries to suppress tab expansion - // and reference extraction inside them: - if beg >= lastFencedCodeBlockEnd { - if i := p.fencedCodeBlock(input[beg:], false); i > 0 { - lastFencedCodeBlockEnd = beg + i - } - } - } - - // add the line body if present - if end > beg { - if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks. - out.Write(input[beg:end]) - } else { - expandTabs(&out, input[beg:end], tabSize) - } - } - - if end < len(input) && input[end] == '\r' { - end++ - } - if end < len(input) && input[end] == '\n' { - end++ - } - out.WriteByte('\n') - - beg = end - } - - // empty input? - if out.Len() == 0 { - out.WriteByte('\n') - } - - return out.Bytes() } //
@@ -1,13 +1,13 @@
<p>Here's a simple block:</p> <div> - foo + foo </div> <p>This should be a code block, though:</p> <pre><code><div> - foo + foo </div> </code></pre>@@ -19,11 +19,11 @@
<p>Now, nested:</p> <div> - <div> - <div> - foo - </div> - </div> + <div> + <div> + foo + </div> + </div> </div> <p>This should just be an HTML comment:</p>
@@ -3,7 +3,7 @@
<!-- This is a simple comment --> <!-- - This is another comment. + This is another comment. --> <p>Paragraph two.</p>
@@ -939,8 +939,8 @@ {} curly braces
[] square brackets () parentheses # hash mark -+ plus sign -- minus sign (hyphen) ++ plus sign +- minus sign (hyphen) . dot ! exclamation mark </code></pre>
@@ -13,13 +13,13 @@ </code></pre>
<p>And:</p> -<pre><code> this code block is indented by two tabs +<pre><code> this code block is indented by two tabs </code></pre> <p>And:</p> -<pre><code>+ this is an example list item - indented with tabs +<pre><code>+ this is an example list item + indented with tabs + this is an example list item indented with spaces