icy does git — grayfriday: 120bb2fae155ccb71b20303046b7cee8e4bbc9c4

Get rid of the preprocess stage

Yay!!

Vytautas Šaltenis vytas@rtfb.lt

Thu, 10 Nov 2016 21:49:58 +0200

commit

120bb2fae155ccb71b20303046b7cee8e4bbc9c4

parent

22a3e5b744f34ee53bd864384eedb821791d4951

8 files changed, 132 insertions(+), 157 deletions(-)

jump to

block.go

block_test.go

inline_test.go

markdown.go

testdata/Inline HTML (Simple).html

testdata/Inline HTML comments.html

testdata/Markdown Documentation - Syntax.html

testdata/Tabs.html

M block.go → block.go

@@ -35,10 +35,6 @@ // Parse block-level data.
 // Note: this function and many that it calls assume that
 // the input buffer ends with a newline.
 func (p *parser) block(data []byte) {
-	if len(data) == 0 || data[len(data)-1] != '\n' {
-		panic("block input is missing terminating newline")
-	}
-
 	// this is called recursively: enforce a maximum depth
 	if p.nesting >= p.maxNesting {
 		return
@@ -130,7 +126,7 @@ // ______
 		if p.isHRule(data) {
 			p.addBlock(HorizontalRule, nil)
 			var i int
-			for i = 0; data[i] != '\n'; i++ {
+			for i = 0; i < len(data) && data[i] != '\n'; i++ {
 			}
 			data = data[i:]
 			continue
@@ -215,10 +211,10 @@ }
 
 	if p.flags&SpaceHeaders != 0 {
 		level := 0
-		for level < 6 && data[level] == '#' {
+		for level < 6 && level < len(data) && data[level] == '#' {
 			level++
 		}
-		if data[level] != ' ' {
+		if level == len(data) || data[level] != ' ' {
 			return false
 		}
 	}
@@ -227,7 +223,7 @@ }
 
 func (p *parser) prefixHeader(data []byte) int {
 	level := 0
-	for level < 6 && data[level] == '#' {
+	for level < 6 && level < len(data) && data[level] == '#' {
 		level++
 	}
 	i := skipChar(data, level, ' ')
@@ -276,7 +272,7 @@ // test of level 1 header
 	if data[0] == '=' {
 		i := skipChar(data, 1, '=')
 		i = skipChar(data, i, ' ')
-		if data[i] == '\n' {
+		if i < len(data) && data[i] == '\n' {
 			return 1
 		}
 		return 0
@@ -286,7 +282,7 @@ // test of level 2 header
 	if data[0] == '-' {
 		i := skipChar(data, 1, '-')
 		i = skipChar(data, i, ' ')
-		if data[i] == '\n' {
+		if i < len(data) && data[i] == '\n' {
 			return 2
 		}
 		return 0
@@ -444,6 +440,9 @@ }
 
 // HR, which is the only self-closing block tag considered
 func (p *parser) htmlHr(data []byte, doRender bool) int {
+	if len(data) < 4 {
+		return 0
+	}
 	if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
 		return 0
 	}
@@ -451,13 +450,11 @@ if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
 		// not an <hr> tag after all; at least not a valid one
 		return 0
 	}
-
 	i := 3
-	for data[i] != '>' && data[i] != '\n' {
+	for i < len(data) && data[i] != '>' && data[i] != '\n' {
 		i++
 	}
-
-	if data[i] == '>' {
+	if i < len(data) && data[i] == '>' {
 		i++
 		if j := p.isEmpty(data[i:]); j > 0 {
 			size := i + j
@@ -472,13 +469,12 @@ }
 			return size
 		}
 	}
-
 	return 0
 }
 
 func (p *parser) htmlFindTag(data []byte) (string, bool) {
 	i := 0
-	for isalnum(data[i]) {
+	for i < len(data) && isalnum(data[i]) {
 		i++
 	}
 	key := string(data[:i])
@@ -535,7 +531,10 @@ if data[i] != ' ' && data[i] != '\t' {
 			return 0
 		}
 	}
-	return i + 1
+	if i < len(data) && data[i] == '\n' {
+		i++
+	}
+	return i
 }
 
 func (*parser) isHRule(data []byte) bool {
@@ -554,7 +553,7 @@ c := data[i]
 
 	// the whole line must be the char or whitespace
 	n := 0
-	for data[i] != '\n' {
+	for i < len(data) && data[i] != '\n' {
 		switch {
 		case data[i] == c:
 			n++
@@ -570,8 +569,7 @@
 // isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
 // and returns the end index if so, or 0 otherwise. It also returns the marker found.
 // If syntax is not nil, it gets set to the syntax specified in the fence line.
-// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
-func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
+func isFenceLine(data []byte, syntax *string, oldmarker string) (end int, marker string) {
 	i, size := 0, 0
 
 	// skip up to three spaces
@@ -613,7 +611,7 @@ syn := 0
 		i = skipChar(data, i, ' ')
 
 		if i >= len(data) {
-			if newlineOptional && i == len(data) {
+			if i == len(data) {
 				return i, marker
 			}
 			return 0, ""
@@ -658,12 +656,11 @@ }
 
 	i = skipChar(data, i, ' ')
 	if i >= len(data) || data[i] != '\n' {
-		if newlineOptional && i == len(data) {
+		if i == len(data) {
 			return i, marker
 		}
 		return 0, ""
 	}
-
 	return i + 1, marker // Take newline into account.
 }
 
@@ -672,7 +669,7 @@ // or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
 // If doRender is true, a final newline is mandatory to recognize the fenced code block.
 func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
 	var syntax string
-	beg, marker := isFenceLine(data, &syntax, "", false)
+	beg, marker := isFenceLine(data, &syntax, "")
 	if beg == 0 || beg >= len(data) {
 		return 0
 	}
@@ -685,8 +682,7 @@ for {
 		// safe to assume beg < len(data)
 
 		// check for the end of the code block
-		newlineOptional := !doRender
-		fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
+		fenceEnd, _ := isFenceLine(data[beg:], nil, marker)
 		if fenceEnd != 0 {
 			beg += fenceEnd
 			break
@@ -756,7 +752,7 @@ p.addBlock(TableBody, nil)
 
 	for i < len(data) {
 		pipes, rowStart := 0, i
-		for ; data[i] != '\n'; i++ {
+		for ; i < len(data) && data[i] != '\n'; i++ {
 			if data[i] == '|' {
 				pipes++
 			}
@@ -768,7 +764,9 @@ break
 		}
 
 		// include the newline in data sent to tableRow
-		i++
+		if i < len(data)-1 && data[i] == '\n' {
+			i++
+		}
 		p.tableRow(data[rowStart:i], columns, false)
 	}
 
@@ -787,7 +785,7 @@
 func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
 	i := 0
 	colCount := 1
-	for i = 0; data[i] != '\n'; i++ {
+	for i = 0; i < len(data) && data[i] != '\n'; i++ {
 		if data[i] == '|' && !isBackslashEscaped(data, i) {
 			colCount++
 		}
@@ -799,7 +797,11 @@ return
 	}
 
 	// include the newline in the data sent to tableRow
-	header := data[:i+1]
+	j := i
+	if j < len(data) && data[j] == '\n' {
+		j++
+	}
+	header := data[:j]
 
 	// column count ignores pipes at beginning or end of line
 	if data[0] == '|' {
@@ -825,7 +827,7 @@
 	// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
 	// and trailing | optional on last column
 	col := 0
-	for data[i] != '\n' {
+	for i < len(data) && data[i] != '\n' {
 		dashes := 0
 
 		if data[i] == ':' {
@@ -833,19 +835,21 @@ i++
 			columns[col] |= TableAlignmentLeft
 			dashes++
 		}
-		for data[i] == '-' {
+		for i < len(data) && data[i] == '-' {
 			i++
 			dashes++
 		}
-		if data[i] == ':' {
+		if i < len(data) && data[i] == ':' {
 			i++
 			columns[col] |= TableAlignmentRight
 			dashes++
 		}
-		for data[i] == ' ' {
+		for i < len(data) && data[i] == ' ' {
 			i++
 		}
-
+		if i == len(data) {
+			return
+		}
 		// end of column test is messy
 		switch {
 		case dashes < 3:
@@ -856,12 +860,12 @@ case data[i] == '|' && !isBackslashEscaped(data, i):
 			// marker found, now skip past trailing whitespace
 			col++
 			i++
-			for data[i] == ' ' {
+			for i < len(data) && data[i] == ' ' {
 				i++
 			}
 
 			// trailing junk found after last column
-			if col >= colCount && data[i] != '\n' {
+			if col >= colCount && i < len(data) && data[i] != '\n' {
 				return
 			}
 
@@ -884,7 +888,10 @@ }
 
 	p.addBlock(TableHead, nil)
 	p.tableRow(header, columns, true)
-	size = i + 1
+	size = i
+	if size < len(data) && data[size] == '\n' {
+		size++
+	}
 	return
 }
 
@@ -897,13 +904,13 @@ i++
 	}
 
 	for col = 0; col < len(columns) && i < len(data); col++ {
-		for data[i] == ' ' {
+		for i < len(data) && data[i] == ' ' {
 			i++
 		}
 
 		cellStart := i
 
-		for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
+		for i < len(data) && (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
 			i++
 		}
 
@@ -912,7 +919,7 @@
 		// skip the end-of-cell marker, possibly taking us past end of buffer
 		i++
 
-		for cellEnd > cellStart && data[cellEnd-1] == ' ' {
+		for cellEnd > cellStart && cellEnd-1 < len(data) && data[cellEnd-1] == ' ' {
 			cellEnd--
 		}
 
@@ -934,11 +941,11 @@
 // returns blockquote prefix length
 func (p *parser) quotePrefix(data []byte) int {
 	i := 0
-	for i < 3 && data[i] == ' ' {
+	for i < 3 && i < len(data) && data[i] == ' ' {
 		i++
 	}
-	if data[i] == '>' {
-		if data[i+1] == ' ' {
+	if i < len(data) && data[i] == '>' {
+		if i < len(data)-1 && data[i+1] == ' ' {
 			return i + 2
 		}
 		return i + 1
@@ -968,7 +975,7 @@ end = beg
 		// Step over whole lines, collecting them. While doing that, check for
 		// fenced code and if one's found, incorporate it altogether,
 		// regardless of any contents inside it
-		for data[end] != '\n' {
+		for end < len(data) && data[end] != '\n' {
 			if p.flags&FencedCode != 0 {
 				if i := p.fencedCodeBlock(data[end:], false); i > 0 {
 					// -1 to compensate for the extra end++ after the loop:
@@ -978,7 +985,9 @@ }
 			}
 			end++
 		}
-		end++
+		if end < len(data) && data[end] == '\n' {
+			end++
+		}
 		if pre := p.quotePrefix(data[beg:]); pre > 0 {
 			// skip the prefix
 			beg += pre
@@ -996,7 +1005,10 @@ }
 
 // returns prefix length for block code
 func (p *parser) codePrefix(data []byte) int {
-	if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
+	if data[0] == '\t' {
+		return 1
+	}
+	if len(data) >= 4 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
 		return 4
 	}
 	return 0
@@ -1008,10 +1020,12 @@
 	i := 0
 	for i < len(data) {
 		beg := i
-		for data[i] != '\n' {
+		for i < len(data) && data[i] != '\n' {
 			i++
 		}
-		i++
+		if i < len(data) && data[i] == '\n' {
+			i++
+		}
 
 		blankline := p.isEmpty(data[beg:i]) > 0
 		if pre := p.codePrefix(data[beg:i]); pre > 0 {
@@ -1022,7 +1036,7 @@ i = beg
 			break
 		}
 
-		// verbatim copy to the working buffeu
+		// verbatim copy to the working buffer
 		if blankline {
 			work.WriteByte('\n')
 		} else {
@@ -1052,15 +1066,16 @@
 // returns unordered list item prefix
 func (p *parser) uliPrefix(data []byte) int {
 	i := 0
-
 	// start with up to 3 spaces
-	for i < 3 && data[i] == ' ' {
+	for i < len(data) && i < 3 && data[i] == ' ' {
 		i++
 	}
-
+	if i >= len(data)-1 {
+		return 0
+	}
 	// need a *, +, or - followed by a space
 	if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
-		data[i+1] != ' ' {
+		(data[i+1] != ' ' && data[i+1] != '\t') {
 		return 0
 	}
 	return i + 2
@@ -1071,18 +1086,21 @@ func (p *parser) oliPrefix(data []byte) int {
 	i := 0
 
 	// start with up to 3 spaces
-	for i < 3 && data[i] == ' ' {
+	for i < 3 && i < len(data) && data[i] == ' ' {
 		i++
 	}
 
 	// count the digits
 	start := i
-	for data[i] >= '0' && data[i] <= '9' {
+	for i < len(data) && data[i] >= '0' && data[i] <= '9' {
 		i++
 	}
+	if start == i || i >= len(data)-1 {
+		return 0
+	}
 
 	// we need >= 1 digits followed by a dot and a space
-	if start == i || data[i] != '.' || data[i+1] != ' ' {
+	if data[i] != '.' || !(data[i+1] == ' ' || data[i+1] == '\t') {
 		return 0
 	}
 	return i + 2
@@ -1090,13 +1108,15 @@ }
 
 // returns definition list item prefix
 func (p *parser) dliPrefix(data []byte) int {
+	if len(data) < 2 {
+		return 0
+	}
 	i := 0
-
 	// need a : followed by a spaces
-	if data[i] != ':' || data[i+1] != ' ' {
+	if data[i] != ':' || !(data[i+1] == ' ' || data[i+1] == '\t') {
 		return 0
 	}
-	for data[i] == ' ' {
+	for i < len(data) && data[i] == ' ' {
 		i++
 	}
 	return i + 2
@@ -1174,6 +1194,9 @@ // Assumes initial prefix is already removed if this is a sublist.
 func (p *parser) listItem(data []byte, flags *ListType) int {
 	// keep track of the indentation of the first line
 	itemIndent := 0
+	if data[itemIndent] == '\t' {
+		itemIndent += 4
+	}
 	for itemIndent < 3 && data[itemIndent] == ' ' {
 		itemIndent++
 	}
@@ -1202,13 +1225,13 @@ }
 	}
 
 	// skip leading whitespace on first line
-	for data[i] == ' ' {
+	for i < len(data) && data[i] == ' ' {
 		i++
 	}
 
 	// find the end of the line
 	line := i
-	for i > 0 && data[i-1] != '\n' {
+	for i > 0 && i < len(data) && data[i-1] != '\n' {
 		i++
 	}
 
@@ -1228,7 +1251,7 @@ for line < len(data) {
 		i++
 
 		// find the end of this line
-		for data[i-1] != '\n' {
+		for i < len(data) && data[i-1] != '\n' {
 			i++
 		}
 
@@ -1242,11 +1265,18 @@ }
 
 		// calculate the indentation
 		indent := 0
-		for indent < 4 && line+indent < i && data[line+indent] == ' ' {
-			indent++
+		indentIndex := 0
+		if data[line] == '\t' {
+			indentIndex++
+			indent += 4
+		} else {
+			for indent < 4 && line+indent < i && data[line+indent] == ' ' {
+				indent++
+				indentIndex++
+			}
 		}
 
-		chunk := data[line+indent : i]
+		chunk := data[line+indentIndex : i]
 
 		// evaluate how this line fits in
 		switch {
@@ -1287,7 +1317,7 @@ case containsBlankLine && indent < 4:
 			if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
 				// is the next item still a part of this list?
 				next := i
-				for data[next] != '\n' {
+				for next < len(data) && data[next] != '\n' {
 					next++
 				}
 				for next < len(data)-1 && data[next] == '\n' {
@@ -1315,7 +1345,7 @@ raw.WriteByte('\n')
 		}
 
 		// add the line into the working buffer without prefix
-		raw.Write(data[line+indent : i])
+		raw.Write(data[line+indentIndex : i])
 
 		line = i
 	}
@@ -1363,8 +1393,11 @@ for data[beg] == ' ' {
 		beg++
 	}
 
+	end := len(data)
 	// trim trailing newline
-	end := len(data) - 1
+	if data[len(data)-1] == '\n' {
+		end--
+	}
 
 	// trim trailing spaces
 	for end > beg && data[end-1] == ' ' {
@@ -1403,7 +1436,8 @@ if n := p.isEmpty(current); n > 0 {
 			// is this blank line followed by a definition list item?
 			if p.flags&DefinitionLists != 0 {
 				if i < len(data)-1 && data[i+1] == ':' {
-					return p.list(data[prev:], ListTypeDefinition)
+					ret := p.list(data[prev:], ListTypeDefinition)
+					return ret
 				}
 			}
 
@@ -1436,7 +1470,7 @@ block.Level = level
 				block.HeaderID = id
 
 				// find the end of the underline
-				for data[i] != '\n' {
+				for i < len(data) && data[i] != '\n' {
 					i++
 				}
 				return i
@@ -1469,7 +1503,8 @@
 		// if there's a definition list item, prev line is a definition term
 		if p.flags&DefinitionLists != 0 {
 			if p.dliPrefix(current) != 0 {
-				return p.list(data[prev:], ListTypeDefinition)
+				ret := p.list(data[prev:], ListTypeDefinition)
+				return ret
 			}
 		}
 
@@ -1485,7 +1520,12 @@ }
 		}
 
 		// otherwise, scan to the beginning of the next line
-		i += bytes.IndexByte(data[i:], '\n') + 1
+		nl := bytes.IndexByte(data[i:], '\n')
+		if nl >= 0 {
+			i += nl + 1
+		} else {
+			i += len(data[i:])
+		}
 	}
 
 	p.renderParagraph(data[:i])

M block_test.go → block_test.go

@@ -1655,14 +1655,14 @@ func TestIsFenceLine(t *testing.T) {
 	tests := []struct {
 		data            []byte
 		syntaxRequested bool
-		newlineOptional bool
 		wantEnd         int
 		wantMarker      string
 		wantSyntax      string
 	}{
 		{
-			data:    []byte("```"),
-			wantEnd: 0,
+			data:       []byte("```"),
+			wantEnd:    3,
+			wantMarker: "```",
 		},
 		{
 			data:       []byte("```\nstuff here\n"),
@@ -1681,21 +1681,13 @@ wantEnd: 0,
 		},
 		{
 			data:            []byte("```"),
-			newlineOptional: true,
-			wantEnd:         3,
-			wantMarker:      "```",
-		},
-		{
-			data:            []byte("```"),
 			syntaxRequested: true,
-			newlineOptional: true,
 			wantEnd:         3,
 			wantMarker:      "```",
 		},
 		{
 			data:            []byte("``` go"),
 			syntaxRequested: true,
-			newlineOptional: true,
 			wantEnd:         6,
 			wantMarker:      "```",
 			wantSyntax:      "go",
@@ -1707,7 +1699,7 @@ var syntax *string
 		if test.syntaxRequested {
 			syntax = new(string)
 		}
-		end, marker := isFenceLine(test.data, syntax, "```", test.newlineOptional)
+		end, marker := isFenceLine(test.data, syntax, "```")
 		if got, want := end, test.wantEnd; got != want {
 			t.Errorf("got end %v, want %v", got, want)
 		}

M inline_test.go → inline_test.go

@@ -1142,7 +1142,7 @@
 func TestSkipHTML(t *testing.T) {
 	doTestsParam(t, []string{
 		"<div class=\"foo\"></div>\n\ntext\n\n<form>the form</form>",
-		"<p>text</p>\n",
+		"<p>text</p>\n\n<p>the form</p>\n",
 
 		"text <em>inline html</em> more text",
 		"<p>text inline html more text</p>\n",

M markdown.go → markdown.go

@@ -387,7 +387,7 @@ if extensions&Footnotes != 0 {
 		p.notes = make([]*reference, 0)
 	}
 
-	p.block(preprocess(p, input))
+	p.block(input)
 	// Walk the tree and finish up some of unfinished blocks
 	for p.tip != nil {
 		p.finalize(p.tip)
@@ -440,63 +440,6 @@ node.content = nil
 		}
 		return GoToNext
 	})
-}
-
-// preprocess does a preparatory first pass over the input:
-// - normalize newlines
-// - expand tabs (outside of fenced code blocks)
-// - copy everything else
-func preprocess(p *parser, input []byte) []byte {
-	var out bytes.Buffer
-	tabSize := TabSizeDefault
-	if p.flags&TabSizeEight != 0 {
-		tabSize = TabSizeDouble
-	}
-	beg := 0
-	lastFencedCodeBlockEnd := 0
-	for beg < len(input) {
-		// Find end of this line, then process the line.
-		end := beg
-		for end < len(input) && input[end] != '\n' && input[end] != '\r' {
-			end++
-		}
-
-		if p.flags&FencedCode != 0 {
-			// track fenced code block boundaries to suppress tab expansion
-			// and reference extraction inside them:
-			if beg >= lastFencedCodeBlockEnd {
-				if i := p.fencedCodeBlock(input[beg:], false); i > 0 {
-					lastFencedCodeBlockEnd = beg + i
-				}
-			}
-		}
-
-		// add the line body if present
-		if end > beg {
-			if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
-				out.Write(input[beg:end])
-			} else {
-				expandTabs(&out, input[beg:end], tabSize)
-			}
-		}
-
-		if end < len(input) && input[end] == '\r' {
-			end++
-		}
-		if end < len(input) && input[end] == '\n' {
-			end++
-		}
-		out.WriteByte('\n')
-
-		beg = end
-	}
-
-	// empty input?
-	if out.Len() == 0 {
-		out.WriteByte('\n')
-	}
-
-	return out.Bytes()
 }
 
 //

M testdata/Inline HTML (Simple).html → testdata/Inline HTML (Simple).html

@@ -1,13 +1,13 @@
 <p>Here's a simple block:</p>
 
 <div>
-    foo
+	foo
 </div>
 
 <p>This should be a code block, though:</p>
 
 <pre><code>&lt;div&gt;
-    foo
+	foo
 &lt;/div&gt;
 </code></pre>
 
@@ -19,11 +19,11 @@
 <p>Now, nested:</p>
 
 <div>
-    <div>
-        <div>
-            foo
-        </div>
-    </div>
+	<div>
+		<div>
+			foo
+		</div>
+	</div>
 </div>
 
 <p>This should just be an HTML comment:</p>

M testdata/Inline HTML comments.html → testdata/Inline HTML comments.html

@@ -3,7 +3,7 @@
 <!-- This is a simple comment -->
 
 <!--
-    This is another comment.
+	This is another comment.
 -->
 
 <p>Paragraph two.</p>

M testdata/Markdown Documentation - Syntax.html → testdata/Markdown Documentation - Syntax.html

@@ -939,8 +939,8 @@ {}  curly braces
 []  square brackets
 ()  parentheses
 #   hash mark
-+   plus sign
--   minus sign (hyphen)
++	plus sign
+-	minus sign (hyphen)
 .   dot
 !   exclamation mark
 </code></pre>

M testdata/Tabs.html → testdata/Tabs.html

@@ -13,13 +13,13 @@ </code></pre>
 
 <p>And:</p>
 
-<pre><code>    this code block is indented by two tabs
+<pre><code>	this code block is indented by two tabs
 </code></pre>
 
 <p>And:</p>
 
-<pre><code>+   this is an example list item
-    indented with tabs
+<pre><code>+	this is an example list item
+	indented with tabs
 
 +   this is an example list item
     indented with spaces

all repos — grayfriday @ 120bb2fae155ccb71b20303046b7cee8e4bbc9c4

blackfriday fork with a few changes