icy does git — grayfriday (91753e8bc7f0f5b54d9f62667940d359bc18d052): block.go

block.go (view raw)
   1//
   2// Blackfriday Markdown Processor
   3// Available at http://github.com/russross/blackfriday
   4//
   5// Copyright © 2011 Russ Ross <russ@russross.com>.
   6// Distributed under the Simplified BSD License.
   7// See README.md for details.
   8//
   9
  10//
  11// Functions to parse block-level elements.
  12//
  13
  14package blackfriday
  15
  16import (
  17	"bytes"
  18	"html"
  19	"regexp"
  20
  21	"github.com/shurcooL/sanitized_anchor_name"
  22)
  23
  24const (
  25	charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
  26	escapable  = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
  27)
  28
  29var (
  30	reBackslashOrAmp      = regexp.MustCompile("[\\&]")
  31	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity)
  32	reTrailingWhitespace  = regexp.MustCompile("(\n *)+$")
  33)
  34
  35// Parse block-level data.
  36// Note: this function and many that it calls assume that
  37// the input buffer ends with a newline.
  38func (p *parser) block(data []byte) {
  39	if len(data) == 0 || data[len(data)-1] != '\n' {
  40		panic("block input is missing terminating newline")
  41	}
  42
  43	// this is called recursively: enforce a maximum depth
  44	if p.nesting >= p.maxNesting {
  45		return
  46	}
  47	p.nesting++
  48
  49	// parse out one block-level construct at a time
  50	for len(data) > 0 {
  51		// prefixed header:
  52		//
  53		// # Header 1
  54		// ## Header 2
  55		// ...
  56		// ###### Header 6
  57		if p.isPrefixHeader(data) {
  58			data = data[p.prefixHeader(data):]
  59			continue
  60		}
  61
  62		// block of preformatted HTML:
  63		//
  64		// <div>
  65		//     ...
  66		// </div>
  67		if data[0] == '<' {
  68			if i := p.html(data, true); i > 0 {
  69				data = data[i:]
  70				continue
  71			}
  72		}
  73
  74		// title block
  75		//
  76		// % stuff
  77		// % more stuff
  78		// % even more stuff
  79		if p.flags&Titleblock != 0 {
  80			if data[0] == '%' {
  81				if i := p.titleBlock(data, true); i > 0 {
  82					data = data[i:]
  83					continue
  84				}
  85			}
  86		}
  87
  88		// blank lines.  note: returns the # of bytes to skip
  89		if i := p.isEmpty(data); i > 0 {
  90			data = data[i:]
  91			continue
  92		}
  93
  94		// indented code block:
  95		//
  96		//     func max(a, b int) int {
  97		//         if a > b {
  98		//             return a
  99		//         }
 100		//         return b
 101		//      }
 102		if p.codePrefix(data) > 0 {
 103			data = data[p.code(data):]
 104			continue
 105		}
 106
 107		// fenced code block:
 108		//
 109		// ``` go
 110		// func fact(n int) int {
 111		//     if n <= 1 {
 112		//         return n
 113		//     }
 114		//     return n * fact(n-1)
 115		// }
 116		// ```
 117		if p.flags&FencedCode != 0 {
 118			if i := p.fencedCodeBlock(data, true); i > 0 {
 119				data = data[i:]
 120				continue
 121			}
 122		}
 123
 124		// horizontal rule:
 125		//
 126		// ------
 127		// or
 128		// ******
 129		// or
 130		// ______
 131		if p.isHRule(data) {
 132			p.addBlock(HorizontalRule, nil)
 133			var i int
 134			for i = 0; data[i] != '\n'; i++ {
 135			}
 136			data = data[i:]
 137			continue
 138		}
 139
 140		// block quote:
 141		//
 142		// > A big quote I found somewhere
 143		// > on the web
 144		if p.quotePrefix(data) > 0 {
 145			data = data[p.quote(data):]
 146			continue
 147		}
 148
 149		// table:
 150		//
 151		// Name  | Age | Phone
 152		// ------|-----|---------
 153		// Bob   | 31  | 555-1234
 154		// Alice | 27  | 555-4321
 155		if p.flags&Tables != 0 {
 156			if i := p.table(data); i > 0 {
 157				data = data[i:]
 158				continue
 159			}
 160		}
 161
 162		// an itemized/unordered list:
 163		//
 164		// * Item 1
 165		// * Item 2
 166		//
 167		// also works with + or -
 168		if p.uliPrefix(data) > 0 {
 169			data = data[p.list(data, 0):]
 170			continue
 171		}
 172
 173		// a numbered/ordered list:
 174		//
 175		// 1. Item 1
 176		// 2. Item 2
 177		if p.oliPrefix(data) > 0 {
 178			data = data[p.list(data, ListTypeOrdered):]
 179			continue
 180		}
 181
 182		// definition lists:
 183		//
 184		// Term 1
 185		// :   Definition a
 186		// :   Definition b
 187		//
 188		// Term 2
 189		// :   Definition c
 190		if p.flags&DefinitionLists != 0 {
 191			if p.dliPrefix(data) > 0 {
 192				data = data[p.list(data, ListTypeDefinition):]
 193				continue
 194			}
 195		}
 196
 197		// anything else must look like a normal paragraph
 198		// note: this finds underlined headers, too
 199		data = data[p.paragraph(data):]
 200	}
 201
 202	p.nesting--
 203}
 204
 205func (p *parser) addBlock(typ NodeType, content []byte) *Node {
 206	p.closeUnmatchedBlocks()
 207	container := p.addChild(typ, 0)
 208	container.content = content
 209	return container
 210}
 211
 212func (p *parser) isPrefixHeader(data []byte) bool {
 213	if data[0] != '#' {
 214		return false
 215	}
 216
 217	if p.flags&SpaceHeaders != 0 {
 218		level := 0
 219		for level < 6 && data[level] == '#' {
 220			level++
 221		}
 222		if data[level] != ' ' {
 223			return false
 224		}
 225	}
 226	return true
 227}
 228
 229func (p *parser) prefixHeader(data []byte) int {
 230	level := 0
 231	for level < 6 && data[level] == '#' {
 232		level++
 233	}
 234	i := skipChar(data, level, ' ')
 235	end := skipUntilChar(data, i, '\n')
 236	skip := end
 237	id := ""
 238	if p.flags&HeaderIDs != 0 {
 239		j, k := 0, 0
 240		// find start/end of header id
 241		for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
 242		}
 243		for k = j + 1; k < end && data[k] != '}'; k++ {
 244		}
 245		// extract header id iff found
 246		if j < end && k < end {
 247			id = string(data[j+2 : k])
 248			end = j
 249			skip = k + 1
 250			for end > 0 && data[end-1] == ' ' {
 251				end--
 252			}
 253		}
 254	}
 255	for end > 0 && data[end-1] == '#' {
 256		if isBackslashEscaped(data, end-1) {
 257			break
 258		}
 259		end--
 260	}
 261	for end > 0 && data[end-1] == ' ' {
 262		end--
 263	}
 264	if end > i {
 265		if id == "" && p.flags&AutoHeaderIDs != 0 {
 266			id = sanitized_anchor_name.Create(string(data[i:end]))
 267		}
 268		block := p.addBlock(Header, data[i:end])
 269		block.HeaderID = id
 270		block.Level = level
 271	}
 272	return skip
 273}
 274
 275func (p *parser) isUnderlinedHeader(data []byte) int {
 276	// test of level 1 header
 277	if data[0] == '=' {
 278		i := skipChar(data, 1, '=')
 279		i = skipChar(data, i, ' ')
 280		if data[i] == '\n' {
 281			return 1
 282		}
 283		return 0
 284	}
 285
 286	// test of level 2 header
 287	if data[0] == '-' {
 288		i := skipChar(data, 1, '-')
 289		i = skipChar(data, i, ' ')
 290		if data[i] == '\n' {
 291			return 2
 292		}
 293		return 0
 294	}
 295
 296	return 0
 297}
 298
 299func (p *parser) titleBlock(data []byte, doRender bool) int {
 300	if data[0] != '%' {
 301		return 0
 302	}
 303	splitData := bytes.Split(data, []byte("\n"))
 304	var i int
 305	for idx, b := range splitData {
 306		if !bytes.HasPrefix(b, []byte("%")) {
 307			i = idx // - 1
 308			break
 309		}
 310	}
 311
 312	data = bytes.Join(splitData[0:i], []byte("\n"))
 313	consumed := len(data)
 314	data = bytes.TrimPrefix(data, []byte("% "))
 315	data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
 316	block := p.addBlock(Header, data)
 317	block.Level = 1
 318	block.IsTitleblock = true
 319
 320	return consumed
 321}
 322
// html tries to parse a block of preformatted HTML at the start of data.
//
// It returns the number of bytes consumed, or 0 when data does not start
// with a recognized HTML block. When doRender is true and a block is found,
// the matched text (without trailing newlines) is added as an HTMLBlock node.
func (p *parser) html(data []byte, doRender bool) int {
	var i, j int

	// identify the opening tag
	if data[0] != '<' {
		return 0
	}
	curtag, tagfound := p.htmlFindTag(data[1:])

	// handle special cases
	if !tagfound {
		// check for an HTML comment
		if size := p.htmlComment(data, doRender); size > 0 {
			return size
		}

		// check for an <hr> tag
		if size := p.htmlHr(data, doRender); size > 0 {
			return size
		}

		// no special case recognized
		return 0
	}

	// look for an unindented matching closing tag
	// followed by a blank line
	//
	// NOTE: this first pass is currently disabled (commented out below), so
	// found is always false when the second pass is reached.
	found := false
	/*
		closetag := []byte("\n</" + curtag + ">")
		j = len(curtag) + 1
		for !found {
			// scan for a closing tag at the beginning of a line
			if skip := bytes.Index(data[j:], closetag); skip >= 0 {
				j += skip + len(closetag)
			} else {
				break
			}

			// see if it is the only thing on the line
			if skip := p.isEmpty(data[j:]); skip > 0 {
				// see if it is followed by a blank line/eof
				j += skip
				if j >= len(data) {
					found = true
					i = j
				} else {
					if skip := p.isEmpty(data[j:]); skip > 0 {
						j += skip
						found = true
						i = j
					}
				}
			}
		}
	*/

	// if not found, try a second pass looking for indented match
	// but not if tag is "ins" or "del" (following original Markdown.pl)
	if !found && curtag != "ins" && curtag != "del" {
		i = 1
		for i < len(data) {
			// advance until data[i-1:i+1] is "</", i.e. i sits on the '/'
			i++
			for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
				i++
			}

			// stop when too little input remains for "/" + tag + ">"
			if i+2+len(curtag) >= len(data) {
				break
			}

			// j is measured from the '<' at data[i-1]; 0 means no match here
			j = p.htmlFindEnd(curtag, data[i-1:])

			if j > 0 {
				// move i to the end of the match (data[i-1] was its start)
				i += j - 1
				found = true
				break
			}
		}
	}

	if !found {
		return 0
	}

	// the end of the block has been found
	if doRender {
		// trim newlines
		end := i
		for end > 0 && data[end-1] == '\n' {
			end--
		}
		finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
	}

	return i
}
 420
 421func finalizeHTMLBlock(block *Node) {
 422	block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{})
 423	block.content = []byte{}
 424}
 425
 426// HTML comment, lax form
 427func (p *parser) htmlComment(data []byte, doRender bool) int {
 428	i := p.inlineHTMLComment(data)
 429	// needs to end with a blank line
 430	if j := p.isEmpty(data[i:]); j > 0 {
 431		size := i + j
 432		if doRender {
 433			// trim trailing newlines
 434			end := size
 435			for end > 0 && data[end-1] == '\n' {
 436				end--
 437			}
 438			block := p.addBlock(HTMLBlock, data[:end])
 439			finalizeHTMLBlock(block)
 440		}
 441		return size
 442	}
 443	return 0
 444}
 445
 446// HR, which is the only self-closing block tag considered
 447func (p *parser) htmlHr(data []byte, doRender bool) int {
 448	if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
 449		return 0
 450	}
 451	if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
 452		// not an <hr> tag after all; at least not a valid one
 453		return 0
 454	}
 455
 456	i := 3
 457	for data[i] != '>' && data[i] != '\n' {
 458		i++
 459	}
 460
 461	if data[i] == '>' {
 462		i++
 463		if j := p.isEmpty(data[i:]); j > 0 {
 464			size := i + j
 465			if doRender {
 466				// trim newlines
 467				end := size
 468				for end > 0 && data[end-1] == '\n' {
 469					end--
 470				}
 471				finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
 472			}
 473			return size
 474		}
 475	}
 476
 477	return 0
 478}
 479
 480func (p *parser) htmlFindTag(data []byte) (string, bool) {
 481	i := 0
 482	for isalnum(data[i]) {
 483		i++
 484	}
 485	key := string(data[:i])
 486	if _, ok := blockTags[key]; ok {
 487		return key, true
 488	}
 489	return "", false
 490}
 491
 492func (p *parser) htmlFindEnd(tag string, data []byte) int {
 493	// assume data[0] == '<' && data[1] == '/' already tested
 494	if tag == "hr" {
 495		return 2
 496	}
 497	// check if tag is a match
 498	closetag := []byte("</" + tag + ">")
 499	if !bytes.HasPrefix(data, closetag) {
 500		return 0
 501	}
 502	i := len(closetag)
 503
 504	// check that the rest of the line is blank
 505	skip := 0
 506	if skip = p.isEmpty(data[i:]); skip == 0 {
 507		return 0
 508	}
 509	i += skip
 510	skip = 0
 511
 512	if i >= len(data) {
 513		return i
 514	}
 515
 516	if p.flags&LaxHTMLBlocks != 0 {
 517		return i
 518	}
 519	if skip = p.isEmpty(data[i:]); skip == 0 {
 520		// following line must be blank
 521		return 0
 522	}
 523
 524	return i + skip
 525}
 526
 527func (*parser) isEmpty(data []byte) int {
 528	// it is okay to call isEmpty on an empty buffer
 529	if len(data) == 0 {
 530		return 0
 531	}
 532
 533	var i int
 534	for i = 0; i < len(data) && data[i] != '\n'; i++ {
 535		if data[i] != ' ' && data[i] != '\t' {
 536			return 0
 537		}
 538	}
 539	return i + 1
 540}
 541
 542func (*parser) isHRule(data []byte) bool {
 543	i := 0
 544
 545	// skip up to three spaces
 546	for i < 3 && data[i] == ' ' {
 547		i++
 548	}
 549
 550	// look at the hrule char
 551	if data[i] != '*' && data[i] != '-' && data[i] != '_' {
 552		return false
 553	}
 554	c := data[i]
 555
 556	// the whole line must be the char or whitespace
 557	n := 0
 558	for data[i] != '\n' {
 559		switch {
 560		case data[i] == c:
 561			n++
 562		case data[i] != ' ':
 563			return false
 564		}
 565		i++
 566	}
 567
 568	return n >= 3
 569}
 570
// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
// and returns the end index if so, or 0 otherwise. It also returns the marker found.
// If syntax is not nil, it gets set to the syntax specified in the fence line.
// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
//
// If oldmarker is non-empty, the line is treated as a closing fence and its
// marker must be identical to oldmarker. On failure the returned marker is "".
func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
	// i is the read cursor; size counts the marker characters
	i, size := 0, 0

	// skip up to three spaces
	for i < len(data) && i < 3 && data[i] == ' ' {
		i++
	}

	// check for the marker characters: ~ or `
	if i >= len(data) {
		return 0, ""
	}
	if data[i] != '~' && data[i] != '`' {
		return 0, ""
	}

	c := data[i]

	// the whole line must be the same char or whitespace
	for i < len(data) && data[i] == c {
		size++
		i++
	}

	// the marker char must occur at least 3 times
	if size < 3 {
		return 0, ""
	}
	marker = string(data[i-size : i])

	// if this is the end marker, it must match the beginning marker
	if oldmarker != "" && marker != oldmarker {
		return 0, ""
	}

	// TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
	// into one, always get the syntax, and discard it if the caller doesn't care.
	if syntax != nil {
		// syn is the length of the syntax string starting at syntaxStart
		syn := 0
		i = skipChar(data, i, ' ')

		if i >= len(data) {
			// input ended right after the marker (and optional spaces)
			if newlineOptional && i == len(data) {
				return i, marker
			}
			return 0, ""
		}

		syntaxStart := i

		if data[i] == '{' {
			// braced form: the syntax is everything between '{' and '}'
			i++
			syntaxStart++

			for i < len(data) && data[i] != '}' && data[i] != '\n' {
				syn++
				i++
			}

			// the closing brace must be on the same line
			if i >= len(data) || data[i] != '}' {
				return 0, ""
			}

			// strip all whitespace at the beginning and the end
			// of the {} block
			for syn > 0 && isspace(data[syntaxStart]) {
				syntaxStart++
				syn--
			}

			for syn > 0 && isspace(data[syntaxStart+syn-1]) {
				syn--
			}

			// step over the closing '}'
			i++
		} else {
			// bare form: the syntax runs up to the next whitespace
			for i < len(data) && !isspace(data[i]) {
				syn++
				i++
			}
		}

		*syntax = string(data[syntaxStart : syntaxStart+syn])
	}

	// only spaces may remain before the end of the line
	i = skipChar(data, i, ' ')
	if i >= len(data) || data[i] != '\n' {
		if newlineOptional && i == len(data) {
			return i, marker
		}
		return 0, ""
	}

	return i + 1, marker // Take newline into account.
}
 670
 671// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
 672// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
 673// If doRender is true, a final newline is mandatory to recognize the fenced code block.
 674func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
 675	var syntax string
 676	beg, marker := isFenceLine(data, &syntax, "", false)
 677	if beg == 0 || beg >= len(data) {
 678		return 0
 679	}
 680
 681	var work bytes.Buffer
 682	work.Write([]byte(syntax))
 683	work.WriteByte('\n')
 684
 685	for {
 686		// safe to assume beg < len(data)
 687
 688		// check for the end of the code block
 689		newlineOptional := !doRender
 690		fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
 691		if fenceEnd != 0 {
 692			beg += fenceEnd
 693			break
 694		}
 695
 696		// copy the current line
 697		end := skipUntilChar(data, beg, '\n') + 1
 698
 699		// did we reach the end of the buffer without a closing marker?
 700		if end >= len(data) {
 701			return 0
 702		}
 703
 704		// verbatim copy to the working buffer
 705		if doRender {
 706			work.Write(data[beg:end])
 707		}
 708		beg = end
 709	}
 710
 711	if doRender {
 712		block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
 713		block.IsFenced = true
 714		finalizeCodeBlock(block)
 715	}
 716
 717	return beg
 718}
 719
 720func unescapeChar(str []byte) []byte {
 721	if str[0] == '\\' {
 722		return []byte{str[1]}
 723	}
 724	return []byte(html.UnescapeString(string(str)))
 725}
 726
 727func unescapeString(str []byte) []byte {
 728	if reBackslashOrAmp.Match(str) {
 729		return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
 730	}
 731	return str
 732}
 733
 734func finalizeCodeBlock(block *Node) {
 735	if block.IsFenced {
 736		newlinePos := bytes.IndexByte(block.content, '\n')
 737		firstLine := block.content[:newlinePos]
 738		rest := block.content[newlinePos+1:]
 739		block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
 740		block.Literal = rest
 741	} else {
 742		block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{'\n'})
 743	}
 744	block.content = nil
 745}
 746
 747func (p *parser) table(data []byte) int {
 748	table := p.addBlock(Table, nil)
 749	i, columns := p.tableHeader(data)
 750	if i == 0 {
 751		p.tip = table.Parent
 752		table.Unlink()
 753		return 0
 754	}
 755
 756	p.addBlock(TableBody, nil)
 757
 758	for i < len(data) {
 759		pipes, rowStart := 0, i
 760		for ; data[i] != '\n'; i++ {
 761			if data[i] == '|' {
 762				pipes++
 763			}
 764		}
 765
 766		if pipes == 0 {
 767			i = rowStart
 768			break
 769		}
 770
 771		// include the newline in data sent to tableRow
 772		i++
 773		p.tableRow(data[rowStart:i], columns, false)
 774	}
 775
 776	return i
 777}
 778
 779// check if the specified position is preceded by an odd number of backslashes
 780func isBackslashEscaped(data []byte, i int) bool {
 781	backslashes := 0
 782	for i-backslashes-1 >= 0 && data[i-backslashes-1] == '\\' {
 783		backslashes++
 784	}
 785	return backslashes&1 == 1
 786}
 787
// tableHeader parses the two lines that start a table: the header row and
// the underline row that follows it.
//
// On success it adds a TableHead node, emits the header row through
// tableRow, and returns the number of bytes consumed (both lines including
// their newlines) together with the per-column alignment flags taken from
// the underline row. On failure size is 0; columns may still be non-nil,
// since it is allocated before the underline row is validated.
func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
	i := 0
	// colCount starts at 1: n unescaped pipes separate n+1 columns
	colCount := 1
	for i = 0; data[i] != '\n'; i++ {
		if data[i] == '|' && !isBackslashEscaped(data, i) {
			colCount++
		}
	}

	// doesn't look like a table header
	if colCount == 1 {
		return
	}

	// include the newline in the data sent to tableRow
	header := data[:i+1]

	// column count ignores pipes at beginning or end of line
	if data[0] == '|' {
		colCount--
	}
	if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
		colCount--
	}

	columns = make([]CellAlignFlags, colCount)

	// move on to the header underline
	i++
	if i >= len(data) {
		return
	}

	// an optional leading pipe and spaces before the first column
	if data[i] == '|' && !isBackslashEscaped(data, i) {
		i++
	}
	i = skipChar(data, i, ' ')

	// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
	// and trailing | optional on last column
	col := 0
	for data[i] != '\n' {
		// dashes counts the colons as well as the dashes of this column
		dashes := 0

		// a leading colon requests left alignment
		if data[i] == ':' {
			i++
			columns[col] |= TableAlignmentLeft
			dashes++
		}
		for data[i] == '-' {
			i++
			dashes++
		}
		// a trailing colon requests right alignment
		if data[i] == ':' {
			i++
			columns[col] |= TableAlignmentRight
			dashes++
		}
		for data[i] == ' ' {
			i++
		}

		// end of column test is messy
		switch {
		case dashes < 3:
			// not a valid column
			return

		case data[i] == '|' && !isBackslashEscaped(data, i):
			// marker found, now skip past trailing whitespace
			col++
			i++
			for data[i] == ' ' {
				i++
			}

			// trailing junk found after last column
			if col >= colCount && data[i] != '\n' {
				return
			}

		case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
			// something else found where marker was required
			return

		case data[i] == '\n':
			// marker is optional for the last column
			col++

		default:
			// trailing junk found after last column
			return
		}
	}
	// the underline must describe exactly as many columns as the header row
	if col != colCount {
		return
	}

	p.addBlock(TableHead, nil)
	p.tableRow(header, columns, true)
	// i is at the underline's newline; consume it as well
	size = i + 1
	return
}
 891
 892func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
 893	p.addBlock(TableRow, nil)
 894	i, col := 0, 0
 895
 896	if data[i] == '|' && !isBackslashEscaped(data, i) {
 897		i++
 898	}
 899
 900	for col = 0; col < len(columns) && i < len(data); col++ {
 901		for data[i] == ' ' {
 902			i++
 903		}
 904
 905		cellStart := i
 906
 907		for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
 908			i++
 909		}
 910
 911		cellEnd := i
 912
 913		// skip the end-of-cell marker, possibly taking us past end of buffer
 914		i++
 915
 916		for cellEnd > cellStart && data[cellEnd-1] == ' ' {
 917			cellEnd--
 918		}
 919
 920		cell := p.addBlock(TableCell, data[cellStart:cellEnd])
 921		cell.IsHeader = header
 922		cell.Align = columns[col]
 923	}
 924
 925	// pad it out with empty columns to get the right number
 926	for ; col < len(columns); col++ {
 927		cell := p.addBlock(TableCell, nil)
 928		cell.IsHeader = header
 929		cell.Align = columns[col]
 930	}
 931
 932	// silently ignore rows with too many cells
 933}
 934
 935// returns blockquote prefix length
 936func (p *parser) quotePrefix(data []byte) int {
 937	i := 0
 938	for i < 3 && data[i] == ' ' {
 939		i++
 940	}
 941	if data[i] == '>' {
 942		if data[i+1] == ' ' {
 943			return i + 2
 944		}
 945		return i + 1
 946	}
 947	return 0
 948}
 949
 950// blockquote ends with at least one blank line
 951// followed by something without a blockquote prefix
 952func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
 953	if p.isEmpty(data[beg:]) <= 0 {
 954		return false
 955	}
 956	if end >= len(data) {
 957		return true
 958	}
 959	return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
 960}
 961
 962// parse a blockquote fragment
 963func (p *parser) quote(data []byte) int {
 964	block := p.addBlock(BlockQuote, nil)
 965	var raw bytes.Buffer
 966	beg, end := 0, 0
 967	for beg < len(data) {
 968		end = beg
 969		// Step over whole lines, collecting them. While doing that, check for
 970		// fenced code and if one's found, incorporate it altogether,
 971		// irregardless of any contents inside it
 972		for data[end] != '\n' {
 973			if p.flags&FencedCode != 0 {
 974				if i := p.fencedCodeBlock(data[end:], false); i > 0 {
 975					// -1 to compensate for the extra end++ after the loop:
 976					end += i - 1
 977					break
 978				}
 979			}
 980			end++
 981		}
 982		end++
 983		if pre := p.quotePrefix(data[beg:]); pre > 0 {
 984			// skip the prefix
 985			beg += pre
 986		} else if p.terminateBlockquote(data, beg, end) {
 987			break
 988		}
 989		// this line is part of the blockquote
 990		raw.Write(data[beg:end])
 991		beg = end
 992	}
 993	p.block(raw.Bytes())
 994	p.finalize(block)
 995	return end
 996}
 997
 998// returns prefix length for block code
 999func (p *parser) codePrefix(data []byte) int {
1000	if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1001		return 4
1002	}
1003	return 0
1004}
1005
1006func (p *parser) code(data []byte) int {
1007	var work bytes.Buffer
1008
1009	i := 0
1010	for i < len(data) {
1011		beg := i
1012		for data[i] != '\n' {
1013			i++
1014		}
1015		i++
1016
1017		blankline := p.isEmpty(data[beg:i]) > 0
1018		if pre := p.codePrefix(data[beg:i]); pre > 0 {
1019			beg += pre
1020		} else if !blankline {
1021			// non-empty, non-prefixed line breaks the pre
1022			i = beg
1023			break
1024		}
1025
1026		// verbatim copy to the working buffeu
1027		if blankline {
1028			work.WriteByte('\n')
1029		} else {
1030			work.Write(data[beg:i])
1031		}
1032	}
1033
1034	// trim all the \n off the end of work
1035	workbytes := work.Bytes()
1036	eol := len(workbytes)
1037	for eol > 0 && workbytes[eol-1] == '\n' {
1038		eol--
1039	}
1040	if eol != len(workbytes) {
1041		work.Truncate(eol)
1042	}
1043
1044	work.WriteByte('\n')
1045
1046	block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1047	block.IsFenced = false
1048	finalizeCodeBlock(block)
1049
1050	return i
1051}
1052
1053// returns unordered list item prefix
1054func (p *parser) uliPrefix(data []byte) int {
1055	i := 0
1056
1057	// start with up to 3 spaces
1058	for i < 3 && data[i] == ' ' {
1059		i++
1060	}
1061
1062	// need a *, +, or - followed by a space
1063	if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1064		data[i+1] != ' ' {
1065		return 0
1066	}
1067	return i + 2
1068}
1069
1070// returns ordered list item prefix
1071func (p *parser) oliPrefix(data []byte) int {
1072	i := 0
1073
1074	// start with up to 3 spaces
1075	for i < 3 && data[i] == ' ' {
1076		i++
1077	}
1078
1079	// count the digits
1080	start := i
1081	for data[i] >= '0' && data[i] <= '9' {
1082		i++
1083	}
1084
1085	// we need >= 1 digits followed by a dot and a space
1086	if start == i || data[i] != '.' || data[i+1] != ' ' {
1087		return 0
1088	}
1089	return i + 2
1090}
1091
1092// returns definition list item prefix
1093func (p *parser) dliPrefix(data []byte) int {
1094	i := 0
1095
1096	// need a : followed by a spaces
1097	if data[i] != ':' || data[i+1] != ' ' {
1098		return 0
1099	}
1100	for data[i] == ' ' {
1101		i++
1102	}
1103	return i + 2
1104}
1105
1106// parse ordered or unordered list block
1107func (p *parser) list(data []byte, flags ListType) int {
1108	i := 0
1109	flags |= ListItemBeginningOfList
1110	block := p.addBlock(List, nil)
1111	block.ListFlags = flags
1112	block.Tight = true
1113
1114	for i < len(data) {
1115		skip := p.listItem(data[i:], &flags)
1116		if flags&ListItemContainsBlock != 0 {
1117			block.ListData.Tight = false
1118		}
1119		i += skip
1120		if skip == 0 || flags&ListItemEndOfList != 0 {
1121			break
1122		}
1123		flags &= ^ListItemBeginningOfList
1124	}
1125
1126	above := block.Parent
1127	finalizeList(block)
1128	p.tip = above
1129	return i
1130}
1131
1132// Returns true if block ends with a blank line, descending if needed
1133// into lists and sublists.
1134func endsWithBlankLine(block *Node) bool {
1135	// TODO: figure this out. Always false now.
1136	for block != nil {
1137		//if block.lastLineBlank {
1138		//return true
1139		//}
1140		t := block.Type
1141		if t == List || t == Item {
1142			block = block.LastChild
1143		} else {
1144			break
1145		}
1146	}
1147	return false
1148}
1149
1150func finalizeList(block *Node) {
1151	block.open = false
1152	item := block.FirstChild
1153	for item != nil {
1154		// check for non-final list item ending with blank line:
1155		if endsWithBlankLine(item) && item.Next != nil {
1156			block.ListData.Tight = false
1157			break
1158		}
1159		// recurse into children of list item, to see if there are spaces
1160		// between any of them:
1161		subItem := item.FirstChild
1162		for subItem != nil {
1163			if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1164				block.ListData.Tight = false
1165				break
1166			}
1167			subItem = subItem.Next
1168		}
1169		item = item.Next
1170	}
1171}
1172
// Parse a single list item.
// Assumes initial prefix is already removed if this is a sublist.
//
// listItem gathers the lines belonging to one item into a working buffer,
// adds an Item node and parses the buffer as its content. It returns the
// number of bytes of data consumed, or 0 when data does not start with a
// list item (and the list is not a definition list).
//
// flags is both input and output: ListTypeTerm is set or cleared here for
// definition lists, and ListItemContainsBlock / ListItemEndOfList are set to
// tell the caller about the item's content and whether the list ends.
func (p *parser) listItem(data []byte, flags *ListType) int {
	// keep track of the indentation of the first line
	itemIndent := 0
	for itemIndent < 3 && data[itemIndent] == ' ' {
		itemIndent++
	}

	// determine the item prefix: unordered, then ordered, then definition;
	// bulletChar defaults to '*' when the prefix is not an unordered bullet
	var bulletChar byte = '*'
	i := p.uliPrefix(data)
	if i == 0 {
		i = p.oliPrefix(data)
	} else {
		bulletChar = data[i-2]
	}
	if i == 0 {
		i = p.dliPrefix(data)
		// reset definition term flag
		if i > 0 {
			*flags &= ^ListTypeTerm
		}
	}
	if i == 0 {
		// if in definition list, set term flag and continue
		if *flags&ListTypeDefinition != 0 {
			*flags |= ListTypeTerm
		} else {
			return 0
		}
	}

	// skip leading whitespace on first line
	for data[i] == ' ' {
		i++
	}

	// find the end of the line
	// (when i is still 0 here, the loop does not run and the first line is
	// picked up by the gathering loop below instead)
	line := i
	for i > 0 && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[line:i])
	line = i

	// process the following lines
	containsBlankLine := false
	// sublist is the offset in raw at which the first nested list item
	// starts; 0 means no nested list has been seen
	sublist := 0

gatherlines:
	for line < len(data) {
		i++

		// find the end of this line
		for data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[line:i]) > 0 {
			containsBlankLine = true
			line = i
			continue
		}

		// calculate the indentation
		// (capped at four spaces)
		indent := 0
		for indent < 4 && line+indent < i && data[line+indent] == ' ' {
			indent++
		}

		// the current line without its (capped) indentation
		chunk := data[line+indent : i]

		// evaluate how this line fits in
		switch {
		// is this a nested list item?
		case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
			p.oliPrefix(chunk) > 0 ||
			p.dliPrefix(chunk) > 0:

			if containsBlankLine {
				*flags |= ListItemContainsBlock
			}

			// to be a nested list, it must be indented more
			// if not, it is the next item in the same list
			if indent <= itemIndent {
				break gatherlines
			}

			// is this the first item in the nested list?
			if sublist == 0 {
				sublist = raw.Len()
			}

		// is this a nested prefix header?
		case p.isPrefixHeader(chunk):
			// if the header is not indented, it is not nested in the list
			// and thus ends the list
			if containsBlankLine && indent < 4 {
				*flags |= ListItemEndOfList
				break gatherlines
			}
			*flags |= ListItemContainsBlock

		// anything following an empty line is only part
		// of this item if it is indented 4 spaces
		// (regardless of the indentation of the beginning of the item)
		case containsBlankLine && indent < 4:
			if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
				// is the next item still a part of this list?
				// (next is moved to the end of the line after the current
				// one and then over the newlines that follow it)
				next := i
				for data[next] != '\n' {
					next++
				}
				for next < len(data)-1 && data[next] == '\n' {
					next++
				}
				// the list ends unless one of those positions holds a ':'
				if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
					*flags |= ListItemEndOfList
				}
			} else {
				*flags |= ListItemEndOfList
			}
			break gatherlines

		// a blank line means this should be parsed as a block
		case containsBlankLine:
			raw.WriteByte('\n')
			*flags |= ListItemContainsBlock
		}

		// if this line was preceded by one or more blanks,
		// re-introduce the blank into the buffer
		if containsBlankLine {
			containsBlankLine = false
			raw.WriteByte('\n')
		}

		// add the line into the working buffer without prefix
		raw.Write(data[line+indent : i])

		line = i
	}

	rawBytes := raw.Bytes()

	block := p.addBlock(Item, nil)
	block.ListFlags = *flags
	block.Tight = false
	block.BulletChar = bulletChar
	block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark

	// render the contents of the list item
	if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
		// intermediate render of block item, except for definition term
		// (text before a nested list and the nested list itself are parsed
		// as separate block sequences)
		if sublist > 0 {
			p.block(rawBytes[:sublist])
			p.block(rawBytes[sublist:])
		} else {
			p.block(rawBytes)
		}
	} else {
		// intermediate render of inline item
		// (the item text becomes a single Paragraph child; only a nested
		// list, if any, is parsed as block content)
		if sublist > 0 {
			child := p.addChild(Paragraph, 0)
			child.content = rawBytes[:sublist]
			p.block(rawBytes[sublist:])
		} else {
			child := p.addChild(Paragraph, 0)
			child.content = rawBytes
		}
	}
	// line is the offset of the first line that was not consumed
	return line
}
1354
1355// render a single paragraph that has already been parsed out
1356func (p *parser) renderParagraph(data []byte) {
1357	if len(data) == 0 {
1358		return
1359	}
1360
1361	// trim leading spaces
1362	beg := 0
1363	for data[beg] == ' ' {
1364		beg++
1365	}
1366
1367	// trim trailing newline
1368	end := len(data) - 1
1369
1370	// trim trailing spaces
1371	for end > beg && data[end-1] == ' ' {
1372		end--
1373	}
1374
1375	p.addBlock(Paragraph, data[beg:end])
1376}
1377
1378func (p *parser) paragraph(data []byte) int {
1379	// prev: index of 1st char of previous line
1380	// line: index of 1st char of current line
1381	// i: index of cursor/end of current line
1382	var prev, line, i int
1383
1384	// keep going until we find something to mark the end of the paragraph
1385	for i < len(data) {
1386		// mark the beginning of the current line
1387		prev = line
1388		current := data[i:]
1389		line = i
1390
1391		// did we find a blank line marking the end of the paragraph?
1392		if n := p.isEmpty(current); n > 0 {
1393			// did this blank line followed by a definition list item?
1394			if p.flags&DefinitionLists != 0 {
1395				if i < len(data)-1 && data[i+1] == ':' {
1396					return p.list(data[prev:], ListTypeDefinition)
1397				}
1398			}
1399
1400			p.renderParagraph(data[:i])
1401			return i + n
1402		}
1403
1404		// an underline under some text marks a header, so our paragraph ended on prev line
1405		if i > 0 {
1406			if level := p.isUnderlinedHeader(current); level > 0 {
1407				// render the paragraph
1408				p.renderParagraph(data[:prev])
1409
1410				// ignore leading and trailing whitespace
1411				eol := i - 1
1412				for prev < eol && data[prev] == ' ' {
1413					prev++
1414				}
1415				for eol > prev && data[eol-1] == ' ' {
1416					eol--
1417				}
1418
1419				id := ""
1420				if p.flags&AutoHeaderIDs != 0 {
1421					id = sanitized_anchor_name.Create(string(data[prev:eol]))
1422				}
1423
1424				block := p.addBlock(Header, data[prev:eol])
1425				block.Level = level
1426				block.HeaderID = id
1427
1428				// find the end of the underline
1429				for data[i] != '\n' {
1430					i++
1431				}
1432				return i
1433			}
1434		}
1435
1436		// if the next line starts a block of HTML, then the paragraph ends here
1437		if p.flags&LaxHTMLBlocks != 0 {
1438			if data[i] == '<' && p.html(current, false) > 0 {
1439				// rewind to before the HTML block
1440				p.renderParagraph(data[:i])
1441				return i
1442			}
1443		}
1444
1445		// if there's a prefixed header or a horizontal rule after this, paragraph is over
1446		if p.isPrefixHeader(current) || p.isHRule(current) {
1447			p.renderParagraph(data[:i])
1448			return i
1449		}
1450
1451		// if there's a fenced code block, paragraph is over
1452		if p.flags&FencedCode != 0 {
1453			if p.fencedCodeBlock(current, false) > 0 {
1454				p.renderParagraph(data[:i])
1455				return i
1456			}
1457		}
1458
1459		// if there's a definition list item, prev line is a definition term
1460		if p.flags&DefinitionLists != 0 {
1461			if p.dliPrefix(current) != 0 {
1462				return p.list(data[prev:], ListTypeDefinition)
1463			}
1464		}
1465
1466		// if there's a list after this, paragraph is over
1467		if p.flags&NoEmptyLineBeforeBlock != 0 {
1468			if p.uliPrefix(current) != 0 ||
1469				p.oliPrefix(current) != 0 ||
1470				p.quotePrefix(current) != 0 ||
1471				p.codePrefix(current) != 0 {
1472				p.renderParagraph(data[:i])
1473				return i
1474			}
1475		}
1476
1477		// otherwise, scan to the beginning of the next line
1478		for data[i] != '\n' {
1479			i++
1480		}
1481		i++
1482	}
1483
1484	p.renderParagraph(data[:i])
1485	return i
1486}
1487
// skipChar returns the index of the first byte at or after start that is not
// equal to char. If every remaining byte equals char, or start is already at
// or beyond the end of data, the scan stops without finding one and the
// position reached is returned.
func skipChar(data []byte, start int, char byte) int {
	pos := start
	for ; pos < len(data); pos++ {
		if data[pos] != char {
			break
		}
	}
	return pos
}
1495
// skipUntilChar returns the index of the first occurrence of char in text at
// or after start. If char does not occur, the position at which the scan
// stopped is returned: len(text), or start itself when start is already at or
// beyond the end of text.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for pos < len(text) {
		if text[pos] == char {
			return pos
		}
		pos++
	}
	return pos
}
all repos — grayfriday @ 91753e8bc7f0f5b54d9f62667940d359bc18d052

blackfriday fork with a few changes