icy does git — grayfriday (9357a8f949cba3a9472fe5baf0202041ac5a46fd): block.go

block.go (view raw)
   1//
   2// Blackfriday Markdown Processor
   3// Available at http://github.com/russross/blackfriday
   4//
   5// Copyright © 2011 Russ Ross <russ@russross.com>.
   6// Distributed under the Simplified BSD License.
   7// See README.md for details.
   8//
   9
  10//
  11// Functions to parse block-level elements.
  12//
  13
  14package blackfriday
  15
  16import (
  17	"bytes"
  18	"html"
  19	"regexp"
  20
  21	"github.com/shurcooL/sanitized_anchor_name"
  22)
  23
  24const (
  25	charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
  26	escapable  = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
  27)
  28
  29var (
  30	reBackslashOrAmp      = regexp.MustCompile("[\\&]")
  31	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity)
  32)
  33
  34// Parse block-level data.
  35// Note: this function and many that it calls assume that
  36// the input buffer ends with a newline.
  37func (p *parser) block(data []byte) {
  38	if len(data) == 0 || data[len(data)-1] != '\n' {
  39		panic("block input is missing terminating newline")
  40	}
  41
  42	// this is called recursively: enforce a maximum depth
  43	if p.nesting >= p.maxNesting {
  44		return
  45	}
  46	p.nesting++
  47
  48	// parse out one block-level construct at a time
  49	for len(data) > 0 {
  50		// prefixed header:
  51		//
  52		// # Header 1
  53		// ## Header 2
  54		// ...
  55		// ###### Header 6
  56		if p.isPrefixHeader(data) {
  57			data = data[p.prefixHeader(data):]
  58			continue
  59		}
  60
  61		// block of preformatted HTML:
  62		//
  63		// <div>
  64		//     ...
  65		// </div>
  66		if data[0] == '<' {
  67			if i := p.html(data, true); i > 0 {
  68				data = data[i:]
  69				continue
  70			}
  71		}
  72
  73		// title block
  74		//
  75		// % stuff
  76		// % more stuff
  77		// % even more stuff
  78		if p.flags&Titleblock != 0 {
  79			if data[0] == '%' {
  80				if i := p.titleBlock(data, true); i > 0 {
  81					data = data[i:]
  82					continue
  83				}
  84			}
  85		}
  86
  87		// blank lines.  note: returns the # of bytes to skip
  88		if i := p.isEmpty(data); i > 0 {
  89			data = data[i:]
  90			continue
  91		}
  92
  93		// indented code block:
  94		//
  95		//     func max(a, b int) int {
  96		//         if a > b {
  97		//             return a
  98		//         }
  99		//         return b
 100		//      }
 101		if p.codePrefix(data) > 0 {
 102			data = data[p.code(data):]
 103			continue
 104		}
 105
 106		// fenced code block:
 107		//
 108		// ``` go
 109		// func fact(n int) int {
 110		//     if n <= 1 {
 111		//         return n
 112		//     }
 113		//     return n * fact(n-1)
 114		// }
 115		// ```
 116		if p.flags&FencedCode != 0 {
 117			if i := p.fencedCodeBlock(data, true); i > 0 {
 118				data = data[i:]
 119				continue
 120			}
 121		}
 122
 123		// horizontal rule:
 124		//
 125		// ------
 126		// or
 127		// ******
 128		// or
 129		// ______
 130		if p.isHRule(data) {
 131			p.addBlock(HorizontalRule, nil)
 132			var i int
 133			for i = 0; data[i] != '\n'; i++ {
 134			}
 135			data = data[i:]
 136			continue
 137		}
 138
 139		// block quote:
 140		//
 141		// > A big quote I found somewhere
 142		// > on the web
 143		if p.quotePrefix(data) > 0 {
 144			data = data[p.quote(data):]
 145			continue
 146		}
 147
 148		// table:
 149		//
 150		// Name  | Age | Phone
 151		// ------|-----|---------
 152		// Bob   | 31  | 555-1234
 153		// Alice | 27  | 555-4321
 154		if p.flags&Tables != 0 {
 155			if i := p.table(data); i > 0 {
 156				data = data[i:]
 157				continue
 158			}
 159		}
 160
 161		// an itemized/unordered list:
 162		//
 163		// * Item 1
 164		// * Item 2
 165		//
 166		// also works with + or -
 167		if p.uliPrefix(data) > 0 {
 168			data = data[p.list(data, 0):]
 169			continue
 170		}
 171
 172		// a numbered/ordered list:
 173		//
 174		// 1. Item 1
 175		// 2. Item 2
 176		if p.oliPrefix(data) > 0 {
 177			data = data[p.list(data, ListTypeOrdered):]
 178			continue
 179		}
 180
 181		// definition lists:
 182		//
 183		// Term 1
 184		// :   Definition a
 185		// :   Definition b
 186		//
 187		// Term 2
 188		// :   Definition c
 189		if p.flags&DefinitionLists != 0 {
 190			if p.dliPrefix(data) > 0 {
 191				data = data[p.list(data, ListTypeDefinition):]
 192				continue
 193			}
 194		}
 195
 196		// anything else must look like a normal paragraph
 197		// note: this finds underlined headers, too
 198		data = data[p.paragraph(data):]
 199	}
 200
 201	p.nesting--
 202}
 203
 204func (p *parser) addBlock(typ NodeType, content []byte) *Node {
 205	p.closeUnmatchedBlocks()
 206	container := p.addChild(typ, 0)
 207	container.content = content
 208	return container
 209}
 210
 211func (p *parser) isPrefixHeader(data []byte) bool {
 212	if data[0] != '#' {
 213		return false
 214	}
 215
 216	if p.flags&SpaceHeaders != 0 {
 217		level := 0
 218		for level < 6 && data[level] == '#' {
 219			level++
 220		}
 221		if data[level] != ' ' {
 222			return false
 223		}
 224	}
 225	return true
 226}
 227
 228func (p *parser) prefixHeader(data []byte) int {
 229	level := 0
 230	for level < 6 && data[level] == '#' {
 231		level++
 232	}
 233	i := skipChar(data, level, ' ')
 234	end := skipUntilChar(data, i, '\n')
 235	skip := end
 236	id := ""
 237	if p.flags&HeaderIDs != 0 {
 238		j, k := 0, 0
 239		// find start/end of header id
 240		for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
 241		}
 242		for k = j + 1; k < end && data[k] != '}'; k++ {
 243		}
 244		// extract header id iff found
 245		if j < end && k < end {
 246			id = string(data[j+2 : k])
 247			end = j
 248			skip = k + 1
 249			for end > 0 && data[end-1] == ' ' {
 250				end--
 251			}
 252		}
 253	}
 254	for end > 0 && data[end-1] == '#' {
 255		if isBackslashEscaped(data, end-1) {
 256			break
 257		}
 258		end--
 259	}
 260	for end > 0 && data[end-1] == ' ' {
 261		end--
 262	}
 263	if end > i {
 264		if id == "" && p.flags&AutoHeaderIDs != 0 {
 265			id = sanitized_anchor_name.Create(string(data[i:end]))
 266		}
 267		block := p.addBlock(Header, data[i:end])
 268		block.HeaderID = id
 269		block.Level = level
 270	}
 271	return skip
 272}
 273
 274func (p *parser) isUnderlinedHeader(data []byte) int {
 275	// test of level 1 header
 276	if data[0] == '=' {
 277		i := skipChar(data, 1, '=')
 278		i = skipChar(data, i, ' ')
 279		if data[i] == '\n' {
 280			return 1
 281		}
 282		return 0
 283	}
 284
 285	// test of level 2 header
 286	if data[0] == '-' {
 287		i := skipChar(data, 1, '-')
 288		i = skipChar(data, i, ' ')
 289		if data[i] == '\n' {
 290			return 2
 291		}
 292		return 0
 293	}
 294
 295	return 0
 296}
 297
 298func (p *parser) titleBlock(data []byte, doRender bool) int {
 299	if data[0] != '%' {
 300		return 0
 301	}
 302	splitData := bytes.Split(data, []byte("\n"))
 303	var i int
 304	for idx, b := range splitData {
 305		if !bytes.HasPrefix(b, []byte("%")) {
 306			i = idx // - 1
 307			break
 308		}
 309	}
 310
 311	data = bytes.Join(splitData[0:i], []byte("\n"))
 312	consumed := len(data)
 313	data = bytes.TrimPrefix(data, []byte("% "))
 314	data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
 315	block := p.addBlock(Header, data)
 316	block.Level = 1
 317	block.IsTitleblock = true
 318
 319	return consumed
 320}
 321
 322func (p *parser) html(data []byte, doRender bool) int {
 323	var i, j int
 324
 325	// identify the opening tag
 326	if data[0] != '<' {
 327		return 0
 328	}
 329	curtag, tagfound := p.htmlFindTag(data[1:])
 330
 331	// handle special cases
 332	if !tagfound {
 333		// check for an HTML comment
 334		if size := p.htmlComment(data, doRender); size > 0 {
 335			return size
 336		}
 337
 338		// check for an <hr> tag
 339		if size := p.htmlHr(data, doRender); size > 0 {
 340			return size
 341		}
 342
 343		// no special case recognized
 344		return 0
 345	}
 346
 347	// look for an unindented matching closing tag
 348	// followed by a blank line
 349	found := false
 350	/*
 351		closetag := []byte("\n</" + curtag + ">")
 352		j = len(curtag) + 1
 353		for !found {
 354			// scan for a closing tag at the beginning of a line
 355			if skip := bytes.Index(data[j:], closetag); skip >= 0 {
 356				j += skip + len(closetag)
 357			} else {
 358				break
 359			}
 360
 361			// see if it is the only thing on the line
 362			if skip := p.isEmpty(data[j:]); skip > 0 {
 363				// see if it is followed by a blank line/eof
 364				j += skip
 365				if j >= len(data) {
 366					found = true
 367					i = j
 368				} else {
 369					if skip := p.isEmpty(data[j:]); skip > 0 {
 370						j += skip
 371						found = true
 372						i = j
 373					}
 374				}
 375			}
 376		}
 377	*/
 378
 379	// if not found, try a second pass looking for indented match
 380	// but not if tag is "ins" or "del" (following original Markdown.pl)
 381	if !found && curtag != "ins" && curtag != "del" {
 382		i = 1
 383		for i < len(data) {
 384			i++
 385			for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
 386				i++
 387			}
 388
 389			if i+2+len(curtag) >= len(data) {
 390				break
 391			}
 392
 393			j = p.htmlFindEnd(curtag, data[i-1:])
 394
 395			if j > 0 {
 396				i += j - 1
 397				found = true
 398				break
 399			}
 400		}
 401	}
 402
 403	if !found {
 404		return 0
 405	}
 406
 407	// the end of the block has been found
 408	if doRender {
 409		// trim newlines
 410		end := i
 411		for end > 0 && data[end-1] == '\n' {
 412			end--
 413		}
 414		finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
 415	}
 416
 417	return i
 418}
 419
 420func finalizeHTMLBlock(block *Node) {
 421	block.Literal = block.content
 422	block.content = nil
 423}
 424
 425// HTML comment, lax form
 426func (p *parser) htmlComment(data []byte, doRender bool) int {
 427	i := p.inlineHTMLComment(data)
 428	// needs to end with a blank line
 429	if j := p.isEmpty(data[i:]); j > 0 {
 430		size := i + j
 431		if doRender {
 432			// trim trailing newlines
 433			end := size
 434			for end > 0 && data[end-1] == '\n' {
 435				end--
 436			}
 437			block := p.addBlock(HTMLBlock, data[:end])
 438			finalizeHTMLBlock(block)
 439		}
 440		return size
 441	}
 442	return 0
 443}
 444
 445// HR, which is the only self-closing block tag considered
 446func (p *parser) htmlHr(data []byte, doRender bool) int {
 447	if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
 448		return 0
 449	}
 450	if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
 451		// not an <hr> tag after all; at least not a valid one
 452		return 0
 453	}
 454
 455	i := 3
 456	for data[i] != '>' && data[i] != '\n' {
 457		i++
 458	}
 459
 460	if data[i] == '>' {
 461		i++
 462		if j := p.isEmpty(data[i:]); j > 0 {
 463			size := i + j
 464			if doRender {
 465				// trim newlines
 466				end := size
 467				for end > 0 && data[end-1] == '\n' {
 468					end--
 469				}
 470				finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
 471			}
 472			return size
 473		}
 474	}
 475
 476	return 0
 477}
 478
 479func (p *parser) htmlFindTag(data []byte) (string, bool) {
 480	i := 0
 481	for isalnum(data[i]) {
 482		i++
 483	}
 484	key := string(data[:i])
 485	if _, ok := blockTags[key]; ok {
 486		return key, true
 487	}
 488	return "", false
 489}
 490
 491func (p *parser) htmlFindEnd(tag string, data []byte) int {
 492	// assume data[0] == '<' && data[1] == '/' already tested
 493	if tag == "hr" {
 494		return 2
 495	}
 496	// check if tag is a match
 497	closetag := []byte("</" + tag + ">")
 498	if !bytes.HasPrefix(data, closetag) {
 499		return 0
 500	}
 501	i := len(closetag)
 502
 503	// check that the rest of the line is blank
 504	skip := 0
 505	if skip = p.isEmpty(data[i:]); skip == 0 {
 506		return 0
 507	}
 508	i += skip
 509	skip = 0
 510
 511	if i >= len(data) {
 512		return i
 513	}
 514
 515	if p.flags&LaxHTMLBlocks != 0 {
 516		return i
 517	}
 518	if skip = p.isEmpty(data[i:]); skip == 0 {
 519		// following line must be blank
 520		return 0
 521	}
 522
 523	return i + skip
 524}
 525
 526func (*parser) isEmpty(data []byte) int {
 527	// it is okay to call isEmpty on an empty buffer
 528	if len(data) == 0 {
 529		return 0
 530	}
 531
 532	var i int
 533	for i = 0; i < len(data) && data[i] != '\n'; i++ {
 534		if data[i] != ' ' && data[i] != '\t' {
 535			return 0
 536		}
 537	}
 538	return i + 1
 539}
 540
 541func (*parser) isHRule(data []byte) bool {
 542	i := 0
 543
 544	// skip up to three spaces
 545	for i < 3 && data[i] == ' ' {
 546		i++
 547	}
 548
 549	// look at the hrule char
 550	if data[i] != '*' && data[i] != '-' && data[i] != '_' {
 551		return false
 552	}
 553	c := data[i]
 554
 555	// the whole line must be the char or whitespace
 556	n := 0
 557	for data[i] != '\n' {
 558		switch {
 559		case data[i] == c:
 560			n++
 561		case data[i] != ' ':
 562			return false
 563		}
 564		i++
 565	}
 566
 567	return n >= 3
 568}
 569
 570// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
 571// and returns the end index if so, or 0 otherwise. It also returns the marker found.
 572// If syntax is not nil, it gets set to the syntax specified in the fence line.
 573// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
 574func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
 575	i, size := 0, 0
 576
 577	// skip up to three spaces
 578	for i < len(data) && i < 3 && data[i] == ' ' {
 579		i++
 580	}
 581
 582	// check for the marker characters: ~ or `
 583	if i >= len(data) {
 584		return 0, ""
 585	}
 586	if data[i] != '~' && data[i] != '`' {
 587		return 0, ""
 588	}
 589
 590	c := data[i]
 591
 592	// the whole line must be the same char or whitespace
 593	for i < len(data) && data[i] == c {
 594		size++
 595		i++
 596	}
 597
 598	// the marker char must occur at least 3 times
 599	if size < 3 {
 600		return 0, ""
 601	}
 602	marker = string(data[i-size : i])
 603
 604	// if this is the end marker, it must match the beginning marker
 605	if oldmarker != "" && marker != oldmarker {
 606		return 0, ""
 607	}
 608
 609	// TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
 610	// into one, always get the syntax, and discard it if the caller doesn't care.
 611	if syntax != nil {
 612		syn := 0
 613		i = skipChar(data, i, ' ')
 614
 615		if i >= len(data) {
 616			if newlineOptional && i == len(data) {
 617				return i, marker
 618			}
 619			return 0, ""
 620		}
 621
 622		syntaxStart := i
 623
 624		if data[i] == '{' {
 625			i++
 626			syntaxStart++
 627
 628			for i < len(data) && data[i] != '}' && data[i] != '\n' {
 629				syn++
 630				i++
 631			}
 632
 633			if i >= len(data) || data[i] != '}' {
 634				return 0, ""
 635			}
 636
 637			// strip all whitespace at the beginning and the end
 638			// of the {} block
 639			for syn > 0 && isspace(data[syntaxStart]) {
 640				syntaxStart++
 641				syn--
 642			}
 643
 644			for syn > 0 && isspace(data[syntaxStart+syn-1]) {
 645				syn--
 646			}
 647
 648			i++
 649		} else {
 650			for i < len(data) && !isspace(data[i]) {
 651				syn++
 652				i++
 653			}
 654		}
 655
 656		*syntax = string(data[syntaxStart : syntaxStart+syn])
 657	}
 658
 659	i = skipChar(data, i, ' ')
 660	if i >= len(data) || data[i] != '\n' {
 661		if newlineOptional && i == len(data) {
 662			return i, marker
 663		}
 664		return 0, ""
 665	}
 666
 667	return i + 1, marker // Take newline into account.
 668}
 669
 670// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
 671// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
 672// If doRender is true, a final newline is mandatory to recognize the fenced code block.
 673func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
 674	var syntax string
 675	beg, marker := isFenceLine(data, &syntax, "", false)
 676	if beg == 0 || beg >= len(data) {
 677		return 0
 678	}
 679
 680	var work bytes.Buffer
 681	work.Write([]byte(syntax))
 682	work.WriteByte('\n')
 683
 684	for {
 685		// safe to assume beg < len(data)
 686
 687		// check for the end of the code block
 688		newlineOptional := !doRender
 689		fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
 690		if fenceEnd != 0 {
 691			beg += fenceEnd
 692			break
 693		}
 694
 695		// copy the current line
 696		end := skipUntilChar(data, beg, '\n') + 1
 697
 698		// did we reach the end of the buffer without a closing marker?
 699		if end >= len(data) {
 700			return 0
 701		}
 702
 703		// verbatim copy to the working buffer
 704		if doRender {
 705			work.Write(data[beg:end])
 706		}
 707		beg = end
 708	}
 709
 710	if doRender {
 711		block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
 712		block.IsFenced = true
 713		finalizeCodeBlock(block)
 714	}
 715
 716	return beg
 717}
 718
 719func unescapeChar(str []byte) []byte {
 720	if str[0] == '\\' {
 721		return []byte{str[1]}
 722	}
 723	return []byte(html.UnescapeString(string(str)))
 724}
 725
 726func unescapeString(str []byte) []byte {
 727	if reBackslashOrAmp.Match(str) {
 728		return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
 729	}
 730	return str
 731}
 732
 733func finalizeCodeBlock(block *Node) {
 734	if block.IsFenced {
 735		newlinePos := bytes.IndexByte(block.content, '\n')
 736		firstLine := block.content[:newlinePos]
 737		rest := block.content[newlinePos+1:]
 738		block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
 739		block.Literal = rest
 740	} else {
 741		block.Literal = block.content
 742	}
 743	block.content = nil
 744}
 745
 746func (p *parser) table(data []byte) int {
 747	table := p.addBlock(Table, nil)
 748	i, columns := p.tableHeader(data)
 749	if i == 0 {
 750		p.tip = table.Parent
 751		table.Unlink()
 752		return 0
 753	}
 754
 755	p.addBlock(TableBody, nil)
 756
 757	for i < len(data) {
 758		pipes, rowStart := 0, i
 759		for ; data[i] != '\n'; i++ {
 760			if data[i] == '|' {
 761				pipes++
 762			}
 763		}
 764
 765		if pipes == 0 {
 766			i = rowStart
 767			break
 768		}
 769
 770		// include the newline in data sent to tableRow
 771		i++
 772		p.tableRow(data[rowStart:i], columns, false)
 773	}
 774
 775	return i
 776}
 777
 778// check if the specified position is preceded by an odd number of backslashes
 779func isBackslashEscaped(data []byte, i int) bool {
 780	backslashes := 0
 781	for i-backslashes-1 >= 0 && data[i-backslashes-1] == '\\' {
 782		backslashes++
 783	}
 784	return backslashes&1 == 1
 785}
 786
 787func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
 788	i := 0
 789	colCount := 1
 790	for i = 0; data[i] != '\n'; i++ {
 791		if data[i] == '|' && !isBackslashEscaped(data, i) {
 792			colCount++
 793		}
 794	}
 795
 796	// doesn't look like a table header
 797	if colCount == 1 {
 798		return
 799	}
 800
 801	// include the newline in the data sent to tableRow
 802	header := data[:i+1]
 803
 804	// column count ignores pipes at beginning or end of line
 805	if data[0] == '|' {
 806		colCount--
 807	}
 808	if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
 809		colCount--
 810	}
 811
 812	columns = make([]CellAlignFlags, colCount)
 813
 814	// move on to the header underline
 815	i++
 816	if i >= len(data) {
 817		return
 818	}
 819
 820	if data[i] == '|' && !isBackslashEscaped(data, i) {
 821		i++
 822	}
 823	i = skipChar(data, i, ' ')
 824
 825	// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
 826	// and trailing | optional on last column
 827	col := 0
 828	for data[i] != '\n' {
 829		dashes := 0
 830
 831		if data[i] == ':' {
 832			i++
 833			columns[col] |= TableAlignmentLeft
 834			dashes++
 835		}
 836		for data[i] == '-' {
 837			i++
 838			dashes++
 839		}
 840		if data[i] == ':' {
 841			i++
 842			columns[col] |= TableAlignmentRight
 843			dashes++
 844		}
 845		for data[i] == ' ' {
 846			i++
 847		}
 848
 849		// end of column test is messy
 850		switch {
 851		case dashes < 3:
 852			// not a valid column
 853			return
 854
 855		case data[i] == '|' && !isBackslashEscaped(data, i):
 856			// marker found, now skip past trailing whitespace
 857			col++
 858			i++
 859			for data[i] == ' ' {
 860				i++
 861			}
 862
 863			// trailing junk found after last column
 864			if col >= colCount && data[i] != '\n' {
 865				return
 866			}
 867
 868		case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
 869			// something else found where marker was required
 870			return
 871
 872		case data[i] == '\n':
 873			// marker is optional for the last column
 874			col++
 875
 876		default:
 877			// trailing junk found after last column
 878			return
 879		}
 880	}
 881	if col != colCount {
 882		return
 883	}
 884
 885	p.addBlock(TableHead, nil)
 886	p.tableRow(header, columns, true)
 887	size = i + 1
 888	return
 889}
 890
 891func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
 892	p.addBlock(TableRow, nil)
 893	i, col := 0, 0
 894
 895	if data[i] == '|' && !isBackslashEscaped(data, i) {
 896		i++
 897	}
 898
 899	for col = 0; col < len(columns) && i < len(data); col++ {
 900		for data[i] == ' ' {
 901			i++
 902		}
 903
 904		cellStart := i
 905
 906		for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
 907			i++
 908		}
 909
 910		cellEnd := i
 911
 912		// skip the end-of-cell marker, possibly taking us past end of buffer
 913		i++
 914
 915		for cellEnd > cellStart && data[cellEnd-1] == ' ' {
 916			cellEnd--
 917		}
 918
 919		cell := p.addBlock(TableCell, data[cellStart:cellEnd])
 920		cell.IsHeader = header
 921		cell.Align = columns[col]
 922	}
 923
 924	// pad it out with empty columns to get the right number
 925	for ; col < len(columns); col++ {
 926		cell := p.addBlock(TableCell, nil)
 927		cell.IsHeader = header
 928		cell.Align = columns[col]
 929	}
 930
 931	// silently ignore rows with too many cells
 932}
 933
 934// returns blockquote prefix length
 935func (p *parser) quotePrefix(data []byte) int {
 936	i := 0
 937	for i < 3 && data[i] == ' ' {
 938		i++
 939	}
 940	if data[i] == '>' {
 941		if data[i+1] == ' ' {
 942			return i + 2
 943		}
 944		return i + 1
 945	}
 946	return 0
 947}
 948
 949// blockquote ends with at least one blank line
 950// followed by something without a blockquote prefix
 951func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
 952	if p.isEmpty(data[beg:]) <= 0 {
 953		return false
 954	}
 955	if end >= len(data) {
 956		return true
 957	}
 958	return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
 959}
 960
 961// parse a blockquote fragment
 962func (p *parser) quote(data []byte) int {
 963	block := p.addBlock(BlockQuote, nil)
 964	var raw bytes.Buffer
 965	beg, end := 0, 0
 966	for beg < len(data) {
 967		end = beg
 968		// Step over whole lines, collecting them. While doing that, check for
 969		// fenced code and if one's found, incorporate it altogether,
 970		// irregardless of any contents inside it
 971		for data[end] != '\n' {
 972			if p.flags&FencedCode != 0 {
 973				if i := p.fencedCodeBlock(data[end:], false); i > 0 {
 974					// -1 to compensate for the extra end++ after the loop:
 975					end += i - 1
 976					break
 977				}
 978			}
 979			end++
 980		}
 981		end++
 982		if pre := p.quotePrefix(data[beg:]); pre > 0 {
 983			// skip the prefix
 984			beg += pre
 985		} else if p.terminateBlockquote(data, beg, end) {
 986			break
 987		}
 988		// this line is part of the blockquote
 989		raw.Write(data[beg:end])
 990		beg = end
 991	}
 992	p.block(raw.Bytes())
 993	p.finalize(block)
 994	return end
 995}
 996
 997// returns prefix length for block code
 998func (p *parser) codePrefix(data []byte) int {
 999	if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1000		return 4
1001	}
1002	return 0
1003}
1004
1005func (p *parser) code(data []byte) int {
1006	var work bytes.Buffer
1007
1008	i := 0
1009	for i < len(data) {
1010		beg := i
1011		for data[i] != '\n' {
1012			i++
1013		}
1014		i++
1015
1016		blankline := p.isEmpty(data[beg:i]) > 0
1017		if pre := p.codePrefix(data[beg:i]); pre > 0 {
1018			beg += pre
1019		} else if !blankline {
1020			// non-empty, non-prefixed line breaks the pre
1021			i = beg
1022			break
1023		}
1024
1025		// verbatim copy to the working buffeu
1026		if blankline {
1027			work.WriteByte('\n')
1028		} else {
1029			work.Write(data[beg:i])
1030		}
1031	}
1032
1033	// trim all the \n off the end of work
1034	workbytes := work.Bytes()
1035	eol := len(workbytes)
1036	for eol > 0 && workbytes[eol-1] == '\n' {
1037		eol--
1038	}
1039	if eol != len(workbytes) {
1040		work.Truncate(eol)
1041	}
1042
1043	work.WriteByte('\n')
1044
1045	block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1046	block.IsFenced = false
1047	finalizeCodeBlock(block)
1048
1049	return i
1050}
1051
1052// returns unordered list item prefix
1053func (p *parser) uliPrefix(data []byte) int {
1054	i := 0
1055
1056	// start with up to 3 spaces
1057	for i < 3 && data[i] == ' ' {
1058		i++
1059	}
1060
1061	// need a *, +, or - followed by a space
1062	if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1063		data[i+1] != ' ' {
1064		return 0
1065	}
1066	return i + 2
1067}
1068
1069// returns ordered list item prefix
1070func (p *parser) oliPrefix(data []byte) int {
1071	i := 0
1072
1073	// start with up to 3 spaces
1074	for i < 3 && data[i] == ' ' {
1075		i++
1076	}
1077
1078	// count the digits
1079	start := i
1080	for data[i] >= '0' && data[i] <= '9' {
1081		i++
1082	}
1083
1084	// we need >= 1 digits followed by a dot and a space
1085	if start == i || data[i] != '.' || data[i+1] != ' ' {
1086		return 0
1087	}
1088	return i + 2
1089}
1090
1091// returns definition list item prefix
1092func (p *parser) dliPrefix(data []byte) int {
1093	i := 0
1094
1095	// need a : followed by a spaces
1096	if data[i] != ':' || data[i+1] != ' ' {
1097		return 0
1098	}
1099	for data[i] == ' ' {
1100		i++
1101	}
1102	return i + 2
1103}
1104
1105// parse ordered or unordered list block
1106func (p *parser) list(data []byte, flags ListType) int {
1107	i := 0
1108	flags |= ListItemBeginningOfList
1109	block := p.addBlock(List, nil)
1110	block.ListFlags = flags
1111	block.Tight = true
1112
1113	for i < len(data) {
1114		skip := p.listItem(data[i:], &flags)
1115		if flags&ListItemContainsBlock != 0 {
1116			block.ListData.Tight = false
1117		}
1118		i += skip
1119		if skip == 0 || flags&ListItemEndOfList != 0 {
1120			break
1121		}
1122		flags &= ^ListItemBeginningOfList
1123	}
1124
1125	above := block.Parent
1126	finalizeList(block)
1127	p.tip = above
1128	return i
1129}
1130
1131// Returns true if block ends with a blank line, descending if needed
1132// into lists and sublists.
1133func endsWithBlankLine(block *Node) bool {
1134	// TODO: figure this out. Always false now.
1135	for block != nil {
1136		//if block.lastLineBlank {
1137		//return true
1138		//}
1139		t := block.Type
1140		if t == List || t == Item {
1141			block = block.LastChild
1142		} else {
1143			break
1144		}
1145	}
1146	return false
1147}
1148
1149func finalizeList(block *Node) {
1150	block.open = false
1151	item := block.FirstChild
1152	for item != nil {
1153		// check for non-final list item ending with blank line:
1154		if endsWithBlankLine(item) && item.Next != nil {
1155			block.ListData.Tight = false
1156			break
1157		}
1158		// recurse into children of list item, to see if there are spaces
1159		// between any of them:
1160		subItem := item.FirstChild
1161		for subItem != nil {
1162			if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1163				block.ListData.Tight = false
1164				break
1165			}
1166			subItem = subItem.Next
1167		}
1168		item = item.Next
1169	}
1170}
1171
1172// Parse a single list item.
1173// Assumes initial prefix is already removed if this is a sublist.
1174func (p *parser) listItem(data []byte, flags *ListType) int {
1175	// keep track of the indentation of the first line
1176	itemIndent := 0
1177	for itemIndent < 3 && data[itemIndent] == ' ' {
1178		itemIndent++
1179	}
1180
1181	var bulletChar byte = '*'
1182	i := p.uliPrefix(data)
1183	if i == 0 {
1184		i = p.oliPrefix(data)
1185	} else {
1186		bulletChar = data[i-2]
1187	}
1188	if i == 0 {
1189		i = p.dliPrefix(data)
1190		// reset definition term flag
1191		if i > 0 {
1192			*flags &= ^ListTypeTerm
1193		}
1194	}
1195	if i == 0 {
1196		// if in definition list, set term flag and continue
1197		if *flags&ListTypeDefinition != 0 {
1198			*flags |= ListTypeTerm
1199		} else {
1200			return 0
1201		}
1202	}
1203
1204	// skip leading whitespace on first line
1205	for data[i] == ' ' {
1206		i++
1207	}
1208
1209	// find the end of the line
1210	line := i
1211	for i > 0 && data[i-1] != '\n' {
1212		i++
1213	}
1214
1215	// get working buffer
1216	var raw bytes.Buffer
1217
1218	// put the first line into the working buffer
1219	raw.Write(data[line:i])
1220	line = i
1221
1222	// process the following lines
1223	containsBlankLine := false
1224	sublist := 0
1225
1226gatherlines:
1227	for line < len(data) {
1228		i++
1229
1230		// find the end of this line
1231		for data[i-1] != '\n' {
1232			i++
1233		}
1234
1235		// if it is an empty line, guess that it is part of this item
1236		// and move on to the next line
1237		if p.isEmpty(data[line:i]) > 0 {
1238			containsBlankLine = true
1239			line = i
1240			continue
1241		}
1242
1243		// calculate the indentation
1244		indent := 0
1245		for indent < 4 && line+indent < i && data[line+indent] == ' ' {
1246			indent++
1247		}
1248
1249		chunk := data[line+indent : i]
1250
1251		// evaluate how this line fits in
1252		switch {
1253		// is this a nested list item?
1254		case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
1255			p.oliPrefix(chunk) > 0 ||
1256			p.dliPrefix(chunk) > 0:
1257
1258			if containsBlankLine {
1259				*flags |= ListItemContainsBlock
1260			}
1261
1262			// to be a nested list, it must be indented more
1263			// if not, it is the next item in the same list
1264			if indent <= itemIndent {
1265				break gatherlines
1266			}
1267
1268			// is this the first item in the nested list?
1269			if sublist == 0 {
1270				sublist = raw.Len()
1271			}
1272
1273		// is this a nested prefix header?
1274		case p.isPrefixHeader(chunk):
1275			// if the header is not indented, it is not nested in the list
1276			// and thus ends the list
1277			if containsBlankLine && indent < 4 {
1278				*flags |= ListItemEndOfList
1279				break gatherlines
1280			}
1281			*flags |= ListItemContainsBlock
1282
1283		// anything following an empty line is only part
1284		// of this item if it is indented 4 spaces
1285		// (regardless of the indentation of the beginning of the item)
1286		case containsBlankLine && indent < 4:
1287			if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
1288				// is the next item still a part of this list?
1289				next := i
1290				for data[next] != '\n' {
1291					next++
1292				}
1293				for next < len(data)-1 && data[next] == '\n' {
1294					next++
1295				}
1296				if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
1297					*flags |= ListItemEndOfList
1298				}
1299			} else {
1300				*flags |= ListItemEndOfList
1301			}
1302			break gatherlines
1303
1304		// a blank line means this should be parsed as a block
1305		case containsBlankLine:
1306			raw.WriteByte('\n')
1307			*flags |= ListItemContainsBlock
1308		}
1309
1310		// if this line was preceded by one or more blanks,
1311		// re-introduce the blank into the buffer
1312		if containsBlankLine {
1313			containsBlankLine = false
1314			raw.WriteByte('\n')
1315		}
1316
1317		// add the line into the working buffer without prefix
1318		raw.Write(data[line+indent : i])
1319
1320		line = i
1321	}
1322
1323	rawBytes := raw.Bytes()
1324
1325	block := p.addBlock(Item, nil)
1326	block.ListFlags = *flags
1327	block.Tight = false
1328	block.BulletChar = bulletChar
1329	block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark
1330
1331	// render the contents of the list item
1332	if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
1333		// intermediate render of block item, except for definition term
1334		if sublist > 0 {
1335			p.block(rawBytes[:sublist])
1336			p.block(rawBytes[sublist:])
1337		} else {
1338			p.block(rawBytes)
1339		}
1340	} else {
1341		// intermediate render of inline item
1342		if sublist > 0 {
1343			child := p.addChild(Paragraph, 0)
1344			child.content = rawBytes[:sublist]
1345			p.block(rawBytes[sublist:])
1346		} else {
1347			child := p.addChild(Paragraph, 0)
1348			child.content = rawBytes
1349		}
1350	}
1351	return line
1352}
1353
1354// render a single paragraph that has already been parsed out
1355func (p *parser) renderParagraph(data []byte) {
1356	if len(data) == 0 {
1357		return
1358	}
1359
1360	// trim leading spaces
1361	beg := 0
1362	for data[beg] == ' ' {
1363		beg++
1364	}
1365
1366	// trim trailing newline
1367	end := len(data) - 1
1368
1369	// trim trailing spaces
1370	for end > beg && data[end-1] == ' ' {
1371		end--
1372	}
1373
1374	p.addBlock(Paragraph, data[beg:end])
1375}
1376
1377func (p *parser) paragraph(data []byte) int {
1378	// prev: index of 1st char of previous line
1379	// line: index of 1st char of current line
1380	// i: index of cursor/end of current line
1381	var prev, line, i int
1382	tabSize := TabSizeDefault
1383	if p.flags&TabSizeEight != 0 {
1384		tabSize = TabSizeDouble
1385	}
1386	// keep going until we find something to mark the end of the paragraph
1387	for i < len(data) {
1388		// mark the beginning of the current line
1389		prev = line
1390		current := data[i:]
1391		line = i
1392
1393		// did we find a reference or a footnote? If so, end a paragraph
1394		// preceding it and report that we have consumed up to the end of that
1395		// reference:
1396		if refEnd := isReference(p, current, tabSize); refEnd > 0 {
1397			p.renderParagraph(data[:i])
1398			return i + refEnd
1399		}
1400
1401		// did we find a blank line marking the end of the paragraph?
1402		if n := p.isEmpty(current); n > 0 {
1403			// did this blank line followed by a definition list item?
1404			if p.flags&DefinitionLists != 0 {
1405				if i < len(data)-1 && data[i+1] == ':' {
1406					return p.list(data[prev:], ListTypeDefinition)
1407				}
1408			}
1409
1410			p.renderParagraph(data[:i])
1411			return i + n
1412		}
1413
1414		// an underline under some text marks a header, so our paragraph ended on prev line
1415		if i > 0 {
1416			if level := p.isUnderlinedHeader(current); level > 0 {
1417				// render the paragraph
1418				p.renderParagraph(data[:prev])
1419
1420				// ignore leading and trailing whitespace
1421				eol := i - 1
1422				for prev < eol && data[prev] == ' ' {
1423					prev++
1424				}
1425				for eol > prev && data[eol-1] == ' ' {
1426					eol--
1427				}
1428
1429				id := ""
1430				if p.flags&AutoHeaderIDs != 0 {
1431					id = sanitized_anchor_name.Create(string(data[prev:eol]))
1432				}
1433
1434				block := p.addBlock(Header, data[prev:eol])
1435				block.Level = level
1436				block.HeaderID = id
1437
1438				// find the end of the underline
1439				for data[i] != '\n' {
1440					i++
1441				}
1442				return i
1443			}
1444		}
1445
1446		// if the next line starts a block of HTML, then the paragraph ends here
1447		if p.flags&LaxHTMLBlocks != 0 {
1448			if data[i] == '<' && p.html(current, false) > 0 {
1449				// rewind to before the HTML block
1450				p.renderParagraph(data[:i])
1451				return i
1452			}
1453		}
1454
1455		// if there's a prefixed header or a horizontal rule after this, paragraph is over
1456		if p.isPrefixHeader(current) || p.isHRule(current) {
1457			p.renderParagraph(data[:i])
1458			return i
1459		}
1460
1461		// if there's a fenced code block, paragraph is over
1462		if p.flags&FencedCode != 0 {
1463			if p.fencedCodeBlock(current, false) > 0 {
1464				p.renderParagraph(data[:i])
1465				return i
1466			}
1467		}
1468
1469		// if there's a definition list item, prev line is a definition term
1470		if p.flags&DefinitionLists != 0 {
1471			if p.dliPrefix(current) != 0 {
1472				return p.list(data[prev:], ListTypeDefinition)
1473			}
1474		}
1475
1476		// if there's a list after this, paragraph is over
1477		if p.flags&NoEmptyLineBeforeBlock != 0 {
1478			if p.uliPrefix(current) != 0 ||
1479				p.oliPrefix(current) != 0 ||
1480				p.quotePrefix(current) != 0 ||
1481				p.codePrefix(current) != 0 {
1482				p.renderParagraph(data[:i])
1483				return i
1484			}
1485		}
1486
1487		// otherwise, scan to the beginning of the next line
1488		i += bytes.IndexByte(data[i:], '\n') + 1
1489	}
1490
1491	p.renderParagraph(data[:i])
1492	return i
1493}
1494
1495func skipChar(data []byte, start int, char byte) int {
1496	i := start
1497	for i < len(data) && data[i] == char {
1498		i++
1499	}
1500	return i
1501}
1502
1503func skipUntilChar(text []byte, start int, char byte) int {
1504	i := start
1505	for i < len(text) && text[i] != char {
1506		i++
1507	}
1508	return i
1509}
all repos — grayfriday @ 9357a8f949cba3a9472fe5baf0202041ac5a46fd

blackfriday fork with a few changes