all repos — grayfriday @ d96905437ea043766135ee4c5cd4d32366547288

blackfriday fork with a few changes

block.go (view raw)

   1//
   2// Blackfriday Markdown Processor
   3// Available at http://github.com/russross/blackfriday
   4//
   5// Copyright © 2011 Russ Ross <russ@russross.com>.
   6// Distributed under the Simplified BSD License.
   7// See README.md for details.
   8//
   9
  10//
  11// Functions to parse block-level elements.
  12//
  13
  14package blackfriday
  15
  16import (
  17	"bytes"
  18	"html"
  19	"regexp"
  20	"strings"
  21	"unicode"
  22)
  23
  24const (
  25	charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
  26	escapable  = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
  27)
  28
  29var (
  30	reBackslashOrAmp      = regexp.MustCompile("[\\&]")
  31	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity)
  32)
  33
  34// Parse block-level data.
  35// Note: this function and many that it calls assume that
  36// the input buffer ends with a newline.
  37func (p *Markdown) block(data []byte) {
  38	// this is called recursively: enforce a maximum depth
  39	if p.nesting >= p.maxNesting {
  40		return
  41	}
  42	p.nesting++
  43
  44	// parse out one block-level construct at a time
  45	for len(data) > 0 {
  46		// prefixed heading:
  47		//
  48		// # Heading 1
  49		// ## Heading 2
  50		// ...
  51		// ###### Heading 6
  52		if p.isPrefixHeading(data) {
  53			data = data[p.prefixHeading(data):]
  54			continue
  55		}
  56
  57		// block of preformatted HTML:
  58		//
  59		// <div>
  60		//     ...
  61		// </div>
  62		if data[0] == '<' {
  63			if i := p.html(data, true); i > 0 {
  64				data = data[i:]
  65				continue
  66			}
  67		}
  68
  69		// title block
  70		//
  71		// % stuff
  72		// % more stuff
  73		// % even more stuff
  74		if p.extensions&Titleblock != 0 {
  75			if data[0] == '%' {
  76				if i := p.titleBlock(data, true); i > 0 {
  77					data = data[i:]
  78					continue
  79				}
  80			}
  81		}
  82
  83		// blank lines.  note: returns the # of bytes to skip
  84		if i := p.isEmpty(data); i > 0 {
  85			data = data[i:]
  86			continue
  87		}
  88
  89		// indented code block:
  90		//
  91		//     func max(a, b int) int {
  92		//         if a > b {
  93		//             return a
  94		//         }
  95		//         return b
  96		//      }
  97		if p.codePrefix(data) > 0 {
  98			data = data[p.code(data):]
  99			continue
 100		}
 101
 102		// fenced code block:
 103		//
 104		// ``` go
 105		// func fact(n int) int {
 106		//     if n <= 1 {
 107		//         return n
 108		//     }
 109		//     return n * fact(n-1)
 110		// }
 111		// ```
 112		if p.extensions&FencedCode != 0 {
 113			if i := p.fencedCodeBlock(data, true); i > 0 {
 114				data = data[i:]
 115				continue
 116			}
 117		}
 118
 119		// horizontal rule:
 120		//
 121		// ------
 122		// or
 123		// ******
 124		// or
 125		// ______
 126		if p.isHRule(data) {
 127			p.addBlock(HorizontalRule, nil)
 128			var i int
 129			for i = 0; i < len(data) && data[i] != '\n'; i++ {
 130			}
 131			data = data[i:]
 132			continue
 133		}
 134
 135		// block quote:
 136		//
 137		// > A big quote I found somewhere
 138		// > on the web
 139		if p.quotePrefix(data) > 0 {
 140			data = data[p.quote(data):]
 141			continue
 142		}
 143
 144		// table:
 145		//
 146		// Name  | Age | Phone
 147		// ------|-----|---------
 148		// Bob   | 31  | 555-1234
 149		// Alice | 27  | 555-4321
 150		if p.extensions&Tables != 0 {
 151			if i := p.table(data); i > 0 {
 152				data = data[i:]
 153				continue
 154			}
 155		}
 156
 157		// an itemized/unordered list:
 158		//
 159		// * Item 1
 160		// * Item 2
 161		//
 162		// also works with + or -
 163		if p.uliPrefix(data) > 0 {
 164			data = data[p.list(data, 0):]
 165			continue
 166		}
 167
 168		// a numbered/ordered list:
 169		//
 170		// 1. Item 1
 171		// 2. Item 2
 172		if p.oliPrefix(data) > 0 {
 173			data = data[p.list(data, ListTypeOrdered):]
 174			continue
 175		}
 176
 177		// definition lists:
 178		//
 179		// Term 1
 180		// :   Definition a
 181		// :   Definition b
 182		//
 183		// Term 2
 184		// :   Definition c
 185		if p.extensions&DefinitionLists != 0 {
 186			if p.dliPrefix(data) > 0 {
 187				data = data[p.list(data, ListTypeDefinition):]
 188				continue
 189			}
 190		}
 191
 192		// anything else must look like a normal paragraph
 193		// note: this finds underlined headings, too
 194		data = data[p.paragraph(data):]
 195	}
 196
 197	p.nesting--
 198}
 199
 200func (p *Markdown) addBlock(typ NodeType, content []byte) *Node {
 201	p.closeUnmatchedBlocks()
 202	container := p.addChild(typ, 0)
 203	container.content = content
 204	return container
 205}
 206
 207func (p *Markdown) isPrefixHeading(data []byte) bool {
 208	if data[0] != '#' {
 209		return false
 210	}
 211
 212	if p.extensions&SpaceHeadings != 0 {
 213		level := 0
 214		for level < 6 && level < len(data) && data[level] == '#' {
 215			level++
 216		}
 217		if level == len(data) || data[level] != ' ' {
 218			return false
 219		}
 220	}
 221	return true
 222}
 223
 224func (p *Markdown) prefixHeading(data []byte) int {
 225	level := 0
 226	for level < 6 && level < len(data) && data[level] == '#' {
 227		level++
 228	}
 229	i := skipChar(data, level, ' ')
 230	end := skipUntilChar(data, i, '\n')
 231	skip := end
 232	id := ""
 233	if p.extensions&HeadingIDs != 0 {
 234		j, k := 0, 0
 235		// find start/end of heading id
 236		for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
 237		}
 238		for k = j + 1; k < end && data[k] != '}'; k++ {
 239		}
 240		// extract heading id iff found
 241		if j < end && k < end {
 242			id = string(data[j+2 : k])
 243			end = j
 244			skip = k + 1
 245			for end > 0 && data[end-1] == ' ' {
 246				end--
 247			}
 248		}
 249	}
 250	for end > 0 && data[end-1] == '#' {
 251		if isBackslashEscaped(data, end-1) {
 252			break
 253		}
 254		end--
 255	}
 256	for end > 0 && data[end-1] == ' ' {
 257		end--
 258	}
 259	if end > i {
 260		if id == "" && p.extensions&AutoHeadingIDs != 0 {
 261			id = SanitizedAnchorName(string(data[i:end]))
 262		}
 263		block := p.addBlock(Heading, data[i:end])
 264		block.HeadingID = id
 265		block.Level = level
 266	}
 267	return skip
 268}
 269
 270func (p *Markdown) isUnderlinedHeading(data []byte) int {
 271	// test of level 1 heading
 272	if data[0] == '=' {
 273		i := skipChar(data, 1, '=')
 274		i = skipChar(data, i, ' ')
 275		if i < len(data) && data[i] == '\n' {
 276			return 1
 277		}
 278		return 0
 279	}
 280
 281	// test of level 2 heading
 282	if data[0] == '-' {
 283		i := skipChar(data, 1, '-')
 284		i = skipChar(data, i, ' ')
 285		if i < len(data) && data[i] == '\n' {
 286			return 2
 287		}
 288		return 0
 289	}
 290
 291	return 0
 292}
 293
 294func (p *Markdown) titleBlock(data []byte, doRender bool) int {
 295	if data[0] != '%' {
 296		return 0
 297	}
 298	splitData := bytes.Split(data, []byte("\n"))
 299	var i int
 300	for idx, b := range splitData {
 301		if !bytes.HasPrefix(b, []byte("%")) {
 302			i = idx // - 1
 303			break
 304		}
 305	}
 306
 307	data = bytes.Join(splitData[0:i], []byte("\n"))
 308	consumed := len(data)
 309	data = bytes.TrimPrefix(data, []byte("% "))
 310	data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
 311	block := p.addBlock(Heading, data)
 312	block.Level = 1
 313	block.IsTitleblock = true
 314
 315	return consumed
 316}
 317
 318func (p *Markdown) html(data []byte, doRender bool) int {
 319	var i, j int
 320
 321	// identify the opening tag
 322	if data[0] != '<' {
 323		return 0
 324	}
 325	curtag, tagfound := p.htmlFindTag(data[1:])
 326
 327	// handle special cases
 328	if !tagfound {
 329		// check for an HTML comment
 330		if size := p.htmlComment(data, doRender); size > 0 {
 331			return size
 332		}
 333
 334		// check for an <hr> tag
 335		if size := p.htmlHr(data, doRender); size > 0 {
 336			return size
 337		}
 338
 339		// no special case recognized
 340		return 0
 341	}
 342
 343	// look for an unindented matching closing tag
 344	// followed by a blank line
 345	found := false
 346	/*
 347		closetag := []byte("\n</" + curtag + ">")
 348		j = len(curtag) + 1
 349		for !found {
 350			// scan for a closing tag at the beginning of a line
 351			if skip := bytes.Index(data[j:], closetag); skip >= 0 {
 352				j += skip + len(closetag)
 353			} else {
 354				break
 355			}
 356
 357			// see if it is the only thing on the line
 358			if skip := p.isEmpty(data[j:]); skip > 0 {
 359				// see if it is followed by a blank line/eof
 360				j += skip
 361				if j >= len(data) {
 362					found = true
 363					i = j
 364				} else {
 365					if skip := p.isEmpty(data[j:]); skip > 0 {
 366						j += skip
 367						found = true
 368						i = j
 369					}
 370				}
 371			}
 372		}
 373	*/
 374
 375	// if not found, try a second pass looking for indented match
 376	// but not if tag is "ins" or "del" (following original Markdown.pl)
 377	if !found && curtag != "ins" && curtag != "del" {
 378		i = 1
 379		for i < len(data) {
 380			i++
 381			for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
 382				i++
 383			}
 384
 385			if i+2+len(curtag) >= len(data) {
 386				break
 387			}
 388
 389			j = p.htmlFindEnd(curtag, data[i-1:])
 390
 391			if j > 0 {
 392				i += j - 1
 393				found = true
 394				break
 395			}
 396		}
 397	}
 398
 399	if !found {
 400		return 0
 401	}
 402
 403	// the end of the block has been found
 404	if doRender {
 405		// trim newlines
 406		end := i
 407		for end > 0 && data[end-1] == '\n' {
 408			end--
 409		}
 410		finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
 411	}
 412
 413	return i
 414}
 415
 416func finalizeHTMLBlock(block *Node) {
 417	block.Literal = block.content
 418	block.content = nil
 419}
 420
 421// HTML comment, lax form
 422func (p *Markdown) htmlComment(data []byte, doRender bool) int {
 423	i := p.inlineHTMLComment(data)
 424	// needs to end with a blank line
 425	if j := p.isEmpty(data[i:]); j > 0 {
 426		size := i + j
 427		if doRender {
 428			// trim trailing newlines
 429			end := size
 430			for end > 0 && data[end-1] == '\n' {
 431				end--
 432			}
 433			block := p.addBlock(HTMLBlock, data[:end])
 434			finalizeHTMLBlock(block)
 435		}
 436		return size
 437	}
 438	return 0
 439}
 440
 441// HR, which is the only self-closing block tag considered
 442func (p *Markdown) htmlHr(data []byte, doRender bool) int {
 443	if len(data) < 4 {
 444		return 0
 445	}
 446	if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
 447		return 0
 448	}
 449	if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
 450		// not an <hr> tag after all; at least not a valid one
 451		return 0
 452	}
 453	i := 3
 454	for i < len(data) && data[i] != '>' && data[i] != '\n' {
 455		i++
 456	}
 457	if i < len(data) && data[i] == '>' {
 458		i++
 459		if j := p.isEmpty(data[i:]); j > 0 {
 460			size := i + j
 461			if doRender {
 462				// trim newlines
 463				end := size
 464				for end > 0 && data[end-1] == '\n' {
 465					end--
 466				}
 467				finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
 468			}
 469			return size
 470		}
 471	}
 472	return 0
 473}
 474
 475func (p *Markdown) htmlFindTag(data []byte) (string, bool) {
 476	i := 0
 477	for i < len(data) && isalnum(data[i]) {
 478		i++
 479	}
 480	key := string(data[:i])
 481	if _, ok := blockTags[key]; ok {
 482		return key, true
 483	}
 484	return "", false
 485}
 486
 487func (p *Markdown) htmlFindEnd(tag string, data []byte) int {
 488	// assume data[0] == '<' && data[1] == '/' already tested
 489	if tag == "hr" {
 490		return 2
 491	}
 492	// check if tag is a match
 493	closetag := []byte("</" + tag + ">")
 494	if !bytes.HasPrefix(data, closetag) {
 495		return 0
 496	}
 497	i := len(closetag)
 498
 499	// check that the rest of the line is blank
 500	skip := 0
 501	if skip = p.isEmpty(data[i:]); skip == 0 {
 502		return 0
 503	}
 504	i += skip
 505	skip = 0
 506
 507	if i >= len(data) {
 508		return i
 509	}
 510
 511	if p.extensions&LaxHTMLBlocks != 0 {
 512		return i
 513	}
 514	if skip = p.isEmpty(data[i:]); skip == 0 {
 515		// following line must be blank
 516		return 0
 517	}
 518
 519	return i + skip
 520}
 521
 522func (*Markdown) isEmpty(data []byte) int {
 523	// it is okay to call isEmpty on an empty buffer
 524	if len(data) == 0 {
 525		return 0
 526	}
 527
 528	var i int
 529	for i = 0; i < len(data) && data[i] != '\n'; i++ {
 530		if data[i] != ' ' && data[i] != '\t' {
 531			return 0
 532		}
 533	}
 534	if i < len(data) && data[i] == '\n' {
 535		i++
 536	}
 537	return i
 538}
 539
 540func (*Markdown) isHRule(data []byte) bool {
 541	i := 0
 542
 543	// skip up to three spaces
 544	for i < 3 && data[i] == ' ' {
 545		i++
 546	}
 547
 548	// look at the hrule char
 549	if data[i] != '*' && data[i] != '-' && data[i] != '_' {
 550		return false
 551	}
 552	c := data[i]
 553
 554	// the whole line must be the char or whitespace
 555	n := 0
 556	for i < len(data) && data[i] != '\n' {
 557		switch {
 558		case data[i] == c:
 559			n++
 560		case data[i] != ' ':
 561			return false
 562		}
 563		i++
 564	}
 565
 566	return n >= 3
 567}
 568
 569// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
 570// and returns the end index if so, or 0 otherwise. It also returns the marker found.
 571// If info is not nil, it gets set to the syntax specified in the fence line.
 572func isFenceLine(data []byte, info *string, oldmarker string) (end int, marker string) {
 573	i, size := 0, 0
 574
 575	// skip up to three spaces
 576	for i < len(data) && i < 3 && data[i] == ' ' {
 577		i++
 578	}
 579
 580	// check for the marker characters: ~ or `
 581	if i >= len(data) {
 582		return 0, ""
 583	}
 584	if data[i] != '~' && data[i] != '`' {
 585		return 0, ""
 586	}
 587
 588	c := data[i]
 589
 590	// the whole line must be the same char or whitespace
 591	for i < len(data) && data[i] == c {
 592		size++
 593		i++
 594	}
 595
 596	// the marker char must occur at least 3 times
 597	if size < 3 {
 598		return 0, ""
 599	}
 600	marker = string(data[i-size : i])
 601
 602	// if this is the end marker, it must match the beginning marker
 603	if oldmarker != "" && marker != oldmarker {
 604		return 0, ""
 605	}
 606
 607	// TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
 608	// into one, always get the info string, and discard it if the caller doesn't care.
 609	if info != nil {
 610		infoLength := 0
 611		i = skipChar(data, i, ' ')
 612
 613		if i >= len(data) {
 614			if i == len(data) {
 615				return i, marker
 616			}
 617			return 0, ""
 618		}
 619
 620		infoStart := i
 621
 622		if data[i] == '{' {
 623			i++
 624			infoStart++
 625
 626			for i < len(data) && data[i] != '}' && data[i] != '\n' {
 627				infoLength++
 628				i++
 629			}
 630
 631			if i >= len(data) || data[i] != '}' {
 632				return 0, ""
 633			}
 634
 635			// strip all whitespace at the beginning and the end
 636			// of the {} block
 637			for infoLength > 0 && isspace(data[infoStart]) {
 638				infoStart++
 639				infoLength--
 640			}
 641
 642			for infoLength > 0 && isspace(data[infoStart+infoLength-1]) {
 643				infoLength--
 644			}
 645			i++
 646			i = skipChar(data, i, ' ')
 647		} else {
 648			for i < len(data) && !isverticalspace(data[i]) {
 649				infoLength++
 650				i++
 651			}
 652		}
 653
 654		*info = strings.TrimSpace(string(data[infoStart : infoStart+infoLength]))
 655	}
 656
 657	if i == len(data) {
 658		return i, marker
 659	}
 660	if i > len(data) || data[i] != '\n' {
 661		return 0, ""
 662	}
 663	return i + 1, marker // Take newline into account.
 664}
 665
 666// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
 667// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
 668// If doRender is true, a final newline is mandatory to recognize the fenced code block.
 669func (p *Markdown) fencedCodeBlock(data []byte, doRender bool) int {
 670	var info string
 671	beg, marker := isFenceLine(data, &info, "")
 672	if beg == 0 || beg >= len(data) {
 673		return 0
 674	}
 675	fenceLength := beg - 1
 676
 677	var work bytes.Buffer
 678	work.Write([]byte(info))
 679	work.WriteByte('\n')
 680
 681	for {
 682		// safe to assume beg < len(data)
 683
 684		// check for the end of the code block
 685		fenceEnd, _ := isFenceLine(data[beg:], nil, marker)
 686		if fenceEnd != 0 {
 687			beg += fenceEnd
 688			break
 689		}
 690
 691		// copy the current line
 692		end := skipUntilChar(data, beg, '\n') + 1
 693
 694		// did we reach the end of the buffer without a closing marker?
 695		if end >= len(data) {
 696			return 0
 697		}
 698
 699		// verbatim copy to the working buffer
 700		if doRender {
 701			work.Write(data[beg:end])
 702		}
 703		beg = end
 704	}
 705
 706	if doRender {
 707		block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
 708		block.IsFenced = true
 709		block.FenceLength = fenceLength
 710		finalizeCodeBlock(block)
 711	}
 712
 713	return beg
 714}
 715
 716func unescapeChar(str []byte) []byte {
 717	if str[0] == '\\' {
 718		return []byte{str[1]}
 719	}
 720	return []byte(html.UnescapeString(string(str)))
 721}
 722
 723func unescapeString(str []byte) []byte {
 724	if reBackslashOrAmp.Match(str) {
 725		return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
 726	}
 727	return str
 728}
 729
 730func finalizeCodeBlock(block *Node) {
 731	if block.IsFenced {
 732		newlinePos := bytes.IndexByte(block.content, '\n')
 733		firstLine := block.content[:newlinePos]
 734		rest := block.content[newlinePos+1:]
 735		block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
 736		block.Literal = rest
 737	} else {
 738		block.Literal = block.content
 739	}
 740	block.content = nil
 741}
 742
 743func (p *Markdown) table(data []byte) int {
 744	table := p.addBlock(Table, nil)
 745	i, columns := p.tableHeader(data)
 746	if i == 0 {
 747		p.tip = table.Parent
 748		table.Unlink()
 749		return 0
 750	}
 751
 752	p.addBlock(TableBody, nil)
 753
 754	for i < len(data) {
 755		pipes, rowStart := 0, i
 756		for ; i < len(data) && data[i] != '\n'; i++ {
 757			if data[i] == '|' {
 758				pipes++
 759			}
 760		}
 761
 762		if pipes == 0 {
 763			i = rowStart
 764			break
 765		}
 766
 767		// include the newline in data sent to tableRow
 768		if i < len(data) && data[i] == '\n' {
 769			i++
 770		}
 771		p.tableRow(data[rowStart:i], columns, false)
 772	}
 773
 774	return i
 775}
 776
 777// check if the specified position is preceded by an odd number of backslashes
 778func isBackslashEscaped(data []byte, i int) bool {
 779	backslashes := 0
 780	for i-backslashes-1 >= 0 && data[i-backslashes-1] == '\\' {
 781		backslashes++
 782	}
 783	return backslashes&1 == 1
 784}
 785
 786func (p *Markdown) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
 787	i := 0
 788	colCount := 1
 789	for i = 0; i < len(data) && data[i] != '\n'; i++ {
 790		if data[i] == '|' && !isBackslashEscaped(data, i) {
 791			colCount++
 792		}
 793	}
 794
 795	// doesn't look like a table header
 796	if colCount == 1 {
 797		return
 798	}
 799
 800	// include the newline in the data sent to tableRow
 801	j := i
 802	if j < len(data) && data[j] == '\n' {
 803		j++
 804	}
 805	header := data[:j]
 806
 807	// column count ignores pipes at beginning or end of line
 808	if data[0] == '|' {
 809		colCount--
 810	}
 811	if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
 812		colCount--
 813	}
 814
 815	columns = make([]CellAlignFlags, colCount)
 816
 817	// move on to the header underline
 818	i++
 819	if i >= len(data) {
 820		return
 821	}
 822
 823	if data[i] == '|' && !isBackslashEscaped(data, i) {
 824		i++
 825	}
 826	i = skipChar(data, i, ' ')
 827
 828	// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
 829	// and trailing | optional on last column
 830	col := 0
 831	for i < len(data) && data[i] != '\n' {
 832		dashes := 0
 833
 834		if data[i] == ':' {
 835			i++
 836			columns[col] |= TableAlignmentLeft
 837			dashes++
 838		}
 839		for i < len(data) && data[i] == '-' {
 840			i++
 841			dashes++
 842		}
 843		if i < len(data) && data[i] == ':' {
 844			i++
 845			columns[col] |= TableAlignmentRight
 846			dashes++
 847		}
 848		for i < len(data) && data[i] == ' ' {
 849			i++
 850		}
 851		if i == len(data) {
 852			return
 853		}
 854		// end of column test is messy
 855		switch {
 856		case dashes < 3:
 857			// not a valid column
 858			return
 859
 860		case data[i] == '|' && !isBackslashEscaped(data, i):
 861			// marker found, now skip past trailing whitespace
 862			col++
 863			i++
 864			for i < len(data) && data[i] == ' ' {
 865				i++
 866			}
 867
 868			// trailing junk found after last column
 869			if col >= colCount && i < len(data) && data[i] != '\n' {
 870				return
 871			}
 872
 873		case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
 874			// something else found where marker was required
 875			return
 876
 877		case data[i] == '\n':
 878			// marker is optional for the last column
 879			col++
 880
 881		default:
 882			// trailing junk found after last column
 883			return
 884		}
 885	}
 886	if col != colCount {
 887		return
 888	}
 889
 890	p.addBlock(TableHead, nil)
 891	p.tableRow(header, columns, true)
 892	size = i
 893	if size < len(data) && data[size] == '\n' {
 894		size++
 895	}
 896	return
 897}
 898
 899func (p *Markdown) tableRow(data []byte, columns []CellAlignFlags, header bool) {
 900	p.addBlock(TableRow, nil)
 901	i, col := 0, 0
 902
 903	if data[i] == '|' && !isBackslashEscaped(data, i) {
 904		i++
 905	}
 906
 907	for col = 0; col < len(columns) && i < len(data); col++ {
 908		for i < len(data) && data[i] == ' ' {
 909			i++
 910		}
 911
 912		cellStart := i
 913
 914		for i < len(data) && (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
 915			i++
 916		}
 917
 918		cellEnd := i
 919
 920		// skip the end-of-cell marker, possibly taking us past end of buffer
 921		i++
 922
 923		for cellEnd > cellStart && cellEnd-1 < len(data) && data[cellEnd-1] == ' ' {
 924			cellEnd--
 925		}
 926
 927		cell := p.addBlock(TableCell, data[cellStart:cellEnd])
 928		cell.IsHeader = header
 929		cell.Align = columns[col]
 930	}
 931
 932	// pad it out with empty columns to get the right number
 933	for ; col < len(columns); col++ {
 934		cell := p.addBlock(TableCell, nil)
 935		cell.IsHeader = header
 936		cell.Align = columns[col]
 937	}
 938
 939	// silently ignore rows with too many cells
 940}
 941
 942// returns blockquote prefix length
 943func (p *Markdown) quotePrefix(data []byte) int {
 944	i := 0
 945	for i < 3 && i < len(data) && data[i] == ' ' {
 946		i++
 947	}
 948	if i < len(data) && data[i] == '>' {
 949		if i+1 < len(data) && data[i+1] == ' ' {
 950			return i + 2
 951		}
 952		return i + 1
 953	}
 954	return 0
 955}
 956
 957// blockquote ends with at least one blank line
 958// followed by something without a blockquote prefix
 959func (p *Markdown) terminateBlockquote(data []byte, beg, end int) bool {
 960	if p.isEmpty(data[beg:]) <= 0 {
 961		return false
 962	}
 963	if end >= len(data) {
 964		return true
 965	}
 966	return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
 967}
 968
 969// parse a blockquote fragment
 970func (p *Markdown) quote(data []byte) int {
 971	block := p.addBlock(BlockQuote, nil)
 972	var raw bytes.Buffer
 973	beg, end := 0, 0
 974	for beg < len(data) {
 975		end = beg
 976		// Step over whole lines, collecting them. While doing that, check for
 977		// fenced code and if one's found, incorporate it altogether,
 978		// irregardless of any contents inside it
 979		for end < len(data) && data[end] != '\n' {
 980			if p.extensions&FencedCode != 0 {
 981				if i := p.fencedCodeBlock(data[end:], false); i > 0 {
 982					// -1 to compensate for the extra end++ after the loop:
 983					end += i - 1
 984					break
 985				}
 986			}
 987			end++
 988		}
 989		if end < len(data) && data[end] == '\n' {
 990			end++
 991		}
 992		if pre := p.quotePrefix(data[beg:]); pre > 0 {
 993			// skip the prefix
 994			beg += pre
 995		} else if p.terminateBlockquote(data, beg, end) {
 996			break
 997		}
 998		// this line is part of the blockquote
 999		raw.Write(data[beg:end])
1000		beg = end
1001	}
1002	p.block(raw.Bytes())
1003	p.finalize(block)
1004	return end
1005}
1006
1007// returns prefix length for block code
1008func (p *Markdown) codePrefix(data []byte) int {
1009	if len(data) >= 1 && data[0] == '\t' {
1010		return 1
1011	}
1012	if len(data) >= 4 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1013		return 4
1014	}
1015	return 0
1016}
1017
1018func (p *Markdown) code(data []byte) int {
1019	var work bytes.Buffer
1020
1021	i := 0
1022	for i < len(data) {
1023		beg := i
1024		for i < len(data) && data[i] != '\n' {
1025			i++
1026		}
1027		if i < len(data) && data[i] == '\n' {
1028			i++
1029		}
1030
1031		blankline := p.isEmpty(data[beg:i]) > 0
1032		if pre := p.codePrefix(data[beg:i]); pre > 0 {
1033			beg += pre
1034		} else if !blankline {
1035			// non-empty, non-prefixed line breaks the pre
1036			i = beg
1037			break
1038		}
1039
1040		// verbatim copy to the working buffer
1041		if blankline {
1042			work.WriteByte('\n')
1043		} else {
1044			work.Write(data[beg:i])
1045		}
1046	}
1047
1048	// trim all the \n off the end of work
1049	workbytes := work.Bytes()
1050	eol := len(workbytes)
1051	for eol > 0 && workbytes[eol-1] == '\n' {
1052		eol--
1053	}
1054	if eol != len(workbytes) {
1055		work.Truncate(eol)
1056	}
1057
1058	work.WriteByte('\n')
1059
1060	block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1061	block.IsFenced = false
1062	finalizeCodeBlock(block)
1063
1064	return i
1065}
1066
1067// returns unordered list item prefix
1068func (p *Markdown) uliPrefix(data []byte) int {
1069	i := 0
1070	// start with up to 3 spaces
1071	for i < len(data) && i < 3 && data[i] == ' ' {
1072		i++
1073	}
1074	if i >= len(data)-1 {
1075		return 0
1076	}
1077	// need one of {'*', '+', '-'} followed by a space or a tab
1078	if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1079		(data[i+1] != ' ' && data[i+1] != '\t') {
1080		return 0
1081	}
1082	return i + 2
1083}
1084
1085// returns ordered list item prefix
1086func (p *Markdown) oliPrefix(data []byte) int {
1087	i := 0
1088
1089	// start with up to 3 spaces
1090	for i < 3 && i < len(data) && data[i] == ' ' {
1091		i++
1092	}
1093
1094	// count the digits
1095	start := i
1096	for i < len(data) && data[i] >= '0' && data[i] <= '9' {
1097		i++
1098	}
1099	if start == i || i >= len(data)-1 {
1100		return 0
1101	}
1102
1103	// we need >= 1 digits followed by a dot and a space or a tab
1104	if data[i] != '.' || !(data[i+1] == ' ' || data[i+1] == '\t') {
1105		return 0
1106	}
1107	return i + 2
1108}
1109
1110// returns definition list item prefix
1111func (p *Markdown) dliPrefix(data []byte) int {
1112	if len(data) < 2 {
1113		return 0
1114	}
1115	i := 0
1116	// need a ':' followed by a space or a tab
1117	if data[i] != ':' || !(data[i+1] == ' ' || data[i+1] == '\t') {
1118		return 0
1119	}
1120	for i < len(data) && data[i] == ' ' {
1121		i++
1122	}
1123	return i + 2
1124}
1125
1126// parse ordered or unordered list block
1127func (p *Markdown) list(data []byte, flags ListType) int {
1128	i := 0
1129	flags |= ListItemBeginningOfList
1130	block := p.addBlock(List, nil)
1131	block.ListFlags = flags
1132	block.Tight = true
1133
1134	for i < len(data) {
1135		skip := p.listItem(data[i:], &flags)
1136		if flags&ListItemContainsBlock != 0 {
1137			block.ListData.Tight = false
1138		}
1139		i += skip
1140		if skip == 0 || flags&ListItemEndOfList != 0 {
1141			break
1142		}
1143		flags &= ^ListItemBeginningOfList
1144	}
1145
1146	above := block.Parent
1147	finalizeList(block)
1148	p.tip = above
1149	return i
1150}
1151
1152// Returns true if the list item is not the same type as its parent list
1153func (p *Markdown) listTypeChanged(data []byte, flags *ListType) bool {
1154	if p.dliPrefix(data) > 0 && *flags&ListTypeDefinition == 0 {
1155		return true
1156	} else if p.oliPrefix(data) > 0 && *flags&ListTypeOrdered == 0 {
1157		return true
1158	} else if p.uliPrefix(data) > 0 && (*flags&ListTypeOrdered != 0 || *flags&ListTypeDefinition != 0) {
1159		return true
1160	}
1161	return false
1162}
1163
1164// Returns true if block ends with a blank line, descending if needed
1165// into lists and sublists.
1166func endsWithBlankLine(block *Node) bool {
1167	// TODO: figure this out. Always false now.
1168	for block != nil {
1169		//if block.lastLineBlank {
1170		//return true
1171		//}
1172		t := block.Type
1173		if t == List || t == Item {
1174			block = block.LastChild
1175		} else {
1176			break
1177		}
1178	}
1179	return false
1180}
1181
1182func finalizeList(block *Node) {
1183	block.open = false
1184	item := block.FirstChild
1185	for item != nil {
1186		// check for non-final list item ending with blank line:
1187		if endsWithBlankLine(item) && item.Next != nil {
1188			block.ListData.Tight = false
1189			break
1190		}
1191		// recurse into children of list item, to see if there are spaces
1192		// between any of them:
1193		subItem := item.FirstChild
1194		for subItem != nil {
1195			if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1196				block.ListData.Tight = false
1197				break
1198			}
1199			subItem = subItem.Next
1200		}
1201		item = item.Next
1202	}
1203}
1204
1205// Parse a single list item.
1206// Assumes initial prefix is already removed if this is a sublist.
1207func (p *Markdown) listItem(data []byte, flags *ListType) int {
1208	// keep track of the indentation of the first line
1209	itemIndent := 0
1210	if data[0] == '\t' {
1211		itemIndent += 4
1212	} else {
1213		for itemIndent < 3 && data[itemIndent] == ' ' {
1214			itemIndent++
1215		}
1216	}
1217
1218	var bulletChar byte = '*'
1219	i := p.uliPrefix(data)
1220	if i == 0 {
1221		i = p.oliPrefix(data)
1222	} else {
1223		bulletChar = data[i-2]
1224	}
1225	if i == 0 {
1226		i = p.dliPrefix(data)
1227		// reset definition term flag
1228		if i > 0 {
1229			*flags &= ^ListTypeTerm
1230		}
1231	}
1232	if i == 0 {
1233		// if in definition list, set term flag and continue
1234		if *flags&ListTypeDefinition != 0 {
1235			*flags |= ListTypeTerm
1236		} else {
1237			return 0
1238		}
1239	}
1240
1241	// skip leading whitespace on first line
1242	for i < len(data) && data[i] == ' ' {
1243		i++
1244	}
1245
1246	// find the end of the line
1247	line := i
1248	for i > 0 && i < len(data) && data[i-1] != '\n' {
1249		i++
1250	}
1251
1252	// get working buffer
1253	var raw bytes.Buffer
1254
1255	// put the first line into the working buffer
1256	raw.Write(data[line:i])
1257	line = i
1258
1259	// process the following lines
1260	containsBlankLine := false
1261	sublist := 0
1262	codeBlockMarker := ""
1263
1264gatherlines:
1265	for line < len(data) {
1266		i++
1267
1268		// find the end of this line
1269		for i < len(data) && data[i-1] != '\n' {
1270			i++
1271		}
1272
1273		// if it is an empty line, guess that it is part of this item
1274		// and move on to the next line
1275		if p.isEmpty(data[line:i]) > 0 {
1276			containsBlankLine = true
1277			line = i
1278			continue
1279		}
1280
1281		// calculate the indentation
1282		indent := 0
1283		indentIndex := 0
1284		if data[line] == '\t' {
1285			indentIndex++
1286			indent += 4
1287		} else {
1288			for indent < 4 && line+indent < i && data[line+indent] == ' ' {
1289				indent++
1290				indentIndex++
1291			}
1292		}
1293
1294		chunk := data[line+indentIndex : i]
1295
1296		if p.extensions&FencedCode != 0 {
1297			// determine if in or out of codeblock
1298			// if in codeblock, ignore normal list processing
1299			_, marker := isFenceLine(chunk, nil, codeBlockMarker)
1300			if marker != "" {
1301				if codeBlockMarker == "" {
1302					// start of codeblock
1303					codeBlockMarker = marker
1304				} else {
1305					// end of codeblock.
1306					codeBlockMarker = ""
1307				}
1308			}
1309			// we are in a codeblock, write line, and continue
1310			if codeBlockMarker != "" || marker != "" {
1311				raw.Write(data[line+indentIndex : i])
1312				line = i
1313				continue gatherlines
1314			}
1315		}
1316
1317		// evaluate how this line fits in
1318		switch {
1319		// is this a nested list item?
1320		case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
1321			p.oliPrefix(chunk) > 0 ||
1322			p.dliPrefix(chunk) > 0:
1323
1324			// to be a nested list, it must be indented more
1325			// if not, it is either a different kind of list
1326			// or the next item in the same list
1327			if indent <= itemIndent {
1328				if p.listTypeChanged(chunk, flags) {
1329					*flags |= ListItemEndOfList
1330				} else if containsBlankLine {
1331					*flags |= ListItemContainsBlock
1332				}
1333
1334				break gatherlines
1335			}
1336
1337			if containsBlankLine {
1338				*flags |= ListItemContainsBlock
1339			}
1340
1341			// is this the first item in the nested list?
1342			if sublist == 0 {
1343				sublist = raw.Len()
1344			}
1345
1346		// is this a nested prefix heading?
1347		case p.isPrefixHeading(chunk):
1348			// if the heading is not indented, it is not nested in the list
1349			// and thus ends the list
1350			if containsBlankLine && indent < 4 {
1351				*flags |= ListItemEndOfList
1352				break gatherlines
1353			}
1354			*flags |= ListItemContainsBlock
1355
1356		// anything following an empty line is only part
1357		// of this item if it is indented 4 spaces
1358		// (regardless of the indentation of the beginning of the item)
1359		case containsBlankLine && indent < 4:
1360			if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
1361				// is the next item still a part of this list?
1362				next := i
1363				for next < len(data) && data[next] != '\n' {
1364					next++
1365				}
1366				for next < len(data)-1 && data[next] == '\n' {
1367					next++
1368				}
1369				if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
1370					*flags |= ListItemEndOfList
1371				}
1372			} else {
1373				*flags |= ListItemEndOfList
1374			}
1375			break gatherlines
1376
1377		// a blank line means this should be parsed as a block
1378		case containsBlankLine:
1379			raw.WriteByte('\n')
1380			*flags |= ListItemContainsBlock
1381		}
1382
1383		// if this line was preceded by one or more blanks,
1384		// re-introduce the blank into the buffer
1385		if containsBlankLine {
1386			containsBlankLine = false
1387			raw.WriteByte('\n')
1388		}
1389
1390		// add the line into the working buffer without prefix
1391		raw.Write(data[line+indentIndex : i])
1392
1393		line = i
1394	}
1395
1396	rawBytes := raw.Bytes()
1397
1398	block := p.addBlock(Item, nil)
1399	block.ListFlags = *flags
1400	block.Tight = false
1401	block.BulletChar = bulletChar
1402	block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark
1403
1404	// render the contents of the list item
1405	if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
1406		// intermediate render of block item, except for definition term
1407		if sublist > 0 {
1408			p.block(rawBytes[:sublist])
1409			p.block(rawBytes[sublist:])
1410		} else {
1411			p.block(rawBytes)
1412		}
1413	} else {
1414		// intermediate render of inline item
1415		if sublist > 0 {
1416			child := p.addChild(Paragraph, 0)
1417			child.content = rawBytes[:sublist]
1418			p.block(rawBytes[sublist:])
1419		} else {
1420			child := p.addChild(Paragraph, 0)
1421			child.content = rawBytes
1422		}
1423	}
1424	return line
1425}
1426
1427// render a single paragraph that has already been parsed out
1428func (p *Markdown) renderParagraph(data []byte) {
1429	if len(data) == 0 {
1430		return
1431	}
1432
1433	// trim leading spaces
1434	beg := 0
1435	for data[beg] == ' ' {
1436		beg++
1437	}
1438
1439	end := len(data)
1440	// trim trailing newline
1441	if data[len(data)-1] == '\n' {
1442		end--
1443	}
1444
1445	// trim trailing spaces
1446	for end > beg && data[end-1] == ' ' {
1447		end--
1448	}
1449
1450	p.addBlock(Paragraph, data[beg:end])
1451}
1452
1453func (p *Markdown) paragraph(data []byte) int {
1454	// prev: index of 1st char of previous line
1455	// line: index of 1st char of current line
1456	// i: index of cursor/end of current line
1457	var prev, line, i int
1458	tabSize := TabSizeDefault
1459	if p.extensions&TabSizeEight != 0 {
1460		tabSize = TabSizeDouble
1461	}
1462	// keep going until we find something to mark the end of the paragraph
1463	for i < len(data) {
1464		// mark the beginning of the current line
1465		prev = line
1466		current := data[i:]
1467		line = i
1468
1469		// did we find a reference or a footnote? If so, end a paragraph
1470		// preceding it and report that we have consumed up to the end of that
1471		// reference:
1472		if refEnd := isReference(p, current, tabSize); refEnd > 0 {
1473			p.renderParagraph(data[:i])
1474			return i + refEnd
1475		}
1476
1477		// did we find a blank line marking the end of the paragraph?
1478		if n := p.isEmpty(current); n > 0 {
1479			// did this blank line followed by a definition list item?
1480			if p.extensions&DefinitionLists != 0 {
1481				if i < len(data)-1 && data[i+1] == ':' {
1482					return p.list(data[prev:], ListTypeDefinition)
1483				}
1484			}
1485
1486			p.renderParagraph(data[:i])
1487			return i + n
1488		}
1489
1490		// an underline under some text marks a heading, so our paragraph ended on prev line
1491		if i > 0 {
1492			if level := p.isUnderlinedHeading(current); level > 0 {
1493				// render the paragraph
1494				p.renderParagraph(data[:prev])
1495
1496				// ignore leading and trailing whitespace
1497				eol := i - 1
1498				for prev < eol && data[prev] == ' ' {
1499					prev++
1500				}
1501				for eol > prev && data[eol-1] == ' ' {
1502					eol--
1503				}
1504
1505				id := ""
1506				if p.extensions&AutoHeadingIDs != 0 {
1507					id = SanitizedAnchorName(string(data[prev:eol]))
1508				}
1509
1510				block := p.addBlock(Heading, data[prev:eol])
1511				block.Level = level
1512				block.HeadingID = id
1513
1514				// find the end of the underline
1515				for i < len(data) && data[i] != '\n' {
1516					i++
1517				}
1518				return i
1519			}
1520		}
1521
1522		// if the next line starts a block of HTML, then the paragraph ends here
1523		if p.extensions&LaxHTMLBlocks != 0 {
1524			if data[i] == '<' && p.html(current, false) > 0 {
1525				// rewind to before the HTML block
1526				p.renderParagraph(data[:i])
1527				return i
1528			}
1529		}
1530
1531		// if there's a prefixed heading or a horizontal rule after this, paragraph is over
1532		if p.isPrefixHeading(current) || p.isHRule(current) {
1533			p.renderParagraph(data[:i])
1534			return i
1535		}
1536
1537		// if there's a fenced code block, paragraph is over
1538		if p.extensions&FencedCode != 0 {
1539			if p.fencedCodeBlock(current, false) > 0 {
1540				p.renderParagraph(data[:i])
1541				return i
1542			}
1543		}
1544
1545		// if there's a definition list item, prev line is a definition term
1546		if p.extensions&DefinitionLists != 0 {
1547			if p.dliPrefix(current) != 0 {
1548				ret := p.list(data[prev:], ListTypeDefinition)
1549				return ret
1550			}
1551		}
1552
1553		// if there's a list after this, paragraph is over
1554		if p.extensions&NoEmptyLineBeforeBlock != 0 {
1555			if p.uliPrefix(current) != 0 ||
1556				p.oliPrefix(current) != 0 ||
1557				p.quotePrefix(current) != 0 ||
1558				p.codePrefix(current) != 0 {
1559				p.renderParagraph(data[:i])
1560				return i
1561			}
1562		}
1563
1564		// otherwise, scan to the beginning of the next line
1565		nl := bytes.IndexByte(data[i:], '\n')
1566		if nl >= 0 {
1567			i += nl + 1
1568		} else {
1569			i += len(data[i:])
1570		}
1571	}
1572
1573	p.renderParagraph(data[:i])
1574	return i
1575}
1576
// skipChar returns the index of the first byte at or after start that differs
// from char, or the end of the scan when every remaining byte equals char.
func skipChar(data []byte, start int, char byte) int {
	for start < len(data) && data[start] == char {
		start++
	}
	return start
}
1584
// skipUntilChar returns the index of the first occurrence of char in text at
// or after start. When char does not occur, the position where the scan
// stopped is returned: len(text), or start itself if it is already past the end.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for ; pos < len(text); pos++ {
		if text[pos] == char {
			break
		}
	}
	return pos
}
1592
// SanitizedAnchorName returns a sanitized anchor name for the given text.
//
// Letters and numbers are kept and lowercased. Every run of other characters
// between two kept characters collapses into a single '-', and such runs at
// the very start or end of text are dropped.
func SanitizedAnchorName(text string) string {
	var out []rune
	pendingDash := false
	for _, r := range text {
		if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
			// remember the separator; it is only written once another
			// letter or number follows
			pendingDash = true
			continue
		}
		if pendingDash {
			if len(out) > 0 {
				out = append(out, '-')
			}
			pendingDash = false
		}
		out = append(out, unicode.ToLower(r))
	}
	return string(out)
}