all repos — grayfriday @ 83b4cb6062026552be18b31998adc933abb2c39c

blackfriday fork with a few changes

markdown.go (view raw)

   1//
   2// Blackfriday Markdown Processor
   3// Available at http://github.com/russross/blackfriday
   4//
   5// Copyright © 2011 Russ Ross <russ@russross.com>.
   6// Distributed under the Simplified BSD License.
   7// See README.md for details.
   8//
   9
  10//
  11//
  12// Markdown parsing and processing
  13//
  14//
  15
  16// Blackfriday markdown processor.
  17//
  18// Translates plain text with simple formatting rules into HTML or LaTeX.
  19package blackfriday
  20
  21import (
  22	"bytes"
  23	"fmt"
  24	"strings"
  25	"unicode/utf8"
  26)
  27
  28const VERSION = "1.4"
  29
  30type Extensions int
  31
  32// These are the supported markdown parsing extensions.
  33// OR these values together to select multiple extensions.
  34const (
  35	NoExtensions            Extensions = 0
  36	NoIntraEmphasis         Extensions = 1 << iota // Ignore emphasis markers inside words
  37	Tables                                         // Render tables
  38	FencedCode                                     // Render fenced code blocks
  39	Autolink                                       // Detect embedded URLs that are not explicitly marked
  40	Strikethrough                                  // Strikethrough text using ~~test~~
  41	LaxHTMLBlocks                                  // Loosen up HTML block parsing rules
  42	SpaceHeaders                                   // Be strict about prefix header rules
  43	HardLineBreak                                  // Translate newlines into line breaks
  44	TabSizeEight                                   // Expand tabs to eight spaces instead of four
  45	Footnotes                                      // Pandoc-style footnotes
  46	NoEmptyLineBeforeBlock                         // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
  47	HeaderIDs                                      // specify header IDs  with {#id}
  48	Titleblock                                     // Titleblock ala pandoc
  49	AutoHeaderIDs                                  // Create the header ID from the text
  50	BackslashLineBreak                             // Translate trailing backslashes into line breaks
  51	DefinitionLists                                // Render definition lists
  52	Smartypants                                    // Enable smart punctuation substitutions
  53	SmartypantsFractions                           // Enable smart fractions (with Smartypants)
  54	SmartypantsDashes                              // Enable smart dashes (with Smartypants)
  55	SmartypantsLatexDashes                         // Enable LaTeX-style dashes (with Smartypants)
  56	SmartypantsAngledQuotes                        // Enable angled double quotes (with Smartypants) for double quotes rendering
  57	TOC                                            // Generate a table of contents
  58	OmitContents                                   // Skip the main contents (for a standalone table of contents)
  59
  60	CommonHtmlFlags HTMLFlags = UseXHTML
  61
  62	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
  63		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
  64		BackslashLineBreak | DefinitionLists | Smartypants |
  65		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
  66)
  67
  68var DefaultOptions = Options{
  69	Extensions: CommonExtensions,
  70}
  71
  72type LinkType int
  73
  74// These are the possible flag values for the link renderer.
  75// Only a single one of these values will be used; they are not ORed together.
  76// These are mostly of interest if you are writing a new output format.
  77const (
  78	LinkTypeNotAutolink LinkType = iota
  79	LinkTypeNormal
  80	LinkTypeEmail
  81)
  82
  83type ListType int
  84
  85// These are the possible flag values for the ListItem renderer.
  86// Multiple flag values may be ORed together.
  87// These are mostly of interest if you are writing a new output format.
  88const (
  89	ListTypeOrdered ListType = 1 << iota
  90	ListTypeDefinition
  91	ListTypeTerm
  92
  93	ListItemContainsBlock
  94	ListItemBeginningOfList
  95	ListItemEndOfList
  96)
  97
  98type CellAlignFlags int
  99
 100// These are the possible flag values for the table cell renderer.
 101// Only a single one of these values will be used; they are not ORed together.
 102// These are mostly of interest if you are writing a new output format.
 103const (
 104	TableAlignmentLeft = 1 << iota
 105	TableAlignmentRight
 106	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
 107)
 108
 109// The size of a tab stop.
 110const (
 111	TabSizeDefault = 4
 112	TabSizeDouble  = 8
 113)
 114
 115// blockTags is a set of tags that are recognized as HTML block tags.
 116// Any of these can be included in markdown text without special escaping.
 117var blockTags = map[string]struct{}{
 118	"blockquote": struct{}{},
 119	"del":        struct{}{},
 120	"div":        struct{}{},
 121	"dl":         struct{}{},
 122	"fieldset":   struct{}{},
 123	"form":       struct{}{},
 124	"h1":         struct{}{},
 125	"h2":         struct{}{},
 126	"h3":         struct{}{},
 127	"h4":         struct{}{},
 128	"h5":         struct{}{},
 129	"h6":         struct{}{},
 130	"iframe":     struct{}{},
 131	"ins":        struct{}{},
 132	"math":       struct{}{},
 133	"noscript":   struct{}{},
 134	"ol":         struct{}{},
 135	"pre":        struct{}{},
 136	"p":          struct{}{},
 137	"script":     struct{}{},
 138	"style":      struct{}{},
 139	"table":      struct{}{},
 140	"ul":         struct{}{},
 141
 142	// HTML5
 143	"address":    struct{}{},
 144	"article":    struct{}{},
 145	"aside":      struct{}{},
 146	"canvas":     struct{}{},
 147	"figcaption": struct{}{},
 148	"figure":     struct{}{},
 149	"footer":     struct{}{},
 150	"header":     struct{}{},
 151	"hgroup":     struct{}{},
 152	"main":       struct{}{},
 153	"nav":        struct{}{},
 154	"output":     struct{}{},
 155	"progress":   struct{}{},
 156	"section":    struct{}{},
 157	"video":      struct{}{},
 158}
 159
 160// Renderer is the rendering interface.
 161// This is mostly of interest if you are implementing a new rendering format.
 162//
 163// When a byte slice is provided, it contains the (rendered) contents of the
 164// element.
 165//
 166// When a callback is provided instead, it will write the contents of the
 167// respective element directly to the output buffer and return true on success.
 168// If the callback returns false, the rendering function should reset the
 169// output buffer as though it had never been called.
 170//
 171// Currently Html and Latex implementations are provided
 172type Renderer interface {
 173	// block-level callbacks
 174	BlockCode(text []byte, lang string)
 175	BlockQuote(text []byte)
 176	BlockHtml(text []byte)
 177	BeginHeader(level int, id string)
 178	EndHeader(level int, id string, header []byte)
 179	HRule()
 180	BeginList(flags ListType)
 181	EndList(flags ListType)
 182	ListItem(text []byte, flags ListType)
 183	BeginParagraph()
 184	EndParagraph()
 185	Table(header []byte, body []byte, columnData []CellAlignFlags)
 186	TableRow(text []byte)
 187	TableHeaderCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
 188	TableCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
 189	BeginFootnotes()
 190	EndFootnotes()
 191	FootnoteItem(name, text []byte, flags ListType)
 192	TitleBlock(text []byte)
 193
 194	// Span-level callbacks
 195	AutoLink(link []byte, kind LinkType)
 196	CodeSpan(text []byte)
 197	DoubleEmphasis(text []byte)
 198	Emphasis(text []byte)
 199	Image(link []byte, title []byte, alt []byte)
 200	LineBreak()
 201	Link(link []byte, title []byte, content []byte)
 202	RawHtmlTag(tag []byte)
 203	TripleEmphasis(text []byte)
 204	StrikeThrough(text []byte)
 205	FootnoteRef(ref []byte, id int)
 206
 207	// Low-level callbacks
 208	Entity(entity []byte)
 209	NormalText(text []byte)
 210
 211	// Header and footer
 212	DocumentHeader()
 213	DocumentFooter()
 214
 215	Write(b []byte) (int, error)
 216
 217	Render(ast *Node) []byte
 218}
 219
 220// Callback functions for inline parsing. One such function is defined
 221// for each character that triggers a response when parsing inline data.
 222type inlineParser func(p *parser, data []byte, offset int) int
 223
 224// Parser holds runtime state used by the parser.
 225// This is constructed by the Markdown function.
 226type parser struct {
 227	refOverride    ReferenceOverrideFunc
 228	refs           map[string]*reference
 229	inlineCallback [256]inlineParser
 230	flags          Extensions
 231	nesting        int
 232	maxNesting     int
 233	insideLink     bool
 234
 235	// Footnotes need to be ordered as well as available to quickly check for
 236	// presence. If a ref is also a footnote, it's stored both in refs and here
 237	// in notes. Slice is nil if footnotes not enabled.
 238	notes []*reference
 239
 240	doc                  *Node
 241	tip                  *Node // = doc
 242	oldTip               *Node
 243	lastMatchedContainer *Node // = doc
 244	allClosed            bool
 245	currBlock            *Node // a block node currently being parsed by inline parser
 246}
 247
 248func (p *parser) getRef(refid string) (ref *reference, found bool) {
 249	if p.refOverride != nil {
 250		r, overridden := p.refOverride(refid)
 251		if overridden {
 252			if r == nil {
 253				return nil, false
 254			}
 255			return &reference{
 256				link:     []byte(r.Link),
 257				title:    []byte(r.Title),
 258				noteId:   0,
 259				hasBlock: false,
 260				text:     []byte(r.Text)}, true
 261		}
 262	}
 263	// refs are case insensitive
 264	ref, found = p.refs[strings.ToLower(refid)]
 265	return ref, found
 266}
 267
 268func (p *parser) finalize(block *Node) {
 269	above := block.Parent
 270	block.open = false
 271	p.tip = above
 272}
 273
 274func (p *parser) addChild(node NodeType, offset uint32) *Node {
 275	for !p.tip.canContain(node) {
 276		p.finalize(p.tip)
 277	}
 278	newNode := NewNode(node)
 279	newNode.content = []byte{}
 280	p.tip.appendChild(newNode)
 281	p.tip = newNode
 282	return newNode
 283}
 284
 285func (p *parser) closeUnmatchedBlocks() {
 286	if !p.allClosed {
 287		for p.oldTip != p.lastMatchedContainer {
 288			parent := p.oldTip.Parent
 289			p.finalize(p.oldTip)
 290			p.oldTip = parent
 291		}
 292		p.allClosed = true
 293	}
 294}
 295
 296//
 297//
 298// Public interface
 299//
 300//
 301
 302// Reference represents the details of a link.
 303// See the documentation in Options for more details on use-case.
 304type Reference struct {
 305	// Link is usually the URL the reference points to.
 306	Link string
 307	// Title is the alternate text describing the link in more detail.
 308	Title string
 309	// Text is the optional text to override the ref with if the syntax used was
 310	// [refid][]
 311	Text string
 312}
 313
 314// ReferenceOverrideFunc is expected to be called with a reference string and
 315// return either a valid Reference type that the reference string maps to or
 316// nil. If overridden is false, the default reference logic will be executed.
 317// See the documentation in Options for more details on use-case.
 318type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
 319
 320// Options represents configurable overrides and callbacks (in addition to the
 321// extension flag set) for configuring a Markdown parse.
 322type Options struct {
 323	// Extensions is a flag set of bit-wise ORed extension bits. See the
 324	// Extensions flags defined in this package.
 325	Extensions Extensions
 326
 327	// ReferenceOverride is an optional function callback that is called every
 328	// time a reference is resolved.
 329	//
 330	// In Markdown, the link reference syntax can be made to resolve a link to
 331	// a reference instead of an inline URL, in one of the following ways:
 332	//
 333	//  * [link text][refid]
 334	//  * [refid][]
 335	//
 336	// Usually, the refid is defined at the bottom of the Markdown document. If
 337	// this override function is provided, the refid is passed to the override
 338	// function first, before consulting the defined refids at the bottom. If
 339	// the override function indicates an override did not occur, the refids at
 340	// the bottom will be used to fill in the link details.
 341	ReferenceOverride ReferenceOverrideFunc
 342}
 343
 344// MarkdownBasic is a convenience function for simple rendering.
 345// It processes markdown input with no extensions enabled.
 346func MarkdownBasic(input []byte) []byte {
 347	// set up the HTML renderer
 348	htmlFlags := UseXHTML
 349	renderer := HTMLRenderer(htmlFlags, CommonExtensions, "", "")
 350
 351	// set up the parser
 352	return MarkdownOptions(input, renderer, Options{Extensions: 0})
 353}
 354
 355// Call Markdown with most useful extensions enabled
 356// MarkdownCommon is a convenience function for simple rendering.
 357// It processes markdown input with common extensions enabled, including:
 358//
 359// * Smartypants processing with smart fractions and LaTeX dashes
 360//
 361// * Intra-word emphasis suppression
 362//
 363// * Tables
 364//
 365// * Fenced code blocks
 366//
 367// * Autolinking
 368//
 369// * Strikethrough support
 370//
 371// * Strict header parsing
 372//
 373// * Custom Header IDs
 374func MarkdownCommon(input []byte) []byte {
 375	// set up the HTML renderer
 376	renderer := HTMLRenderer(CommonHtmlFlags, CommonExtensions, "", "")
 377	return MarkdownOptions(input, renderer, DefaultOptions)
 378}
 379
 380// Markdown is the main rendering function.
 381// It parses and renders a block of markdown-encoded text.
 382// The supplied Renderer is used to format the output, and extensions dictates
 383// which non-standard extensions are enabled.
 384//
 385// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
 386// LatexRenderer, respectively.
 387func Markdown(input []byte, renderer Renderer, extensions Extensions) []byte {
 388	return MarkdownOptions(input, renderer, Options{
 389		Extensions: extensions})
 390}
 391
 392// MarkdownOptions is just like Markdown but takes additional options through
 393// the Options struct.
 394func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
 395	// no point in parsing if we can't render
 396	if renderer == nil {
 397		return nil
 398	}
 399	return renderer.Render(Parse(input, opts))
 400}
 401
 402func Parse(input []byte, opts Options) *Node {
 403	extensions := opts.Extensions
 404
 405	// fill in the render structure
 406	p := new(parser)
 407	p.flags = extensions
 408	p.refOverride = opts.ReferenceOverride
 409	p.refs = make(map[string]*reference)
 410	p.maxNesting = 16
 411	p.insideLink = false
 412
 413	docNode := NewNode(Document)
 414	p.doc = docNode
 415	p.tip = docNode
 416	p.oldTip = docNode
 417	p.lastMatchedContainer = docNode
 418	p.allClosed = true
 419
 420	// register inline parsers
 421	p.inlineCallback['*'] = emphasis
 422	p.inlineCallback['_'] = emphasis
 423	if extensions&Strikethrough != 0 {
 424		p.inlineCallback['~'] = emphasis
 425	}
 426	p.inlineCallback['`'] = codeSpan
 427	p.inlineCallback['\n'] = lineBreak
 428	p.inlineCallback['['] = link
 429	p.inlineCallback['<'] = leftAngle
 430	p.inlineCallback['\\'] = escape
 431	p.inlineCallback['&'] = entity
 432	p.inlineCallback['!'] = maybeImage
 433	p.inlineCallback['^'] = maybeInlineFootnote
 434
 435	if extensions&Autolink != 0 {
 436		p.inlineCallback['h'] = maybeAutoLink
 437		p.inlineCallback['m'] = maybeAutoLink
 438		p.inlineCallback['f'] = maybeAutoLink
 439		p.inlineCallback['H'] = maybeAutoLink
 440		p.inlineCallback['M'] = maybeAutoLink
 441		p.inlineCallback['F'] = maybeAutoLink
 442	}
 443
 444	if extensions&Footnotes != 0 {
 445		p.notes = make([]*reference, 0)
 446	}
 447
 448	first := firstPass(p, input)
 449	secondPass(p, first)
 450	// Walk the tree and finish up some of unfinished blocks
 451	for p.tip != nil {
 452		p.finalize(p.tip)
 453	}
 454	// Walk the tree again and process inline markdown in each block
 455	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
 456		if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
 457			p.currBlock = node
 458			p.inline(node.content)
 459			node.content = nil
 460		}
 461		return GoToNext
 462	})
 463	p.parseRefsToAST()
 464	p.generateTOC()
 465	return p.doc
 466}
 467
 468func (p *parser) generateTOC() {
 469	if p.flags&TOC == 0 && p.flags&OmitContents == 0 {
 470		return
 471	}
 472	navNode := NewNode(HTMLBlock)
 473	navNode.Literal = []byte("<nav>")
 474	navNode.open = false
 475
 476	var topList *Node
 477	var listNode *Node
 478	var lastItem *Node
 479	headerCount := 0
 480	var currentLevel uint32
 481	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
 482		if entering && node.Type == Header {
 483			if node.Level > currentLevel {
 484				currentLevel++
 485				newList := NewNode(List)
 486				if lastItem != nil {
 487					lastItem.appendChild(newList)
 488					listNode = newList
 489				} else {
 490					listNode = newList
 491					topList = listNode
 492				}
 493			}
 494			if node.Level < currentLevel {
 495				finalizeList(listNode)
 496				lastItem = listNode.Parent
 497				listNode = lastItem.Parent
 498			}
 499			node.HeaderID = fmt.Sprintf("toc_%d", headerCount)
 500			headerCount++
 501			lastItem = NewNode(Item)
 502			listNode.appendChild(lastItem)
 503			anchorNode := NewNode(Link)
 504			anchorNode.Destination = []byte("#" + node.HeaderID)
 505			lastItem.appendChild(anchorNode)
 506			anchorNode.appendChild(text(node.FirstChild.Literal))
 507		}
 508		return GoToNext
 509	})
 510	firstChild := p.doc.FirstChild
 511	// Insert TOC only if there is anything to insert
 512	if topList != nil {
 513		finalizeList(topList)
 514		firstChild.insertBefore(navNode)
 515		firstChild.insertBefore(topList)
 516		navCloseNode := NewNode(HTMLBlock)
 517		navCloseNode.Literal = []byte("</nav>")
 518		navCloseNode.open = false
 519		firstChild.insertBefore(navCloseNode)
 520	}
 521	// Drop everything after the TOC if OmitContents was requested
 522	if p.flags&OmitContents != 0 {
 523		for firstChild != nil {
 524			next := firstChild.Next
 525			firstChild.unlink()
 526			firstChild = next
 527		}
 528	}
 529}
 530
 531func (p *parser) parseRefsToAST() {
 532	if p.flags&Footnotes == 0 || len(p.notes) == 0 {
 533		return
 534	}
 535	p.tip = p.doc
 536	finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte(`<div class="footnotes">`)))
 537	p.addBlock(HorizontalRule, nil)
 538	block := p.addBlock(List, nil)
 539	block.ListFlags = ListTypeOrdered
 540	flags := ListItemBeginningOfList
 541	// Note: this loop is intentionally explicit, not range-form. This is
 542	// because the body of the loop will append nested footnotes to p.notes and
 543	// we need to process those late additions. Range form would only walk over
 544	// the fixed initial set.
 545	for i := 0; i < len(p.notes); i++ {
 546		ref := p.notes[i]
 547		block := p.addBlock(Item, nil)
 548		block.ListFlags = ListTypeOrdered
 549		block.RefLink = ref.link
 550		if ref.hasBlock {
 551			flags |= ListItemContainsBlock
 552			p.block(ref.title)
 553		} else {
 554			p.currBlock = block
 555			p.inline(ref.title)
 556		}
 557		flags &^= ListItemBeginningOfList | ListItemContainsBlock
 558	}
 559	above := block.Parent
 560	finalizeList(block)
 561	p.tip = above
 562	finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte("</div>")))
 563	block.Walk(func(node *Node, entering bool) WalkStatus {
 564		if node.Type == Paragraph || node.Type == Header {
 565			p.currBlock = node
 566			p.inline(node.content)
 567			node.content = nil
 568		}
 569		return GoToNext
 570	})
 571}
 572
 573// first pass:
 574// - extract references
 575// - expand tabs
 576// - normalize newlines
 577// - copy everything else
 578func firstPass(p *parser, input []byte) []byte {
 579	var out bytes.Buffer
 580	tabSize := TabSizeDefault
 581	if p.flags&TabSizeEight != 0 {
 582		tabSize = TabSizeDouble
 583	}
 584	beg, end := 0, 0
 585	lastFencedCodeBlockEnd := 0
 586	for beg < len(input) { // iterate over lines
 587		if end = isReference(p, input[beg:], tabSize); end > 0 {
 588			beg += end
 589		} else { // skip to the next line
 590			end = beg
 591			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
 592				end++
 593			}
 594
 595			if p.flags&FencedCode != 0 {
 596				// track fenced code block boundaries to suppress tab expansion
 597				// inside them:
 598				if beg >= lastFencedCodeBlockEnd {
 599					if i := p.fencedCode(input[beg:], false); i > 0 {
 600						lastFencedCodeBlockEnd = beg + i
 601					}
 602				}
 603			}
 604
 605			// add the line body if present
 606			if end > beg {
 607				if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
 608					out.Write(input[beg:end])
 609				} else {
 610					expandTabs(&out, input[beg:end], tabSize)
 611				}
 612			}
 613			out.WriteByte('\n')
 614
 615			if end < len(input) && input[end] == '\r' {
 616				end++
 617			}
 618			if end < len(input) && input[end] == '\n' {
 619				end++
 620			}
 621
 622			beg = end
 623		}
 624	}
 625
 626	// empty input?
 627	if out.Len() == 0 {
 628		out.WriteByte('\n')
 629	}
 630
 631	return out.Bytes()
 632}
 633
 634// second pass: actual rendering
 635func secondPass(p *parser, input []byte) {
 636	p.block(input)
 637
 638	if p.flags&Footnotes != 0 && len(p.notes) > 0 {
 639		flags := ListItemBeginningOfList
 640		for i := 0; i < len(p.notes); i += 1 {
 641			ref := p.notes[i]
 642			if ref.hasBlock {
 643				flags |= ListItemContainsBlock
 644				p.block(ref.title)
 645			} else {
 646				p.inline(ref.title)
 647			}
 648			flags &^= ListItemBeginningOfList | ListItemContainsBlock
 649		}
 650	}
 651
 652	if p.nesting != 0 {
 653		panic("Nesting level did not end at zero")
 654	}
 655}
 656
 657//
 658// Link references
 659//
 660// This section implements support for references that (usually) appear
 661// as footnotes in a document, and can be referenced anywhere in the document.
 662// The basic format is:
 663//
 664//    [1]: http://www.google.com/ "Google"
 665//    [2]: http://www.github.com/ "Github"
 666//
 667// Anywhere in the document, the reference can be linked by referring to its
 668// label, i.e., 1 and 2 in this example, as in:
 669//
 670//    This library is hosted on [Github][2], a git hosting site.
 671//
 672// Actual footnotes as specified in Pandoc and supported by some other Markdown
 673// libraries such as php-markdown are also taken care of. They look like this:
 674//
 675//    This sentence needs a bit of further explanation.[^note]
 676//
 677//    [^note]: This is the explanation.
 678//
 679// Footnotes should be placed at the end of the document in an ordered list.
 680// Inline footnotes such as:
 681//
 682//    Inline footnotes^[Not supported.] also exist.
 683//
 684// are not yet supported.
 685
 686// References are parsed and stored in this struct.
 687type reference struct {
 688	link     []byte
 689	title    []byte
 690	noteId   int // 0 if not a footnote ref
 691	hasBlock bool
 692	text     []byte
 693}
 694
 695func (r *reference) String() string {
 696	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}",
 697		r.link, r.title, r.text, r.noteId, r.hasBlock)
 698}
 699
 700// Check whether or not data starts with a reference link.
 701// If so, it is parsed and stored in the list of references
 702// (in the render struct).
 703// Returns the number of bytes to skip to move past it,
 704// or zero if the first line is not a reference.
 705func isReference(p *parser, data []byte, tabSize int) int {
 706	// up to 3 optional leading spaces
 707	if len(data) < 4 {
 708		return 0
 709	}
 710	i := 0
 711	for i < 3 && data[i] == ' ' {
 712		i++
 713	}
 714
 715	noteId := 0
 716
 717	// id part: anything but a newline between brackets
 718	if data[i] != '[' {
 719		return 0
 720	}
 721	i++
 722	if p.flags&Footnotes != 0 {
 723		if i < len(data) && data[i] == '^' {
 724			// we can set it to anything here because the proper noteIds will
 725			// be assigned later during the second pass. It just has to be != 0
 726			noteId = 1
 727			i++
 728		}
 729	}
 730	idOffset := i
 731	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
 732		i++
 733	}
 734	if i >= len(data) || data[i] != ']' {
 735		return 0
 736	}
 737	idEnd := i
 738
 739	// spacer: colon (space | tab)* newline? (space | tab)*
 740	i++
 741	if i >= len(data) || data[i] != ':' {
 742		return 0
 743	}
 744	i++
 745	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 746		i++
 747	}
 748	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
 749		i++
 750		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
 751			i++
 752		}
 753	}
 754	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 755		i++
 756	}
 757	if i >= len(data) {
 758		return 0
 759	}
 760
 761	var (
 762		linkOffset, linkEnd   int
 763		titleOffset, titleEnd int
 764		lineEnd               int
 765		raw                   []byte
 766		hasBlock              bool
 767	)
 768
 769	if p.flags&Footnotes != 0 && noteId != 0 {
 770		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
 771		lineEnd = linkEnd
 772	} else {
 773		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
 774	}
 775	if lineEnd == 0 {
 776		return 0
 777	}
 778
 779	// a valid ref has been found
 780
 781	ref := &reference{
 782		noteId:   noteId,
 783		hasBlock: hasBlock,
 784	}
 785
 786	if noteId > 0 {
 787		// reusing the link field for the id since footnotes don't have links
 788		ref.link = data[idOffset:idEnd]
 789		// if footnote, it's not really a title, it's the contained text
 790		ref.title = raw
 791	} else {
 792		ref.link = data[linkOffset:linkEnd]
 793		ref.title = data[titleOffset:titleEnd]
 794	}
 795
 796	// id matches are case-insensitive
 797	id := string(bytes.ToLower(data[idOffset:idEnd]))
 798
 799	p.refs[id] = ref
 800
 801	return lineEnd
 802}
 803
 804func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
 805	// link: whitespace-free sequence, optionally between angle brackets
 806	if data[i] == '<' {
 807		i++
 808	}
 809	linkOffset = i
 810	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
 811		i++
 812	}
 813	if i == len(data) {
 814		return
 815	}
 816	linkEnd = i
 817	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
 818		linkOffset++
 819		linkEnd--
 820	}
 821
 822	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
 823	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 824		i++
 825	}
 826	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
 827		return
 828	}
 829
 830	// compute end-of-line
 831	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
 832		lineEnd = i
 833	}
 834	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
 835		lineEnd++
 836	}
 837
 838	// optional (space|tab)* spacer after a newline
 839	if lineEnd > 0 {
 840		i = lineEnd + 1
 841		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 842			i++
 843		}
 844	}
 845
 846	// optional title: any non-newline sequence enclosed in '"() alone on its line
 847	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
 848		i++
 849		titleOffset = i
 850
 851		// look for EOL
 852		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
 853			i++
 854		}
 855		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
 856			titleEnd = i + 1
 857		} else {
 858			titleEnd = i
 859		}
 860
 861		// step back
 862		i--
 863		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
 864			i--
 865		}
 866		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
 867			lineEnd = titleEnd
 868			titleEnd = i
 869		}
 870	}
 871
 872	return
 873}
 874
 875// The first bit of this logic is the same as (*parser).listItem, but the rest
 876// is much simpler. This function simply finds the entire block and shifts it
 877// over by one tab if it is indeed a block (just returns the line if it's not).
 878// blockEnd is the end of the section in the input buffer, and contents is the
 879// extracted text that was shifted over one tab. It will need to be rendered at
 880// the end of the document.
 881func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
 882	if i == 0 || len(data) == 0 {
 883		return
 884	}
 885
 886	// skip leading whitespace on first line
 887	for i < len(data) && data[i] == ' ' {
 888		i++
 889	}
 890
 891	blockStart = i
 892
 893	// find the end of the line
 894	blockEnd = i
 895	for i < len(data) && data[i-1] != '\n' {
 896		i++
 897	}
 898
 899	// get working buffer
 900	var raw bytes.Buffer
 901
 902	// put the first line into the working buffer
 903	raw.Write(data[blockEnd:i])
 904	blockEnd = i
 905
 906	// process the following lines
 907	containsBlankLine := false
 908
 909gatherLines:
 910	for blockEnd < len(data) {
 911		i++
 912
 913		// find the end of this line
 914		for i < len(data) && data[i-1] != '\n' {
 915			i++
 916		}
 917
 918		// if it is an empty line, guess that it is part of this item
 919		// and move on to the next line
 920		if p.isEmpty(data[blockEnd:i]) > 0 {
 921			containsBlankLine = true
 922			blockEnd = i
 923			continue
 924		}
 925
 926		n := 0
 927		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
 928			// this is the end of the block.
 929			// we don't want to include this last line in the index.
 930			break gatherLines
 931		}
 932
 933		// if there were blank lines before this one, insert a new one now
 934		if containsBlankLine {
 935			raw.WriteByte('\n')
 936			containsBlankLine = false
 937		}
 938
 939		// get rid of that first tab, write to buffer
 940		raw.Write(data[blockEnd+n : i])
 941		hasBlock = true
 942
 943		blockEnd = i
 944	}
 945
 946	if data[blockEnd-1] != '\n' {
 947		raw.WriteByte('\n')
 948	}
 949
 950	contents = raw.Bytes()
 951
 952	return
 953}
 954
 955//
 956//
 957// Miscellaneous helper functions
 958//
 959//
 960
 961// Test if a character is a punctuation symbol.
 962// Taken from a private function in regexp in the stdlib.
 963func ispunct(c byte) bool {
 964	for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
 965		if c == r {
 966			return true
 967		}
 968	}
 969	return false
 970}
 971
 972// Test if a character is a whitespace character.
 973func isspace(c byte) bool {
 974	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
 975}
 976
 977// Test if a character is letter.
 978func isletter(c byte) bool {
 979	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 980}
 981
 982// Test if a character is a letter or a digit.
 983// TODO: check when this is looking for ASCII alnum and when it should use unicode
 984func isalnum(c byte) bool {
 985	return (c >= '0' && c <= '9') || isletter(c)
 986}
 987
 988// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
 989// always ends output with a newline
 990func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
 991	// first, check for common cases: no tabs, or only tabs at beginning of line
 992	i, prefix := 0, 0
 993	slowcase := false
 994	for i = 0; i < len(line); i++ {
 995		if line[i] == '\t' {
 996			if prefix == i {
 997				prefix++
 998			} else {
 999				slowcase = true
1000				break
1001			}
1002		}
1003	}
1004
1005	// no need to decode runes if all tabs are at the beginning of the line
1006	if !slowcase {
1007		for i = 0; i < prefix*tabSize; i++ {
1008			out.WriteByte(' ')
1009		}
1010		out.Write(line[prefix:])
1011		return
1012	}
1013
1014	// the slow case: we need to count runes to figure out how
1015	// many spaces to insert for each tab
1016	column := 0
1017	i = 0
1018	for i < len(line) {
1019		start := i
1020		for i < len(line) && line[i] != '\t' {
1021			_, size := utf8.DecodeRune(line[i:])
1022			i += size
1023			column++
1024		}
1025
1026		if i > start {
1027			out.Write(line[start:i])
1028		}
1029
1030		if i >= len(line) {
1031			break
1032		}
1033
1034		for {
1035			out.WriteByte(' ')
1036			column++
1037			if column%tabSize == 0 {
1038				break
1039			}
1040		}
1041
1042		i++
1043	}
1044}
1045
1046// Find if a line counts as indented or not.
1047// Returns number of characters the indent is (0 = not indented).
1048func isIndented(data []byte, indentSize int) int {
1049	if len(data) == 0 {
1050		return 0
1051	}
1052	if data[0] == '\t' {
1053		return 1
1054	}
1055	if len(data) < indentSize {
1056		return 0
1057	}
1058	for i := 0; i < indentSize; i++ {
1059		if data[i] != ' ' {
1060			return 0
1061		}
1062	}
1063	return indentSize
1064}
1065
1066// Create a url-safe slug for fragments
1067func slugify(in []byte) []byte {
1068	if len(in) == 0 {
1069		return in
1070	}
1071	out := make([]byte, 0, len(in))
1072	sym := false
1073
1074	for _, ch := range in {
1075		if isalnum(ch) {
1076			sym = false
1077			out = append(out, ch)
1078		} else if sym {
1079			continue
1080		} else {
1081			out = append(out, '-')
1082			sym = true
1083		}
1084	}
1085	var a, b int
1086	var ch byte
1087	for a, ch = range out {
1088		if ch != '-' {
1089			break
1090		}
1091	}
1092	for b = len(out) - 1; b > 0; b-- {
1093		if out[b] != '-' {
1094			break
1095		}
1096	}
1097	return out[a : b+1]
1098}