all repos — grayfriday @ 607478a8ce5e5da6d7ab19469b507912ef20eccd

blackfriday fork with a few changes

markdown.go (view raw)

   1//
   2// Blackfriday Markdown Processor
   3// Available at http://github.com/russross/blackfriday
   4//
   5// Copyright © 2011 Russ Ross <russ@russross.com>.
   6// Distributed under the Simplified BSD License.
   7// See README.md for details.
   8//
   9
  10//
  11//
  12// Markdown parsing and processing
  13//
  14//
  15
  16// Blackfriday markdown processor.
  17//
  18// Translates plain text with simple formatting rules into HTML or LaTeX.
  19package blackfriday
  20
  21import (
  22	"bytes"
  23	"fmt"
  24	"strings"
  25	"unicode/utf8"
  26)
  27
  28const VERSION = "1.4"
  29
  30type Extensions int
  31
  32// These are the supported markdown parsing extensions.
  33// OR these values together to select multiple extensions.
  34const (
  35	NoExtensions            Extensions = 0
  36	NoIntraEmphasis         Extensions = 1 << iota // Ignore emphasis markers inside words
  37	Tables                                         // Render tables
  38	FencedCode                                     // Render fenced code blocks
  39	Autolink                                       // Detect embedded URLs that are not explicitly marked
  40	Strikethrough                                  // Strikethrough text using ~~test~~
  41	LaxHTMLBlocks                                  // Loosen up HTML block parsing rules
  42	SpaceHeaders                                   // Be strict about prefix header rules
  43	HardLineBreak                                  // Translate newlines into line breaks
  44	TabSizeEight                                   // Expand tabs to eight spaces instead of four
  45	Footnotes                                      // Pandoc-style footnotes
  46	NoEmptyLineBeforeBlock                         // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
  47	HeaderIDs                                      // specify header IDs  with {#id}
  48	Titleblock                                     // Titleblock ala pandoc
  49	AutoHeaderIDs                                  // Create the header ID from the text
  50	BackslashLineBreak                             // Translate trailing backslashes into line breaks
  51	DefinitionLists                                // Render definition lists
  52	Smartypants                                    // Enable smart punctuation substitutions
  53	SmartypantsFractions                           // Enable smart fractions (with Smartypants)
  54	SmartypantsDashes                              // Enable smart dashes (with Smartypants)
  55	SmartypantsLatexDashes                         // Enable LaTeX-style dashes (with Smartypants)
  56	SmartypantsAngledQuotes                        // Enable angled double quotes (with Smartypants) for double quotes rendering
  57	TOC                                            // Generate a table of contents
  58	OmitContents                                   // Skip the main contents (for a standalone table of contents)
  59
  60	CommonHtmlFlags HTMLFlags = UseXHTML
  61
  62	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
  63		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
  64		BackslashLineBreak | DefinitionLists | Smartypants |
  65		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
  66)
  67
  68var DefaultOptions = Options{
  69	Extensions: CommonExtensions,
  70}
  71
  72type LinkType int
  73
  74// These are the possible flag values for the link renderer.
  75// Only a single one of these values will be used; they are not ORed together.
  76// These are mostly of interest if you are writing a new output format.
  77const (
  78	LinkTypeNotAutolink LinkType = iota
  79	LinkTypeNormal
  80	LinkTypeEmail
  81)
  82
  83type ListType int
  84
  85// These are the possible flag values for the ListItem renderer.
  86// Multiple flag values may be ORed together.
  87// These are mostly of interest if you are writing a new output format.
  88const (
  89	ListTypeOrdered ListType = 1 << iota
  90	ListTypeDefinition
  91	ListTypeTerm
  92
  93	ListItemContainsBlock
  94	ListItemBeginningOfList
  95	ListItemEndOfList
  96)
  97
  98type CellAlignFlags int
  99
 100// These are the possible flag values for the table cell renderer.
 101// Only a single one of these values will be used; they are not ORed together.
 102// These are mostly of interest if you are writing a new output format.
 103const (
 104	TableAlignmentLeft = 1 << iota
 105	TableAlignmentRight
 106	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
 107)
 108
 109// The size of a tab stop.
 110const (
 111	TabSizeDefault = 4
 112	TabSizeDouble  = 8
 113)
 114
 115// blockTags is a set of tags that are recognized as HTML block tags.
 116// Any of these can be included in markdown text without special escaping.
 117var blockTags = map[string]struct{}{
 118	"blockquote": struct{}{},
 119	"del":        struct{}{},
 120	"div":        struct{}{},
 121	"dl":         struct{}{},
 122	"fieldset":   struct{}{},
 123	"form":       struct{}{},
 124	"h1":         struct{}{},
 125	"h2":         struct{}{},
 126	"h3":         struct{}{},
 127	"h4":         struct{}{},
 128	"h5":         struct{}{},
 129	"h6":         struct{}{},
 130	"iframe":     struct{}{},
 131	"ins":        struct{}{},
 132	"math":       struct{}{},
 133	"noscript":   struct{}{},
 134	"ol":         struct{}{},
 135	"pre":        struct{}{},
 136	"p":          struct{}{},
 137	"script":     struct{}{},
 138	"style":      struct{}{},
 139	"table":      struct{}{},
 140	"ul":         struct{}{},
 141
 142	// HTML5
 143	"address":    struct{}{},
 144	"article":    struct{}{},
 145	"aside":      struct{}{},
 146	"canvas":     struct{}{},
 147	"figcaption": struct{}{},
 148	"figure":     struct{}{},
 149	"footer":     struct{}{},
 150	"header":     struct{}{},
 151	"hgroup":     struct{}{},
 152	"main":       struct{}{},
 153	"nav":        struct{}{},
 154	"output":     struct{}{},
 155	"progress":   struct{}{},
 156	"section":    struct{}{},
 157	"video":      struct{}{},
 158}
 159
 160// Renderer is the rendering interface.
 161// This is mostly of interest if you are implementing a new rendering format.
 162//
 163// When a byte slice is provided, it contains the (rendered) contents of the
 164// element.
 165//
 166// When a callback is provided instead, it will write the contents of the
 167// respective element directly to the output buffer and return true on success.
 168// If the callback returns false, the rendering function should reset the
 169// output buffer as though it had never been called.
 170//
 171// Currently Html and Latex implementations are provided
 172type Renderer interface {
 173	// block-level callbacks
 174	BlockCode(text []byte, lang string)
 175	BlockQuote(text []byte)
 176	BlockHtml(text []byte)
 177	BeginHeader(level int, id string)
 178	EndHeader(level int, id string, header []byte)
 179	HRule()
 180	BeginList(flags ListType)
 181	EndList(flags ListType)
 182	ListItem(text []byte, flags ListType)
 183	BeginParagraph()
 184	EndParagraph()
 185	Table(header []byte, body []byte, columnData []CellAlignFlags)
 186	TableRow(text []byte)
 187	TableHeaderCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
 188	TableCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
 189	BeginFootnotes()
 190	EndFootnotes()
 191	FootnoteItem(name, text []byte, flags ListType)
 192	TitleBlock(text []byte)
 193
 194	// Span-level callbacks
 195	AutoLink(link []byte, kind LinkType)
 196	CodeSpan(text []byte)
 197	DoubleEmphasis(text []byte)
 198	Emphasis(text []byte)
 199	Image(link []byte, title []byte, alt []byte)
 200	LineBreak()
 201	Link(link []byte, title []byte, content []byte)
 202	RawHtmlTag(tag []byte)
 203	TripleEmphasis(text []byte)
 204	StrikeThrough(text []byte)
 205	FootnoteRef(ref []byte, id int)
 206
 207	// Low-level callbacks
 208	Entity(entity []byte)
 209	NormalText(text []byte)
 210
 211	// Header and footer
 212	DocumentHeader()
 213	DocumentFooter()
 214
 215	Write(b []byte) (int, error)
 216
 217	Render(ast *Node) []byte
 218}
 219
 220// Callback functions for inline parsing. One such function is defined
 221// for each character that triggers a response when parsing inline data.
 222type inlineParser func(p *parser, data []byte, offset int) int
 223
 224// Parser holds runtime state used by the parser.
 225// This is constructed by the Markdown function.
 226type parser struct {
 227	refOverride    ReferenceOverrideFunc
 228	refs           map[string]*reference
 229	inlineCallback [256]inlineParser
 230	flags          Extensions
 231	nesting        int
 232	maxNesting     int
 233	insideLink     bool
 234
 235	// Footnotes need to be ordered as well as available to quickly check for
 236	// presence. If a ref is also a footnote, it's stored both in refs and here
 237	// in notes. Slice is nil if footnotes not enabled.
 238	notes []*reference
 239
 240	doc                  *Node
 241	tip                  *Node // = doc
 242	oldTip               *Node
 243	lastMatchedContainer *Node // = doc
 244	allClosed            bool
 245	currBlock            *Node // a block node currently being parsed by inline parser
 246}
 247
 248func (p *parser) getRef(refid string) (ref *reference, found bool) {
 249	if p.refOverride != nil {
 250		r, overridden := p.refOverride(refid)
 251		if overridden {
 252			if r == nil {
 253				return nil, false
 254			}
 255			return &reference{
 256				link:     []byte(r.Link),
 257				title:    []byte(r.Title),
 258				noteId:   0,
 259				hasBlock: false,
 260				text:     []byte(r.Text)}, true
 261		}
 262	}
 263	// refs are case insensitive
 264	ref, found = p.refs[strings.ToLower(refid)]
 265	return ref, found
 266}
 267
 268func (p *parser) finalize(block *Node) {
 269	above := block.Parent
 270	block.open = false
 271	p.tip = above
 272}
 273
 274func (p *parser) addChild(node NodeType, offset uint32) *Node {
 275	for !p.tip.canContain(node) {
 276		p.finalize(p.tip)
 277	}
 278	newNode := NewNode(node)
 279	newNode.content = []byte{}
 280	p.tip.appendChild(newNode)
 281	p.tip = newNode
 282	return newNode
 283}
 284
 285func (p *parser) closeUnmatchedBlocks() {
 286	if !p.allClosed {
 287		for p.oldTip != p.lastMatchedContainer {
 288			parent := p.oldTip.Parent
 289			p.finalize(p.oldTip)
 290			p.oldTip = parent
 291		}
 292		p.allClosed = true
 293	}
 294}
 295
 296//
 297//
 298// Public interface
 299//
 300//
 301
 302// Reference represents the details of a link.
 303// See the documentation in Options for more details on use-case.
 304type Reference struct {
 305	// Link is usually the URL the reference points to.
 306	Link string
 307	// Title is the alternate text describing the link in more detail.
 308	Title string
 309	// Text is the optional text to override the ref with if the syntax used was
 310	// [refid][]
 311	Text string
 312}
 313
 314// ReferenceOverrideFunc is expected to be called with a reference string and
 315// return either a valid Reference type that the reference string maps to or
 316// nil. If overridden is false, the default reference logic will be executed.
 317// See the documentation in Options for more details on use-case.
 318type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
 319
 320// Options represents configurable overrides and callbacks (in addition to the
 321// extension flag set) for configuring a Markdown parse.
 322type Options struct {
 323	// Extensions is a flag set of bit-wise ORed extension bits. See the
 324	// Extensions flags defined in this package.
 325	Extensions Extensions
 326
 327	// ReferenceOverride is an optional function callback that is called every
 328	// time a reference is resolved.
 329	//
 330	// In Markdown, the link reference syntax can be made to resolve a link to
 331	// a reference instead of an inline URL, in one of the following ways:
 332	//
 333	//  * [link text][refid]
 334	//  * [refid][]
 335	//
 336	// Usually, the refid is defined at the bottom of the Markdown document. If
 337	// this override function is provided, the refid is passed to the override
 338	// function first, before consulting the defined refids at the bottom. If
 339	// the override function indicates an override did not occur, the refids at
 340	// the bottom will be used to fill in the link details.
 341	ReferenceOverride ReferenceOverrideFunc
 342}
 343
 344// MarkdownBasic is a convenience function for simple rendering.
 345// It processes markdown input with no extensions enabled.
 346func MarkdownBasic(input []byte) []byte {
 347	// set up the HTML renderer
 348	htmlFlags := UseXHTML
 349	renderer := HTMLRenderer(htmlFlags, CommonExtensions, "", "")
 350
 351	// set up the parser
 352	return MarkdownOptions(input, renderer, Options{Extensions: 0})
 353}
 354
 355// Call Markdown with most useful extensions enabled
 356// MarkdownCommon is a convenience function for simple rendering.
 357// It processes markdown input with common extensions enabled, including:
 358//
 359// * Smartypants processing with smart fractions and LaTeX dashes
 360//
 361// * Intra-word emphasis suppression
 362//
 363// * Tables
 364//
 365// * Fenced code blocks
 366//
 367// * Autolinking
 368//
 369// * Strikethrough support
 370//
 371// * Strict header parsing
 372//
 373// * Custom Header IDs
 374func MarkdownCommon(input []byte) []byte {
 375	// set up the HTML renderer
 376	renderer := HTMLRenderer(CommonHtmlFlags, CommonExtensions, "", "")
 377	return MarkdownOptions(input, renderer, DefaultOptions)
 378}
 379
 380// Markdown is the main rendering function.
 381// It parses and renders a block of markdown-encoded text.
 382// The supplied Renderer is used to format the output, and extensions dictates
 383// which non-standard extensions are enabled.
 384//
 385// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
 386// LatexRenderer, respectively.
 387func Markdown(input []byte, renderer Renderer, extensions Extensions) []byte {
 388	return MarkdownOptions(input, renderer, Options{
 389		Extensions: extensions})
 390}
 391
 392// MarkdownOptions is just like Markdown but takes additional options through
 393// the Options struct.
 394func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
 395	// no point in parsing if we can't render
 396	if renderer == nil {
 397		return nil
 398	}
 399	return renderer.Render(Parse(input, opts))
 400}
 401
 402func Parse(input []byte, opts Options) *Node {
 403	extensions := opts.Extensions
 404
 405	// fill in the render structure
 406	p := new(parser)
 407	p.flags = extensions
 408	p.refOverride = opts.ReferenceOverride
 409	p.refs = make(map[string]*reference)
 410	p.maxNesting = 16
 411	p.insideLink = false
 412
 413	docNode := NewNode(Document)
 414	p.doc = docNode
 415	p.tip = docNode
 416	p.oldTip = docNode
 417	p.lastMatchedContainer = docNode
 418	p.allClosed = true
 419
 420	// register inline parsers
 421	p.inlineCallback['*'] = emphasis
 422	p.inlineCallback['_'] = emphasis
 423	if extensions&Strikethrough != 0 {
 424		p.inlineCallback['~'] = emphasis
 425	}
 426	p.inlineCallback['`'] = codeSpan
 427	p.inlineCallback['\n'] = lineBreak
 428	p.inlineCallback['['] = link
 429	p.inlineCallback['<'] = leftAngle
 430	p.inlineCallback['\\'] = escape
 431	p.inlineCallback['&'] = entity
 432	p.inlineCallback['!'] = maybeImage
 433	p.inlineCallback['^'] = maybeInlineFootnote
 434
 435	if extensions&Autolink != 0 {
 436		p.inlineCallback['h'] = maybeAutoLink
 437		p.inlineCallback['m'] = maybeAutoLink
 438		p.inlineCallback['f'] = maybeAutoLink
 439		p.inlineCallback['H'] = maybeAutoLink
 440		p.inlineCallback['M'] = maybeAutoLink
 441		p.inlineCallback['F'] = maybeAutoLink
 442	}
 443
 444	if extensions&Footnotes != 0 {
 445		p.notes = make([]*reference, 0)
 446	}
 447
 448	first := firstPass(p, input)
 449	secondPass(p, first)
 450	// Walk the tree and finish up some of unfinished blocks
 451	for p.tip != nil {
 452		p.finalize(p.tip)
 453	}
 454	// Walk the tree again and process inline markdown in each block
 455	p.doc.Walk(func(node *Node, entering bool) {
 456		if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
 457			p.currBlock = node
 458			p.inline(node.content)
 459			node.content = nil
 460		}
 461	})
 462	p.parseRefsToAST()
 463	p.generateTOC()
 464	return p.doc
 465}
 466
 467func (p *parser) generateTOC() {
 468	if p.flags&TOC == 0 && p.flags&OmitContents == 0 {
 469		return
 470	}
 471	navNode := NewNode(HTMLBlock)
 472	navNode.Literal = []byte("<nav>")
 473	navNode.open = false
 474
 475	var topList *Node
 476	var listNode *Node
 477	var lastItem *Node
 478	headerCount := 0
 479	var currentLevel uint32
 480	p.doc.Walk(func(node *Node, entering bool) {
 481		if entering && node.Type == Header {
 482			if node.Level > currentLevel {
 483				currentLevel++
 484				newList := NewNode(List)
 485				if lastItem != nil {
 486					lastItem.appendChild(newList)
 487					listNode = newList
 488				} else {
 489					listNode = newList
 490					topList = listNode
 491				}
 492			}
 493			if node.Level < currentLevel {
 494				finalizeList(listNode)
 495				lastItem = listNode.Parent
 496				listNode = lastItem.Parent
 497			}
 498			node.HeaderID = fmt.Sprintf("toc_%d", headerCount)
 499			headerCount++
 500			lastItem = NewNode(Item)
 501			listNode.appendChild(lastItem)
 502			anchorNode := NewNode(Link)
 503			anchorNode.Destination = []byte("#" + node.HeaderID)
 504			lastItem.appendChild(anchorNode)
 505			anchorNode.appendChild(text(node.FirstChild.Literal))
 506		}
 507	})
 508	firstChild := p.doc.FirstChild
 509	// Insert TOC only if there is anything to insert
 510	if topList != nil {
 511		finalizeList(topList)
 512		firstChild.insertBefore(navNode)
 513		firstChild.insertBefore(topList)
 514		navCloseNode := NewNode(HTMLBlock)
 515		navCloseNode.Literal = []byte("</nav>")
 516		navCloseNode.open = false
 517		firstChild.insertBefore(navCloseNode)
 518	}
 519	// Drop everything after the TOC if OmitContents was requested
 520	if p.flags&OmitContents != 0 {
 521		for firstChild != nil {
 522			next := firstChild.Next
 523			firstChild.unlink()
 524			firstChild = next
 525		}
 526	}
 527}
 528
 529func (p *parser) parseRefsToAST() {
 530	if p.flags&Footnotes == 0 || len(p.notes) == 0 {
 531		return
 532	}
 533	p.tip = p.doc
 534	finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte(`<div class="footnotes">`)))
 535	p.addBlock(HorizontalRule, nil)
 536	block := p.addBlock(List, nil)
 537	block.ListFlags = ListTypeOrdered
 538	flags := ListItemBeginningOfList
 539	// Note: this loop is intentionally explicit, not range-form. This is
 540	// because the body of the loop will append nested footnotes to p.notes and
 541	// we need to process those late additions. Range form would only walk over
 542	// the fixed initial set.
 543	for i := 0; i < len(p.notes); i++ {
 544		ref := p.notes[i]
 545		block := p.addBlock(Item, nil)
 546		block.ListFlags = ListTypeOrdered
 547		block.RefLink = ref.link
 548		if ref.hasBlock {
 549			flags |= ListItemContainsBlock
 550			p.block(ref.title)
 551		} else {
 552			p.currBlock = block
 553			p.inline(ref.title)
 554		}
 555		flags &^= ListItemBeginningOfList | ListItemContainsBlock
 556	}
 557	above := block.Parent
 558	finalizeList(block)
 559	p.tip = above
 560	finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte("</div>")))
 561	block.Walk(func(node *Node, entering bool) {
 562		if node.Type == Paragraph || node.Type == Header {
 563			p.currBlock = node
 564			p.inline(node.content)
 565			node.content = nil
 566		}
 567	})
 568}
 569
 570// first pass:
 571// - extract references
 572// - expand tabs
 573// - normalize newlines
 574// - copy everything else
 575func firstPass(p *parser, input []byte) []byte {
 576	var out bytes.Buffer
 577	tabSize := TabSizeDefault
 578	if p.flags&TabSizeEight != 0 {
 579		tabSize = TabSizeDouble
 580	}
 581	beg, end := 0, 0
 582	lastFencedCodeBlockEnd := 0
 583	for beg < len(input) { // iterate over lines
 584		if end = isReference(p, input[beg:], tabSize); end > 0 {
 585			beg += end
 586		} else { // skip to the next line
 587			end = beg
 588			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
 589				end++
 590			}
 591
 592			if p.flags&FencedCode != 0 {
 593				// track fenced code block boundaries to suppress tab expansion
 594				// inside them:
 595				if beg >= lastFencedCodeBlockEnd {
 596					if i := p.fencedCode(input[beg:], false); i > 0 {
 597						lastFencedCodeBlockEnd = beg + i
 598					}
 599				}
 600			}
 601
 602			// add the line body if present
 603			if end > beg {
 604				if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
 605					out.Write(input[beg:end])
 606				} else {
 607					expandTabs(&out, input[beg:end], tabSize)
 608				}
 609			}
 610			out.WriteByte('\n')
 611
 612			if end < len(input) && input[end] == '\r' {
 613				end++
 614			}
 615			if end < len(input) && input[end] == '\n' {
 616				end++
 617			}
 618
 619			beg = end
 620		}
 621	}
 622
 623	// empty input?
 624	if out.Len() == 0 {
 625		out.WriteByte('\n')
 626	}
 627
 628	return out.Bytes()
 629}
 630
 631// second pass: actual rendering
 632func secondPass(p *parser, input []byte) {
 633	p.block(input)
 634
 635	if p.flags&Footnotes != 0 && len(p.notes) > 0 {
 636		flags := ListItemBeginningOfList
 637		for i := 0; i < len(p.notes); i += 1 {
 638			ref := p.notes[i]
 639			if ref.hasBlock {
 640				flags |= ListItemContainsBlock
 641				p.block(ref.title)
 642			} else {
 643				p.inline(ref.title)
 644			}
 645			flags &^= ListItemBeginningOfList | ListItemContainsBlock
 646		}
 647	}
 648
 649	if p.nesting != 0 {
 650		panic("Nesting level did not end at zero")
 651	}
 652}
 653
 654//
 655// Link references
 656//
 657// This section implements support for references that (usually) appear
 658// as footnotes in a document, and can be referenced anywhere in the document.
 659// The basic format is:
 660//
 661//    [1]: http://www.google.com/ "Google"
 662//    [2]: http://www.github.com/ "Github"
 663//
 664// Anywhere in the document, the reference can be linked by referring to its
 665// label, i.e., 1 and 2 in this example, as in:
 666//
 667//    This library is hosted on [Github][2], a git hosting site.
 668//
 669// Actual footnotes as specified in Pandoc and supported by some other Markdown
 670// libraries such as php-markdown are also taken care of. They look like this:
 671//
 672//    This sentence needs a bit of further explanation.[^note]
 673//
 674//    [^note]: This is the explanation.
 675//
 676// Footnotes should be placed at the end of the document in an ordered list.
 677// Inline footnotes such as:
 678//
 679//    Inline footnotes^[Not supported.] also exist.
 680//
 681// are not yet supported.
 682
 683// References are parsed and stored in this struct.
 684type reference struct {
 685	link     []byte
 686	title    []byte
 687	noteId   int // 0 if not a footnote ref
 688	hasBlock bool
 689	text     []byte
 690}
 691
 692func (r *reference) String() string {
 693	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}",
 694		r.link, r.title, r.text, r.noteId, r.hasBlock)
 695}
 696
 697// Check whether or not data starts with a reference link.
 698// If so, it is parsed and stored in the list of references
 699// (in the render struct).
 700// Returns the number of bytes to skip to move past it,
 701// or zero if the first line is not a reference.
 702func isReference(p *parser, data []byte, tabSize int) int {
 703	// up to 3 optional leading spaces
 704	if len(data) < 4 {
 705		return 0
 706	}
 707	i := 0
 708	for i < 3 && data[i] == ' ' {
 709		i++
 710	}
 711
 712	noteId := 0
 713
 714	// id part: anything but a newline between brackets
 715	if data[i] != '[' {
 716		return 0
 717	}
 718	i++
 719	if p.flags&Footnotes != 0 {
 720		if i < len(data) && data[i] == '^' {
 721			// we can set it to anything here because the proper noteIds will
 722			// be assigned later during the second pass. It just has to be != 0
 723			noteId = 1
 724			i++
 725		}
 726	}
 727	idOffset := i
 728	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
 729		i++
 730	}
 731	if i >= len(data) || data[i] != ']' {
 732		return 0
 733	}
 734	idEnd := i
 735
 736	// spacer: colon (space | tab)* newline? (space | tab)*
 737	i++
 738	if i >= len(data) || data[i] != ':' {
 739		return 0
 740	}
 741	i++
 742	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 743		i++
 744	}
 745	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
 746		i++
 747		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
 748			i++
 749		}
 750	}
 751	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 752		i++
 753	}
 754	if i >= len(data) {
 755		return 0
 756	}
 757
 758	var (
 759		linkOffset, linkEnd   int
 760		titleOffset, titleEnd int
 761		lineEnd               int
 762		raw                   []byte
 763		hasBlock              bool
 764	)
 765
 766	if p.flags&Footnotes != 0 && noteId != 0 {
 767		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
 768		lineEnd = linkEnd
 769	} else {
 770		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
 771	}
 772	if lineEnd == 0 {
 773		return 0
 774	}
 775
 776	// a valid ref has been found
 777
 778	ref := &reference{
 779		noteId:   noteId,
 780		hasBlock: hasBlock,
 781	}
 782
 783	if noteId > 0 {
 784		// reusing the link field for the id since footnotes don't have links
 785		ref.link = data[idOffset:idEnd]
 786		// if footnote, it's not really a title, it's the contained text
 787		ref.title = raw
 788	} else {
 789		ref.link = data[linkOffset:linkEnd]
 790		ref.title = data[titleOffset:titleEnd]
 791	}
 792
 793	// id matches are case-insensitive
 794	id := string(bytes.ToLower(data[idOffset:idEnd]))
 795
 796	p.refs[id] = ref
 797
 798	return lineEnd
 799}
 800
 801func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
 802	// link: whitespace-free sequence, optionally between angle brackets
 803	if data[i] == '<' {
 804		i++
 805	}
 806	linkOffset = i
 807	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
 808		i++
 809	}
 810	if i == len(data) {
 811		return
 812	}
 813	linkEnd = i
 814	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
 815		linkOffset++
 816		linkEnd--
 817	}
 818
 819	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
 820	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 821		i++
 822	}
 823	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
 824		return
 825	}
 826
 827	// compute end-of-line
 828	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
 829		lineEnd = i
 830	}
 831	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
 832		lineEnd++
 833	}
 834
 835	// optional (space|tab)* spacer after a newline
 836	if lineEnd > 0 {
 837		i = lineEnd + 1
 838		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 839			i++
 840		}
 841	}
 842
 843	// optional title: any non-newline sequence enclosed in '"() alone on its line
 844	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
 845		i++
 846		titleOffset = i
 847
 848		// look for EOL
 849		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
 850			i++
 851		}
 852		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
 853			titleEnd = i + 1
 854		} else {
 855			titleEnd = i
 856		}
 857
 858		// step back
 859		i--
 860		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
 861			i--
 862		}
 863		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
 864			lineEnd = titleEnd
 865			titleEnd = i
 866		}
 867	}
 868
 869	return
 870}
 871
 872// The first bit of this logic is the same as (*parser).listItem, but the rest
 873// is much simpler. This function simply finds the entire block and shifts it
 874// over by one tab if it is indeed a block (just returns the line if it's not).
 875// blockEnd is the end of the section in the input buffer, and contents is the
 876// extracted text that was shifted over one tab. It will need to be rendered at
 877// the end of the document.
 878func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
 879	if i == 0 || len(data) == 0 {
 880		return
 881	}
 882
 883	// skip leading whitespace on first line
 884	for i < len(data) && data[i] == ' ' {
 885		i++
 886	}
 887
 888	blockStart = i
 889
 890	// find the end of the line
 891	blockEnd = i
 892	for i < len(data) && data[i-1] != '\n' {
 893		i++
 894	}
 895
 896	// get working buffer
 897	var raw bytes.Buffer
 898
 899	// put the first line into the working buffer
 900	raw.Write(data[blockEnd:i])
 901	blockEnd = i
 902
 903	// process the following lines
 904	containsBlankLine := false
 905
 906gatherLines:
 907	for blockEnd < len(data) {
 908		i++
 909
 910		// find the end of this line
 911		for i < len(data) && data[i-1] != '\n' {
 912			i++
 913		}
 914
 915		// if it is an empty line, guess that it is part of this item
 916		// and move on to the next line
 917		if p.isEmpty(data[blockEnd:i]) > 0 {
 918			containsBlankLine = true
 919			blockEnd = i
 920			continue
 921		}
 922
 923		n := 0
 924		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
 925			// this is the end of the block.
 926			// we don't want to include this last line in the index.
 927			break gatherLines
 928		}
 929
 930		// if there were blank lines before this one, insert a new one now
 931		if containsBlankLine {
 932			raw.WriteByte('\n')
 933			containsBlankLine = false
 934		}
 935
 936		// get rid of that first tab, write to buffer
 937		raw.Write(data[blockEnd+n : i])
 938		hasBlock = true
 939
 940		blockEnd = i
 941	}
 942
 943	if data[blockEnd-1] != '\n' {
 944		raw.WriteByte('\n')
 945	}
 946
 947	contents = raw.Bytes()
 948
 949	return
 950}
 951
 952//
 953//
 954// Miscellaneous helper functions
 955//
 956//
 957
 958// Test if a character is a punctuation symbol.
 959// Taken from a private function in regexp in the stdlib.
 960func ispunct(c byte) bool {
 961	for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
 962		if c == r {
 963			return true
 964		}
 965	}
 966	return false
 967}
 968
 969// Test if a character is a whitespace character.
 970func isspace(c byte) bool {
 971	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
 972}
 973
 974// Test if a character is letter.
 975func isletter(c byte) bool {
 976	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 977}
 978
 979// Test if a character is a letter or a digit.
 980// TODO: check when this is looking for ASCII alnum and when it should use unicode
 981func isalnum(c byte) bool {
 982	return (c >= '0' && c <= '9') || isletter(c)
 983}
 984
 985// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
 986// always ends output with a newline
 987func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
 988	// first, check for common cases: no tabs, or only tabs at beginning of line
 989	i, prefix := 0, 0
 990	slowcase := false
 991	for i = 0; i < len(line); i++ {
 992		if line[i] == '\t' {
 993			if prefix == i {
 994				prefix++
 995			} else {
 996				slowcase = true
 997				break
 998			}
 999		}
1000	}
1001
1002	// no need to decode runes if all tabs are at the beginning of the line
1003	if !slowcase {
1004		for i = 0; i < prefix*tabSize; i++ {
1005			out.WriteByte(' ')
1006		}
1007		out.Write(line[prefix:])
1008		return
1009	}
1010
1011	// the slow case: we need to count runes to figure out how
1012	// many spaces to insert for each tab
1013	column := 0
1014	i = 0
1015	for i < len(line) {
1016		start := i
1017		for i < len(line) && line[i] != '\t' {
1018			_, size := utf8.DecodeRune(line[i:])
1019			i += size
1020			column++
1021		}
1022
1023		if i > start {
1024			out.Write(line[start:i])
1025		}
1026
1027		if i >= len(line) {
1028			break
1029		}
1030
1031		for {
1032			out.WriteByte(' ')
1033			column++
1034			if column%tabSize == 0 {
1035				break
1036			}
1037		}
1038
1039		i++
1040	}
1041}
1042
1043// Find if a line counts as indented or not.
1044// Returns number of characters the indent is (0 = not indented).
1045func isIndented(data []byte, indentSize int) int {
1046	if len(data) == 0 {
1047		return 0
1048	}
1049	if data[0] == '\t' {
1050		return 1
1051	}
1052	if len(data) < indentSize {
1053		return 0
1054	}
1055	for i := 0; i < indentSize; i++ {
1056		if data[i] != ' ' {
1057			return 0
1058		}
1059	}
1060	return indentSize
1061}
1062
1063// Create a url-safe slug for fragments
1064func slugify(in []byte) []byte {
1065	if len(in) == 0 {
1066		return in
1067	}
1068	out := make([]byte, 0, len(in))
1069	sym := false
1070
1071	for _, ch := range in {
1072		if isalnum(ch) {
1073			sym = false
1074			out = append(out, ch)
1075		} else if sym {
1076			continue
1077		} else {
1078			out = append(out, '-')
1079			sym = true
1080		}
1081	}
1082	var a, b int
1083	var ch byte
1084	for a, ch = range out {
1085		if ch != '-' {
1086			break
1087		}
1088	}
1089	for b = len(out) - 1; b > 0; b-- {
1090		if out[b] != '-' {
1091			break
1092		}
1093	}
1094	return out[a : b+1]
1095}