all repos — grayfriday @ c61b63f42c03b4ab7d30a79fd661c242202cf6d0

blackfriday fork with a few changes

markdown.go (view raw)

   1//
   2// Blackfriday Markdown Processor
   3// Available at http://github.com/russross/blackfriday
   4//
   5// Copyright © 2011 Russ Ross <russ@russross.com>.
   6// Distributed under the Simplified BSD License.
   7// See README.md for details.
   8//
   9
  10//
  11//
  12// Markdown parsing and processing
  13//
  14//
  15
  16// Package blackfriday is a markdown processor.
  17//
  18// Translates plain text with simple formatting rules into HTML or LaTeX.
  19package blackfriday
  20
  21import (
  22	"bytes"
  23	"fmt"
  24	"io"
  25	"strings"
  26	"unicode/utf8"
  27)
  28
  29// Version string of the package.
  30const Version = "2.0"
  31
  32// Extensions is a bitwise or'ed collection of enabled Blackfriday's
  33// extensions.
  34type Extensions int
  35
  36// These are the supported markdown parsing extensions.
  37// OR these values together to select multiple extensions.
  38const (
  39	NoExtensions            Extensions = 0
  40	NoIntraEmphasis         Extensions = 1 << iota // Ignore emphasis markers inside words
  41	Tables                                         // Render tables
  42	FencedCode                                     // Render fenced code blocks
  43	Autolink                                       // Detect embedded URLs that are not explicitly marked
  44	Strikethrough                                  // Strikethrough text using ~~test~~
  45	LaxHTMLBlocks                                  // Loosen up HTML block parsing rules
  46	SpaceHeaders                                   // Be strict about prefix header rules
  47	HardLineBreak                                  // Translate newlines into line breaks
  48	TabSizeEight                                   // Expand tabs to eight spaces instead of four
  49	Footnotes                                      // Pandoc-style footnotes
  50	NoEmptyLineBeforeBlock                         // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
  51	HeaderIDs                                      // specify header IDs  with {#id}
  52	Titleblock                                     // Titleblock ala pandoc
  53	AutoHeaderIDs                                  // Create the header ID from the text
  54	BackslashLineBreak                             // Translate trailing backslashes into line breaks
  55	DefinitionLists                                // Render definition lists
  56	Smartypants                                    // Enable smart punctuation substitutions
  57	SmartypantsFractions                           // Enable smart fractions (with Smartypants)
  58	SmartypantsDashes                              // Enable smart dashes (with Smartypants)
  59	SmartypantsLatexDashes                         // Enable LaTeX-style dashes (with Smartypants)
  60	SmartypantsAngledQuotes                        // Enable angled double quotes (with Smartypants) for double quotes rendering
  61	TOC                                            // Generate a table of contents
  62	OmitContents                                   // Skip the main contents (for a standalone table of contents)
  63
  64	CommonHTMLFlags HTMLFlags = UseXHTML
  65
  66	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
  67		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
  68		BackslashLineBreak | DefinitionLists | Smartypants |
  69		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
  70)
  71
  72// DefaultOptions is a convenience variable with all the options that are
  73// enabled by default.
  74var DefaultOptions = Options{
  75	Extensions: CommonExtensions,
  76}
  77
  78// ListType contains bitwise or'ed flags for list and list item objects.
  79type ListType int
  80
  81// These are the possible flag values for the ListItem renderer.
  82// Multiple flag values may be ORed together.
  83// These are mostly of interest if you are writing a new output format.
  84const (
  85	ListTypeOrdered ListType = 1 << iota
  86	ListTypeDefinition
  87	ListTypeTerm
  88
  89	ListItemContainsBlock
  90	ListItemBeginningOfList
  91	ListItemEndOfList
  92)
  93
  94// CellAlignFlags holds a type of alignment in a table cell.
  95type CellAlignFlags int
  96
  97// These are the possible flag values for the table cell renderer.
  98// Only a single one of these values will be used; they are not ORed together.
  99// These are mostly of interest if you are writing a new output format.
 100const (
 101	TableAlignmentLeft = 1 << iota
 102	TableAlignmentRight
 103	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
 104)
 105
 106// The size of a tab stop.
 107const (
 108	TabSizeDefault = 4
 109	TabSizeDouble  = 8
 110)
 111
 112// blockTags is a set of tags that are recognized as HTML block tags.
 113// Any of these can be included in markdown text without special escaping.
 114var blockTags = map[string]struct{}{
 115	"blockquote": struct{}{},
 116	"del":        struct{}{},
 117	"div":        struct{}{},
 118	"dl":         struct{}{},
 119	"fieldset":   struct{}{},
 120	"form":       struct{}{},
 121	"h1":         struct{}{},
 122	"h2":         struct{}{},
 123	"h3":         struct{}{},
 124	"h4":         struct{}{},
 125	"h5":         struct{}{},
 126	"h6":         struct{}{},
 127	"iframe":     struct{}{},
 128	"ins":        struct{}{},
 129	"math":       struct{}{},
 130	"noscript":   struct{}{},
 131	"ol":         struct{}{},
 132	"pre":        struct{}{},
 133	"p":          struct{}{},
 134	"script":     struct{}{},
 135	"style":      struct{}{},
 136	"table":      struct{}{},
 137	"ul":         struct{}{},
 138
 139	// HTML5
 140	"address":    struct{}{},
 141	"article":    struct{}{},
 142	"aside":      struct{}{},
 143	"canvas":     struct{}{},
 144	"figcaption": struct{}{},
 145	"figure":     struct{}{},
 146	"footer":     struct{}{},
 147	"header":     struct{}{},
 148	"hgroup":     struct{}{},
 149	"main":       struct{}{},
 150	"nav":        struct{}{},
 151	"output":     struct{}{},
 152	"progress":   struct{}{},
 153	"section":    struct{}{},
 154	"video":      struct{}{},
 155}
 156
 157// Renderer is the rendering interface.
 158// This is mostly of interest if you are implementing a new rendering format.
 159//
 160// When a byte slice is provided, it contains the (rendered) contents of the
 161// element.
 162//
 163// When a callback is provided instead, it will write the contents of the
 164// respective element directly to the output buffer and return true on success.
 165// If the callback returns false, the rendering function should reset the
 166// output buffer as though it had never been called.
 167//
 168// Currently HTML and Latex implementations are provided
 169type Renderer interface {
 170	Render(ast *Node) []byte
 171	RenderNode(w io.Writer, node *Node, entering bool) WalkStatus
 172}
 173
 174// Callback functions for inline parsing. One such function is defined
 175// for each character that triggers a response when parsing inline data.
 176type inlineParser func(p *parser, data []byte, offset int) int
 177
 178// Parser holds runtime state used by the parser.
 179// This is constructed by the Markdown function.
 180type parser struct {
 181	refOverride    ReferenceOverrideFunc
 182	refs           map[string]*reference
 183	inlineCallback [256]inlineParser
 184	flags          Extensions
 185	nesting        int
 186	maxNesting     int
 187	insideLink     bool
 188
 189	// Footnotes need to be ordered as well as available to quickly check for
 190	// presence. If a ref is also a footnote, it's stored both in refs and here
 191	// in notes. Slice is nil if footnotes not enabled.
 192	notes []*reference
 193
 194	doc                  *Node
 195	tip                  *Node // = doc
 196	oldTip               *Node
 197	lastMatchedContainer *Node // = doc
 198	allClosed            bool
 199	currBlock            *Node // a block node currently being parsed by inline parser
 200}
 201
 202func (p *parser) getRef(refid string) (ref *reference, found bool) {
 203	if p.refOverride != nil {
 204		r, overridden := p.refOverride(refid)
 205		if overridden {
 206			if r == nil {
 207				return nil, false
 208			}
 209			return &reference{
 210				link:     []byte(r.Link),
 211				title:    []byte(r.Title),
 212				noteID:   0,
 213				hasBlock: false,
 214				text:     []byte(r.Text)}, true
 215		}
 216	}
 217	// refs are case insensitive
 218	ref, found = p.refs[strings.ToLower(refid)]
 219	return ref, found
 220}
 221
 222func (p *parser) finalize(block *Node) {
 223	above := block.Parent
 224	block.open = false
 225	p.tip = above
 226}
 227
 228func (p *parser) addChild(node NodeType, offset uint32) *Node {
 229	for !p.tip.canContain(node) {
 230		p.finalize(p.tip)
 231	}
 232	newNode := NewNode(node)
 233	newNode.content = []byte{}
 234	p.tip.appendChild(newNode)
 235	p.tip = newNode
 236	return newNode
 237}
 238
 239func (p *parser) closeUnmatchedBlocks() {
 240	if !p.allClosed {
 241		for p.oldTip != p.lastMatchedContainer {
 242			parent := p.oldTip.Parent
 243			p.finalize(p.oldTip)
 244			p.oldTip = parent
 245		}
 246		p.allClosed = true
 247	}
 248}
 249
 250//
 251//
 252// Public interface
 253//
 254//
 255
 256// Reference represents the details of a link.
 257// See the documentation in Options for more details on use-case.
 258type Reference struct {
 259	// Link is usually the URL the reference points to.
 260	Link string
 261	// Title is the alternate text describing the link in more detail.
 262	Title string
 263	// Text is the optional text to override the ref with if the syntax used was
 264	// [refid][]
 265	Text string
 266}
 267
 268// ReferenceOverrideFunc is expected to be called with a reference string and
 269// return either a valid Reference type that the reference string maps to or
 270// nil. If overridden is false, the default reference logic will be executed.
 271// See the documentation in Options for more details on use-case.
 272type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
 273
 274// Options represents configurable overrides and callbacks (in addition to the
 275// extension flag set) for configuring a Markdown parse.
 276type Options struct {
 277	// Extensions is a flag set of bit-wise ORed extension bits. See the
 278	// Extensions flags defined in this package.
 279	Extensions Extensions
 280
 281	// ReferenceOverride is an optional function callback that is called every
 282	// time a reference is resolved.
 283	//
 284	// In Markdown, the link reference syntax can be made to resolve a link to
 285	// a reference instead of an inline URL, in one of the following ways:
 286	//
 287	//  * [link text][refid]
 288	//  * [refid][]
 289	//
 290	// Usually, the refid is defined at the bottom of the Markdown document. If
 291	// this override function is provided, the refid is passed to the override
 292	// function first, before consulting the defined refids at the bottom. If
 293	// the override function indicates an override did not occur, the refids at
 294	// the bottom will be used to fill in the link details.
 295	ReferenceOverride ReferenceOverrideFunc
 296}
 297
 298// MarkdownBasic is a convenience function for simple rendering.
 299// It processes markdown input with no extensions enabled.
 300func MarkdownBasic(input []byte) []byte {
 301	// set up the HTML renderer
 302	renderer := NewHTMLRenderer(HTMLRendererParameters{
 303		Flags:      UseXHTML,
 304		Extensions: CommonExtensions,
 305	})
 306
 307	// set up the parser
 308	return Markdown(input, renderer, Options{})
 309}
 310
 311// MarkdownCommon is a convenience function for simple rendering. It calls
 312// Markdown with most useful extensions enabled, including:
 313//
 314// * Smartypants processing with smart fractions and LaTeX dashes
 315//
 316// * Intra-word emphasis suppression
 317//
 318// * Tables
 319//
 320// * Fenced code blocks
 321//
 322// * Autolinking
 323//
 324// * Strikethrough support
 325//
 326// * Strict header parsing
 327//
 328// * Custom Header IDs
 329func MarkdownCommon(input []byte) []byte {
 330	// set up the HTML renderer
 331	renderer := NewHTMLRenderer(HTMLRendererParameters{
 332		Flags:      CommonHTMLFlags,
 333		Extensions: CommonExtensions,
 334	})
 335	return Markdown(input, renderer, DefaultOptions)
 336}
 337
 338// Markdown is the main rendering function.
 339// It parses and renders a block of markdown-encoded text.
 340// The supplied Renderer is used to format the output, and extensions dictates
 341// which non-standard extensions are enabled.
 342//
 343// To use the supplied HTML or LaTeX renderers, see NewHTMLRenderer and
 344// NewLatexRenderer, respectively.
 345func Markdown(input []byte, renderer Renderer, options Options) []byte {
 346	if renderer == nil {
 347		return nil
 348	}
 349	return renderer.Render(Parse(input, options))
 350}
 351
 352// Parse is an entry point to the parsing part of Blackfriday. It takes an
 353// input markdown document and produces a syntax tree for its contents. This
 354// tree can then be rendered with a default or custom renderer, or
 355// analyzed/transformed by the caller to whatever non-standard needs they have.
 356func Parse(input []byte, opts Options) *Node {
 357	extensions := opts.Extensions
 358
 359	// fill in the render structure
 360	p := new(parser)
 361	p.flags = extensions
 362	p.refOverride = opts.ReferenceOverride
 363	p.refs = make(map[string]*reference)
 364	p.maxNesting = 16
 365	p.insideLink = false
 366
 367	docNode := NewNode(Document)
 368	p.doc = docNode
 369	p.tip = docNode
 370	p.oldTip = docNode
 371	p.lastMatchedContainer = docNode
 372	p.allClosed = true
 373
 374	// register inline parsers
 375	p.inlineCallback['*'] = emphasis
 376	p.inlineCallback['_'] = emphasis
 377	if extensions&Strikethrough != 0 {
 378		p.inlineCallback['~'] = emphasis
 379	}
 380	p.inlineCallback['`'] = codeSpan
 381	p.inlineCallback['\n'] = lineBreak
 382	p.inlineCallback['['] = link
 383	p.inlineCallback['<'] = leftAngle
 384	p.inlineCallback['\\'] = escape
 385	p.inlineCallback['&'] = entity
 386	p.inlineCallback['!'] = maybeImage
 387	p.inlineCallback['^'] = maybeInlineFootnote
 388
 389	if extensions&Autolink != 0 {
 390		p.inlineCallback['h'] = maybeAutoLink
 391		p.inlineCallback['m'] = maybeAutoLink
 392		p.inlineCallback['f'] = maybeAutoLink
 393		p.inlineCallback['H'] = maybeAutoLink
 394		p.inlineCallback['M'] = maybeAutoLink
 395		p.inlineCallback['F'] = maybeAutoLink
 396	}
 397
 398	if extensions&Footnotes != 0 {
 399		p.notes = make([]*reference, 0)
 400	}
 401
 402	first := firstPass(p, input)
 403	secondPass(p, first)
 404	// Walk the tree and finish up some of unfinished blocks
 405	for p.tip != nil {
 406		p.finalize(p.tip)
 407	}
 408	// Walk the tree again and process inline markdown in each block
 409	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
 410		if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
 411			p.currBlock = node
 412			p.inline(node.content)
 413			node.content = nil
 414		}
 415		return GoToNext
 416	})
 417	p.parseRefsToAST()
 418	p.generateTOC()
 419	return p.doc
 420}
 421
 422func (p *parser) generateTOC() {
 423	if p.flags&TOC == 0 && p.flags&OmitContents == 0 {
 424		return
 425	}
 426	navNode := NewNode(HTMLBlock)
 427	navNode.Literal = []byte("<nav>")
 428	navNode.open = false
 429
 430	var topList *Node
 431	var listNode *Node
 432	var lastItem *Node
 433	headerCount := 0
 434	currentLevel := 0
 435	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
 436		if entering && node.Type == Header {
 437			if node.Level > currentLevel {
 438				currentLevel++
 439				newList := NewNode(List)
 440				if lastItem != nil {
 441					lastItem.appendChild(newList)
 442					listNode = newList
 443				} else {
 444					listNode = newList
 445					topList = listNode
 446				}
 447			}
 448			if node.Level < currentLevel {
 449				finalizeList(listNode)
 450				lastItem = listNode.Parent
 451				listNode = lastItem.Parent
 452			}
 453			node.HeaderID = fmt.Sprintf("toc_%d", headerCount)
 454			headerCount++
 455			lastItem = NewNode(Item)
 456			listNode.appendChild(lastItem)
 457			anchorNode := NewNode(Link)
 458			anchorNode.Destination = []byte("#" + node.HeaderID)
 459			lastItem.appendChild(anchorNode)
 460			anchorNode.appendChild(text(node.FirstChild.Literal))
 461		}
 462		return GoToNext
 463	})
 464	firstChild := p.doc.FirstChild
 465	// Insert TOC only if there is anything to insert
 466	if topList != nil {
 467		finalizeList(topList)
 468		firstChild.insertBefore(navNode)
 469		firstChild.insertBefore(topList)
 470		navCloseNode := NewNode(HTMLBlock)
 471		navCloseNode.Literal = []byte("</nav>")
 472		navCloseNode.open = false
 473		firstChild.insertBefore(navCloseNode)
 474	}
 475	// Drop everything after the TOC if OmitContents was requested
 476	if p.flags&OmitContents != 0 {
 477		for firstChild != nil {
 478			next := firstChild.Next
 479			firstChild.unlink()
 480			firstChild = next
 481		}
 482	}
 483}
 484
 485func (p *parser) parseRefsToAST() {
 486	if p.flags&Footnotes == 0 || len(p.notes) == 0 {
 487		return
 488	}
 489	p.tip = p.doc
 490	finalizeHTMLBlock(p.addBlock(HTMLBlock, []byte(`<div class="footnotes">`)))
 491	p.addBlock(HorizontalRule, nil)
 492	block := p.addBlock(List, nil)
 493	block.ListFlags = ListTypeOrdered
 494	flags := ListItemBeginningOfList
 495	// Note: this loop is intentionally explicit, not range-form. This is
 496	// because the body of the loop will append nested footnotes to p.notes and
 497	// we need to process those late additions. Range form would only walk over
 498	// the fixed initial set.
 499	for i := 0; i < len(p.notes); i++ {
 500		ref := p.notes[i]
 501		block := p.addBlock(Item, nil)
 502		block.ListFlags = ListTypeOrdered
 503		block.RefLink = ref.link
 504		if ref.hasBlock {
 505			flags |= ListItemContainsBlock
 506			p.block(ref.title)
 507		} else {
 508			p.currBlock = block
 509			p.inline(ref.title)
 510		}
 511		flags &^= ListItemBeginningOfList | ListItemContainsBlock
 512	}
 513	above := block.Parent
 514	finalizeList(block)
 515	p.tip = above
 516	finalizeHTMLBlock(p.addBlock(HTMLBlock, []byte("</div>")))
 517	block.Walk(func(node *Node, entering bool) WalkStatus {
 518		if node.Type == Paragraph || node.Type == Header {
 519			p.currBlock = node
 520			p.inline(node.content)
 521			node.content = nil
 522		}
 523		return GoToNext
 524	})
 525}
 526
 527// first pass:
 528// - normalize newlines
 529// - extract references (outside of fenced code blocks)
 530// - expand tabs (outside of fenced code blocks)
 531// - copy everything else
 532func firstPass(p *parser, input []byte) []byte {
 533	var out bytes.Buffer
 534	tabSize := TabSizeDefault
 535	if p.flags&TabSizeEight != 0 {
 536		tabSize = TabSizeDouble
 537	}
 538	beg := 0
 539	lastFencedCodeBlockEnd := 0
 540	for beg < len(input) {
 541		// Find end of this line, then process the line.
 542		end := beg
 543		for end < len(input) && input[end] != '\n' && input[end] != '\r' {
 544			end++
 545		}
 546
 547		if p.flags&FencedCode != 0 {
 548			// track fenced code block boundaries to suppress tab expansion
 549			// and reference extraction inside them:
 550			if beg >= lastFencedCodeBlockEnd {
 551				if i := p.fencedCodeBlock(input[beg:], false); i > 0 {
 552					lastFencedCodeBlockEnd = beg + i
 553				}
 554			}
 555		}
 556
 557		// add the line body if present
 558		if end > beg {
 559			if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
 560				out.Write(input[beg:end])
 561			} else if refEnd := isReference(p, input[beg:], tabSize); refEnd > 0 {
 562				beg += refEnd
 563				continue
 564			} else {
 565				expandTabs(&out, input[beg:end], tabSize)
 566			}
 567		}
 568
 569		if end < len(input) && input[end] == '\r' {
 570			end++
 571		}
 572		if end < len(input) && input[end] == '\n' {
 573			end++
 574		}
 575		out.WriteByte('\n')
 576
 577		beg = end
 578	}
 579
 580	// empty input?
 581	if out.Len() == 0 {
 582		out.WriteByte('\n')
 583	}
 584
 585	return out.Bytes()
 586}
 587
 588// second pass: actual rendering
 589func secondPass(p *parser, input []byte) {
 590	p.block(input)
 591
 592	if p.flags&Footnotes != 0 && len(p.notes) > 0 {
 593		flags := ListItemBeginningOfList
 594		for i := 0; i < len(p.notes); i++ {
 595			ref := p.notes[i]
 596			if ref.hasBlock {
 597				flags |= ListItemContainsBlock
 598				p.block(ref.title)
 599			} else {
 600				p.inline(ref.title)
 601			}
 602			flags &^= ListItemBeginningOfList | ListItemContainsBlock
 603		}
 604	}
 605
 606	if p.nesting != 0 {
 607		panic("Nesting level did not end at zero")
 608	}
 609}
 610
 611//
 612// Link references
 613//
 614// This section implements support for references that (usually) appear
 615// as footnotes in a document, and can be referenced anywhere in the document.
 616// The basic format is:
 617//
 618//    [1]: http://www.google.com/ "Google"
 619//    [2]: http://www.github.com/ "Github"
 620//
 621// Anywhere in the document, the reference can be linked by referring to its
 622// label, i.e., 1 and 2 in this example, as in:
 623//
 624//    This library is hosted on [Github][2], a git hosting site.
 625//
 626// Actual footnotes as specified in Pandoc and supported by some other Markdown
 627// libraries such as php-markdown are also taken care of. They look like this:
 628//
 629//    This sentence needs a bit of further explanation.[^note]
 630//
 631//    [^note]: This is the explanation.
 632//
 633// Footnotes should be placed at the end of the document in an ordered list.
 634// Inline footnotes such as:
 635//
 636//    Inline footnotes^[Not supported.] also exist.
 637//
 638// are not yet supported.
 639
 640// References are parsed and stored in this struct.
 641type reference struct {
 642	link     []byte
 643	title    []byte
 644	noteID   int // 0 if not a footnote ref
 645	hasBlock bool
 646	text     []byte
 647}
 648
 649func (r *reference) String() string {
 650	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
 651		r.link, r.title, r.text, r.noteID, r.hasBlock)
 652}
 653
 654// Check whether or not data starts with a reference link.
 655// If so, it is parsed and stored in the list of references
 656// (in the render struct).
 657// Returns the number of bytes to skip to move past it,
 658// or zero if the first line is not a reference.
 659func isReference(p *parser, data []byte, tabSize int) int {
 660	// up to 3 optional leading spaces
 661	if len(data) < 4 {
 662		return 0
 663	}
 664	i := 0
 665	for i < 3 && data[i] == ' ' {
 666		i++
 667	}
 668
 669	noteID := 0
 670
 671	// id part: anything but a newline between brackets
 672	if data[i] != '[' {
 673		return 0
 674	}
 675	i++
 676	if p.flags&Footnotes != 0 {
 677		if i < len(data) && data[i] == '^' {
 678			// we can set it to anything here because the proper noteIds will
 679			// be assigned later during the second pass. It just has to be != 0
 680			noteID = 1
 681			i++
 682		}
 683	}
 684	idOffset := i
 685	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
 686		i++
 687	}
 688	if i >= len(data) || data[i] != ']' {
 689		return 0
 690	}
 691	idEnd := i
 692
 693	// spacer: colon (space | tab)* newline? (space | tab)*
 694	i++
 695	if i >= len(data) || data[i] != ':' {
 696		return 0
 697	}
 698	i++
 699	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 700		i++
 701	}
 702	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
 703		i++
 704		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
 705			i++
 706		}
 707	}
 708	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 709		i++
 710	}
 711	if i >= len(data) {
 712		return 0
 713	}
 714
 715	var (
 716		linkOffset, linkEnd   int
 717		titleOffset, titleEnd int
 718		lineEnd               int
 719		raw                   []byte
 720		hasBlock              bool
 721	)
 722
 723	if p.flags&Footnotes != 0 && noteID != 0 {
 724		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
 725		lineEnd = linkEnd
 726	} else {
 727		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
 728	}
 729	if lineEnd == 0 {
 730		return 0
 731	}
 732
 733	// a valid ref has been found
 734
 735	ref := &reference{
 736		noteID:   noteID,
 737		hasBlock: hasBlock,
 738	}
 739
 740	if noteID > 0 {
 741		// reusing the link field for the id since footnotes don't have links
 742		ref.link = data[idOffset:idEnd]
 743		// if footnote, it's not really a title, it's the contained text
 744		ref.title = raw
 745	} else {
 746		ref.link = data[linkOffset:linkEnd]
 747		ref.title = data[titleOffset:titleEnd]
 748	}
 749
 750	// id matches are case-insensitive
 751	id := string(bytes.ToLower(data[idOffset:idEnd]))
 752
 753	p.refs[id] = ref
 754
 755	return lineEnd
 756}
 757
 758func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
 759	// link: whitespace-free sequence, optionally between angle brackets
 760	if data[i] == '<' {
 761		i++
 762	}
 763	linkOffset = i
 764	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
 765		i++
 766	}
 767	if i == len(data) {
 768		return
 769	}
 770	linkEnd = i
 771	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
 772		linkOffset++
 773		linkEnd--
 774	}
 775
 776	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
 777	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 778		i++
 779	}
 780	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
 781		return
 782	}
 783
 784	// compute end-of-line
 785	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
 786		lineEnd = i
 787	}
 788	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
 789		lineEnd++
 790	}
 791
 792	// optional (space|tab)* spacer after a newline
 793	if lineEnd > 0 {
 794		i = lineEnd + 1
 795		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 796			i++
 797		}
 798	}
 799
 800	// optional title: any non-newline sequence enclosed in '"() alone on its line
 801	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
 802		i++
 803		titleOffset = i
 804
 805		// look for EOL
 806		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
 807			i++
 808		}
 809		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
 810			titleEnd = i + 1
 811		} else {
 812			titleEnd = i
 813		}
 814
 815		// step back
 816		i--
 817		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
 818			i--
 819		}
 820		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
 821			lineEnd = titleEnd
 822			titleEnd = i
 823		}
 824	}
 825
 826	return
 827}
 828
 829// The first bit of this logic is the same as (*parser).listItem, but the rest
 830// is much simpler. This function simply finds the entire block and shifts it
 831// over by one tab if it is indeed a block (just returns the line if it's not).
 832// blockEnd is the end of the section in the input buffer, and contents is the
 833// extracted text that was shifted over one tab. It will need to be rendered at
 834// the end of the document.
 835func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
 836	if i == 0 || len(data) == 0 {
 837		return
 838	}
 839
 840	// skip leading whitespace on first line
 841	for i < len(data) && data[i] == ' ' {
 842		i++
 843	}
 844
 845	blockStart = i
 846
 847	// find the end of the line
 848	blockEnd = i
 849	for i < len(data) && data[i-1] != '\n' {
 850		i++
 851	}
 852
 853	// get working buffer
 854	var raw bytes.Buffer
 855
 856	// put the first line into the working buffer
 857	raw.Write(data[blockEnd:i])
 858	blockEnd = i
 859
 860	// process the following lines
 861	containsBlankLine := false
 862
 863gatherLines:
 864	for blockEnd < len(data) {
 865		i++
 866
 867		// find the end of this line
 868		for i < len(data) && data[i-1] != '\n' {
 869			i++
 870		}
 871
 872		// if it is an empty line, guess that it is part of this item
 873		// and move on to the next line
 874		if p.isEmpty(data[blockEnd:i]) > 0 {
 875			containsBlankLine = true
 876			blockEnd = i
 877			continue
 878		}
 879
 880		n := 0
 881		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
 882			// this is the end of the block.
 883			// we don't want to include this last line in the index.
 884			break gatherLines
 885		}
 886
 887		// if there were blank lines before this one, insert a new one now
 888		if containsBlankLine {
 889			raw.WriteByte('\n')
 890			containsBlankLine = false
 891		}
 892
 893		// get rid of that first tab, write to buffer
 894		raw.Write(data[blockEnd+n : i])
 895		hasBlock = true
 896
 897		blockEnd = i
 898	}
 899
 900	if data[blockEnd-1] != '\n' {
 901		raw.WriteByte('\n')
 902	}
 903
 904	contents = raw.Bytes()
 905
 906	return
 907}
 908
 909//
 910//
 911// Miscellaneous helper functions
 912//
 913//
 914
 915// Test if a character is a punctuation symbol.
 916// Taken from a private function in regexp in the stdlib.
 917func ispunct(c byte) bool {
 918	for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
 919		if c == r {
 920			return true
 921		}
 922	}
 923	return false
 924}
 925
 926// Test if a character is a whitespace character.
 927func isspace(c byte) bool {
 928	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
 929}
 930
 931// Test if a character is letter.
 932func isletter(c byte) bool {
 933	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
 934}
 935
 936// Test if a character is a letter or a digit.
 937// TODO: check when this is looking for ASCII alnum and when it should use unicode
 938func isalnum(c byte) bool {
 939	return (c >= '0' && c <= '9') || isletter(c)
 940}
 941
 942// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
 943// always ends output with a newline
 944func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
 945	// first, check for common cases: no tabs, or only tabs at beginning of line
 946	i, prefix := 0, 0
 947	slowcase := false
 948	for i = 0; i < len(line); i++ {
 949		if line[i] == '\t' {
 950			if prefix == i {
 951				prefix++
 952			} else {
 953				slowcase = true
 954				break
 955			}
 956		}
 957	}
 958
 959	// no need to decode runes if all tabs are at the beginning of the line
 960	if !slowcase {
 961		for i = 0; i < prefix*tabSize; i++ {
 962			out.WriteByte(' ')
 963		}
 964		out.Write(line[prefix:])
 965		return
 966	}
 967
 968	// the slow case: we need to count runes to figure out how
 969	// many spaces to insert for each tab
 970	column := 0
 971	i = 0
 972	for i < len(line) {
 973		start := i
 974		for i < len(line) && line[i] != '\t' {
 975			_, size := utf8.DecodeRune(line[i:])
 976			i += size
 977			column++
 978		}
 979
 980		if i > start {
 981			out.Write(line[start:i])
 982		}
 983
 984		if i >= len(line) {
 985			break
 986		}
 987
 988		for {
 989			out.WriteByte(' ')
 990			column++
 991			if column%tabSize == 0 {
 992				break
 993			}
 994		}
 995
 996		i++
 997	}
 998}
 999
1000// Find if a line counts as indented or not.
1001// Returns number of characters the indent is (0 = not indented).
1002func isIndented(data []byte, indentSize int) int {
1003	if len(data) == 0 {
1004		return 0
1005	}
1006	if data[0] == '\t' {
1007		return 1
1008	}
1009	if len(data) < indentSize {
1010		return 0
1011	}
1012	for i := 0; i < indentSize; i++ {
1013		if data[i] != ' ' {
1014			return 0
1015		}
1016	}
1017	return indentSize
1018}
1019
1020// Create a url-safe slug for fragments
1021func slugify(in []byte) []byte {
1022	if len(in) == 0 {
1023		return in
1024	}
1025	out := make([]byte, 0, len(in))
1026	sym := false
1027
1028	for _, ch := range in {
1029		if isalnum(ch) {
1030			sym = false
1031			out = append(out, ch)
1032		} else if sym {
1033			continue
1034		} else {
1035			out = append(out, '-')
1036			sym = true
1037		}
1038	}
1039	var a, b int
1040	var ch byte
1041	for a, ch = range out {
1042		if ch != '-' {
1043			break
1044		}
1045	}
1046	for b = len(out) - 1; b > 0; b-- {
1047		if out[b] != '-' {
1048			break
1049		}
1050	}
1051	return out[a : b+1]
1052}