all repos — grayfriday @ 225250ddf1dbe4a6e8438eeef941dd76fcaa1eab

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Distributed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11//
 12// Markdown parsing and processing
 13//
 14//
 15
 16// Package blackfriday is a markdown processor.
 17//
 18// Translates plain text with simple formatting rules into HTML or LaTeX.
 19package blackfriday
 20
 21import (
 22	"bytes"
 23	"fmt"
 24	"io"
 25	"strings"
 26	"unicode/utf8"
 27)
 28
 29// Version string of the package.
 30const Version = "2.0"
 31
 32// Extensions is a bitwise or'ed collection of enabled Blackfriday's
 33// extensions.
 34type Extensions int
 35
 36// These are the supported markdown parsing extensions.
 37// OR these values together to select multiple extensions.
 38const (
 39	NoExtensions           Extensions = 0
 40	NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
 41	Tables                                        // Render tables
 42	FencedCode                                    // Render fenced code blocks
 43	Autolink                                      // Detect embedded URLs that are not explicitly marked
 44	Strikethrough                                 // Strikethrough text using ~~test~~
 45	LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
 46	SpaceHeaders                                  // Be strict about prefix header rules
 47	HardLineBreak                                 // Translate newlines into line breaks
 48	TabSizeEight                                  // Expand tabs to eight spaces instead of four
 49	Footnotes                                     // Pandoc-style footnotes
 50	NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
 51	HeaderIDs                                     // specify header IDs  with {#id}
 52	Titleblock                                    // Titleblock ala pandoc
 53	AutoHeaderIDs                                 // Create the header ID from the text
 54	BackslashLineBreak                            // Translate trailing backslashes into line breaks
 55	DefinitionLists                               // Render definition lists
 56	TOC                                           // Generate a table of contents
 57	OmitContents                                  // Skip the main contents (for a standalone table of contents)
 58
 59	CommonHTMLFlags HTMLFlags = UseXHTML | Smartypants |
 60		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
 61
 62	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
 63		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
 64		BackslashLineBreak | DefinitionLists
 65)
 66
 67// DefaultOptions is a convenience variable with all the options that are
 68// enabled by default.
 69var DefaultOptions = Options{
 70	Extensions: CommonExtensions,
 71}
 72
 73// ListType contains bitwise or'ed flags for list and list item objects.
 74type ListType int
 75
 76// These are the possible flag values for the ListItem renderer.
 77// Multiple flag values may be ORed together.
 78// These are mostly of interest if you are writing a new output format.
 79const (
 80	ListTypeOrdered ListType = 1 << iota
 81	ListTypeDefinition
 82	ListTypeTerm
 83
 84	ListItemContainsBlock
 85	ListItemBeginningOfList
 86	ListItemEndOfList
 87)
 88
 89// CellAlignFlags holds a type of alignment in a table cell.
 90type CellAlignFlags int
 91
 92// These are the possible flag values for the table cell renderer.
 93// Only a single one of these values will be used; they are not ORed together.
 94// These are mostly of interest if you are writing a new output format.
 95const (
 96	TableAlignmentLeft = 1 << iota
 97	TableAlignmentRight
 98	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
 99)
100
101// The size of a tab stop.
102const (
103	TabSizeDefault = 4
104	TabSizeDouble  = 8
105)
106
107// blockTags is a set of tags that are recognized as HTML block tags.
108// Any of these can be included in markdown text without special escaping.
109var blockTags = map[string]struct{}{
110	"blockquote": struct{}{},
111	"del":        struct{}{},
112	"div":        struct{}{},
113	"dl":         struct{}{},
114	"fieldset":   struct{}{},
115	"form":       struct{}{},
116	"h1":         struct{}{},
117	"h2":         struct{}{},
118	"h3":         struct{}{},
119	"h4":         struct{}{},
120	"h5":         struct{}{},
121	"h6":         struct{}{},
122	"iframe":     struct{}{},
123	"ins":        struct{}{},
124	"math":       struct{}{},
125	"noscript":   struct{}{},
126	"ol":         struct{}{},
127	"pre":        struct{}{},
128	"p":          struct{}{},
129	"script":     struct{}{},
130	"style":      struct{}{},
131	"table":      struct{}{},
132	"ul":         struct{}{},
133
134	// HTML5
135	"address":    struct{}{},
136	"article":    struct{}{},
137	"aside":      struct{}{},
138	"canvas":     struct{}{},
139	"figcaption": struct{}{},
140	"figure":     struct{}{},
141	"footer":     struct{}{},
142	"header":     struct{}{},
143	"hgroup":     struct{}{},
144	"main":       struct{}{},
145	"nav":        struct{}{},
146	"output":     struct{}{},
147	"progress":   struct{}{},
148	"section":    struct{}{},
149	"video":      struct{}{},
150}
151
152// Renderer is the rendering interface.
153// This is mostly of interest if you are implementing a new rendering format.
154//
155// When a byte slice is provided, it contains the (rendered) contents of the
156// element.
157//
158// When a callback is provided instead, it will write the contents of the
159// respective element directly to the output buffer and return true on success.
160// If the callback returns false, the rendering function should reset the
161// output buffer as though it had never been called.
162//
163// Currently HTML and Latex implementations are provided
164type Renderer interface {
165	Render(ast *Node) []byte
166	RenderNode(w io.Writer, node *Node, entering bool) WalkStatus
167}
168
169// Callback functions for inline parsing. One such function is defined
170// for each character that triggers a response when parsing inline data.
171type inlineParser func(p *parser, data []byte, offset int) int
172
173// Parser holds runtime state used by the parser.
174// This is constructed by the Markdown function.
175type parser struct {
176	refOverride    ReferenceOverrideFunc
177	refs           map[string]*reference
178	inlineCallback [256]inlineParser
179	flags          Extensions
180	nesting        int
181	maxNesting     int
182	insideLink     bool
183
184	// Footnotes need to be ordered as well as available to quickly check for
185	// presence. If a ref is also a footnote, it's stored both in refs and here
186	// in notes. Slice is nil if footnotes not enabled.
187	notes []*reference
188
189	doc                  *Node
190	tip                  *Node // = doc
191	oldTip               *Node
192	lastMatchedContainer *Node // = doc
193	allClosed            bool
194	currBlock            *Node // a block node currently being parsed by inline parser
195}
196
197func (p *parser) getRef(refid string) (ref *reference, found bool) {
198	if p.refOverride != nil {
199		r, overridden := p.refOverride(refid)
200		if overridden {
201			if r == nil {
202				return nil, false
203			}
204			return &reference{
205				link:     []byte(r.Link),
206				title:    []byte(r.Title),
207				noteID:   0,
208				hasBlock: false,
209				text:     []byte(r.Text)}, true
210		}
211	}
212	// refs are case insensitive
213	ref, found = p.refs[strings.ToLower(refid)]
214	return ref, found
215}
216
217func (p *parser) finalize(block *Node) {
218	above := block.Parent
219	block.open = false
220	p.tip = above
221}
222
223func (p *parser) addChild(node NodeType, offset uint32) *Node {
224	for !p.tip.canContain(node) {
225		p.finalize(p.tip)
226	}
227	newNode := NewNode(node)
228	newNode.content = []byte{}
229	p.tip.AppendChild(newNode)
230	p.tip = newNode
231	return newNode
232}
233
234func (p *parser) closeUnmatchedBlocks() {
235	if !p.allClosed {
236		for p.oldTip != p.lastMatchedContainer {
237			parent := p.oldTip.Parent
238			p.finalize(p.oldTip)
239			p.oldTip = parent
240		}
241		p.allClosed = true
242	}
243}
244
245//
246//
247// Public interface
248//
249//
250
251// Reference represents the details of a link.
252// See the documentation in Options for more details on use-case.
253type Reference struct {
254	// Link is usually the URL the reference points to.
255	Link string
256	// Title is the alternate text describing the link in more detail.
257	Title string
258	// Text is the optional text to override the ref with if the syntax used was
259	// [refid][]
260	Text string
261}
262
263// ReferenceOverrideFunc is expected to be called with a reference string and
264// return either a valid Reference type that the reference string maps to or
265// nil. If overridden is false, the default reference logic will be executed.
266// See the documentation in Options for more details on use-case.
267type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
268
269// Options represents configurable overrides and callbacks (in addition to the
270// extension flag set) for configuring a Markdown parse.
271type Options struct {
272	// Extensions is a flag set of bit-wise ORed extension bits. See the
273	// Extensions flags defined in this package.
274	Extensions Extensions
275
276	// ReferenceOverride is an optional function callback that is called every
277	// time a reference is resolved.
278	//
279	// In Markdown, the link reference syntax can be made to resolve a link to
280	// a reference instead of an inline URL, in one of the following ways:
281	//
282	//  * [link text][refid]
283	//  * [refid][]
284	//
285	// Usually, the refid is defined at the bottom of the Markdown document. If
286	// this override function is provided, the refid is passed to the override
287	// function first, before consulting the defined refids at the bottom. If
288	// the override function indicates an override did not occur, the refids at
289	// the bottom will be used to fill in the link details.
290	ReferenceOverride ReferenceOverrideFunc
291}
292
293// MarkdownBasic is a convenience function for simple rendering.
294// It processes markdown input with no extensions enabled.
295func MarkdownBasic(input []byte) []byte {
296	// set up the HTML renderer
297	renderer := NewHTMLRenderer(HTMLRendererParameters{
298		Flags:      UseXHTML,
299		Extensions: CommonExtensions,
300	})
301
302	// set up the parser
303	return Markdown(input, renderer, Options{})
304}
305
306// MarkdownCommon is a convenience function for simple rendering. It calls
307// Markdown with most useful extensions enabled, including:
308//
309// * Smartypants processing with smart fractions and LaTeX dashes
310//
311// * Intra-word emphasis suppression
312//
313// * Tables
314//
315// * Fenced code blocks
316//
317// * Autolinking
318//
319// * Strikethrough support
320//
321// * Strict header parsing
322//
323// * Custom Header IDs
324func MarkdownCommon(input []byte) []byte {
325	// set up the HTML renderer
326	renderer := NewHTMLRenderer(HTMLRendererParameters{
327		Flags:      CommonHTMLFlags,
328		Extensions: CommonExtensions,
329	})
330	return Markdown(input, renderer, DefaultOptions)
331}
332
333// Markdown is the main rendering function.
334// It parses and renders a block of markdown-encoded text.
335// The supplied Renderer is used to format the output, and extensions dictates
336// which non-standard extensions are enabled.
337//
338// To use the supplied HTML or LaTeX renderers, see NewHTMLRenderer and
339// NewLatexRenderer, respectively.
340func Markdown(input []byte, renderer Renderer, options Options) []byte {
341	if renderer == nil {
342		return nil
343	}
344	return renderer.Render(Parse(input, options))
345}
346
347// Parse is an entry point to the parsing part of Blackfriday. It takes an
348// input markdown document and produces a syntax tree for its contents. This
349// tree can then be rendered with a default or custom renderer, or
350// analyzed/transformed by the caller to whatever non-standard needs they have.
351func Parse(input []byte, opts Options) *Node {
352	extensions := opts.Extensions
353
354	// fill in the render structure
355	p := new(parser)
356	p.flags = extensions
357	p.refOverride = opts.ReferenceOverride
358	p.refs = make(map[string]*reference)
359	p.maxNesting = 16
360	p.insideLink = false
361
362	docNode := NewNode(Document)
363	p.doc = docNode
364	p.tip = docNode
365	p.oldTip = docNode
366	p.lastMatchedContainer = docNode
367	p.allClosed = true
368
369	// register inline parsers
370	p.inlineCallback['*'] = emphasis
371	p.inlineCallback['_'] = emphasis
372	if extensions&Strikethrough != 0 {
373		p.inlineCallback['~'] = emphasis
374	}
375	p.inlineCallback['`'] = codeSpan
376	p.inlineCallback['\n'] = lineBreak
377	p.inlineCallback['['] = link
378	p.inlineCallback['<'] = leftAngle
379	p.inlineCallback['\\'] = escape
380	p.inlineCallback['&'] = entity
381	p.inlineCallback['!'] = maybeImage
382	p.inlineCallback['^'] = maybeInlineFootnote
383
384	if extensions&Autolink != 0 {
385		p.inlineCallback['h'] = maybeAutoLink
386		p.inlineCallback['m'] = maybeAutoLink
387		p.inlineCallback['f'] = maybeAutoLink
388		p.inlineCallback['H'] = maybeAutoLink
389		p.inlineCallback['M'] = maybeAutoLink
390		p.inlineCallback['F'] = maybeAutoLink
391	}
392
393	if extensions&Footnotes != 0 {
394		p.notes = make([]*reference, 0)
395	}
396
397	first := firstPass(p, input)
398	secondPass(p, first)
399	// Walk the tree and finish up some of unfinished blocks
400	for p.tip != nil {
401		p.finalize(p.tip)
402	}
403	// Walk the tree again and process inline markdown in each block
404	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
405		if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
406			p.currBlock = node
407			p.inline(node.content)
408			node.content = nil
409		}
410		return GoToNext
411	})
412	p.parseRefsToAST()
413	return p.doc
414}
415
416func (p *parser) parseRefsToAST() {
417	if p.flags&Footnotes == 0 || len(p.notes) == 0 {
418		return
419	}
420	p.tip = p.doc
421	finalizeHTMLBlock(p.addBlock(HTMLBlock, []byte(`<div class="footnotes">`)))
422	p.addBlock(HorizontalRule, nil)
423	block := p.addBlock(List, nil)
424	block.ListFlags = ListTypeOrdered
425	flags := ListItemBeginningOfList
426	// Note: this loop is intentionally explicit, not range-form. This is
427	// because the body of the loop will append nested footnotes to p.notes and
428	// we need to process those late additions. Range form would only walk over
429	// the fixed initial set.
430	for i := 0; i < len(p.notes); i++ {
431		ref := p.notes[i]
432		block := p.addBlock(Item, nil)
433		block.ListFlags = ListTypeOrdered
434		block.RefLink = ref.link
435		if ref.hasBlock {
436			flags |= ListItemContainsBlock
437			p.block(ref.title)
438		} else {
439			p.currBlock = block
440			p.inline(ref.title)
441		}
442		flags &^= ListItemBeginningOfList | ListItemContainsBlock
443	}
444	above := block.Parent
445	finalizeList(block)
446	p.tip = above
447	finalizeHTMLBlock(p.addBlock(HTMLBlock, []byte("</div>")))
448	block.Walk(func(node *Node, entering bool) WalkStatus {
449		if node.Type == Paragraph || node.Type == Header {
450			p.currBlock = node
451			p.inline(node.content)
452			node.content = nil
453		}
454		return GoToNext
455	})
456}
457
458// first pass:
459// - normalize newlines
460// - extract references (outside of fenced code blocks)
461// - expand tabs (outside of fenced code blocks)
462// - copy everything else
463func firstPass(p *parser, input []byte) []byte {
464	var out bytes.Buffer
465	tabSize := TabSizeDefault
466	if p.flags&TabSizeEight != 0 {
467		tabSize = TabSizeDouble
468	}
469	beg := 0
470	lastFencedCodeBlockEnd := 0
471	for beg < len(input) {
472		// Find end of this line, then process the line.
473		end := beg
474		for end < len(input) && input[end] != '\n' && input[end] != '\r' {
475			end++
476		}
477
478		if p.flags&FencedCode != 0 {
479			// track fenced code block boundaries to suppress tab expansion
480			// and reference extraction inside them:
481			if beg >= lastFencedCodeBlockEnd {
482				if i := p.fencedCodeBlock(input[beg:], false); i > 0 {
483					lastFencedCodeBlockEnd = beg + i
484				}
485			}
486		}
487
488		// add the line body if present
489		if end > beg {
490			if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
491				out.Write(input[beg:end])
492			} else if refEnd := isReference(p, input[beg:], tabSize); refEnd > 0 {
493				beg += refEnd
494				continue
495			} else {
496				expandTabs(&out, input[beg:end], tabSize)
497			}
498		}
499
500		if end < len(input) && input[end] == '\r' {
501			end++
502		}
503		if end < len(input) && input[end] == '\n' {
504			end++
505		}
506		out.WriteByte('\n')
507
508		beg = end
509	}
510
511	// empty input?
512	if out.Len() == 0 {
513		out.WriteByte('\n')
514	}
515
516	return out.Bytes()
517}
518
519// second pass: actual rendering
520func secondPass(p *parser, input []byte) {
521	p.block(input)
522
523	if p.flags&Footnotes != 0 && len(p.notes) > 0 {
524		flags := ListItemBeginningOfList
525		for i := 0; i < len(p.notes); i++ {
526			ref := p.notes[i]
527			if ref.hasBlock {
528				flags |= ListItemContainsBlock
529				p.block(ref.title)
530			} else {
531				p.inline(ref.title)
532			}
533			flags &^= ListItemBeginningOfList | ListItemContainsBlock
534		}
535	}
536
537	if p.nesting != 0 {
538		panic("Nesting level did not end at zero")
539	}
540}
541
542//
543// Link references
544//
545// This section implements support for references that (usually) appear
546// as footnotes in a document, and can be referenced anywhere in the document.
547// The basic format is:
548//
549//    [1]: http://www.google.com/ "Google"
550//    [2]: http://www.github.com/ "Github"
551//
552// Anywhere in the document, the reference can be linked by referring to its
553// label, i.e., 1 and 2 in this example, as in:
554//
555//    This library is hosted on [Github][2], a git hosting site.
556//
557// Actual footnotes as specified in Pandoc and supported by some other Markdown
558// libraries such as php-markdown are also taken care of. They look like this:
559//
560//    This sentence needs a bit of further explanation.[^note]
561//
562//    [^note]: This is the explanation.
563//
564// Footnotes should be placed at the end of the document in an ordered list.
565// Inline footnotes such as:
566//
567//    Inline footnotes^[Not supported.] also exist.
568//
569// are not yet supported.
570
571// References are parsed and stored in this struct.
572type reference struct {
573	link     []byte
574	title    []byte
575	noteID   int // 0 if not a footnote ref
576	hasBlock bool
577	text     []byte
578}
579
580func (r *reference) String() string {
581	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
582		r.link, r.title, r.text, r.noteID, r.hasBlock)
583}
584
585// Check whether or not data starts with a reference link.
586// If so, it is parsed and stored in the list of references
587// (in the render struct).
588// Returns the number of bytes to skip to move past it,
589// or zero if the first line is not a reference.
590func isReference(p *parser, data []byte, tabSize int) int {
591	// up to 3 optional leading spaces
592	if len(data) < 4 {
593		return 0
594	}
595	i := 0
596	for i < 3 && data[i] == ' ' {
597		i++
598	}
599
600	noteID := 0
601
602	// id part: anything but a newline between brackets
603	if data[i] != '[' {
604		return 0
605	}
606	i++
607	if p.flags&Footnotes != 0 {
608		if i < len(data) && data[i] == '^' {
609			// we can set it to anything here because the proper noteIds will
610			// be assigned later during the second pass. It just has to be != 0
611			noteID = 1
612			i++
613		}
614	}
615	idOffset := i
616	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
617		i++
618	}
619	if i >= len(data) || data[i] != ']' {
620		return 0
621	}
622	idEnd := i
623
624	// spacer: colon (space | tab)* newline? (space | tab)*
625	i++
626	if i >= len(data) || data[i] != ':' {
627		return 0
628	}
629	i++
630	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
631		i++
632	}
633	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
634		i++
635		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
636			i++
637		}
638	}
639	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
640		i++
641	}
642	if i >= len(data) {
643		return 0
644	}
645
646	var (
647		linkOffset, linkEnd   int
648		titleOffset, titleEnd int
649		lineEnd               int
650		raw                   []byte
651		hasBlock              bool
652	)
653
654	if p.flags&Footnotes != 0 && noteID != 0 {
655		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
656		lineEnd = linkEnd
657	} else {
658		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
659	}
660	if lineEnd == 0 {
661		return 0
662	}
663
664	// a valid ref has been found
665
666	ref := &reference{
667		noteID:   noteID,
668		hasBlock: hasBlock,
669	}
670
671	if noteID > 0 {
672		// reusing the link field for the id since footnotes don't have links
673		ref.link = data[idOffset:idEnd]
674		// if footnote, it's not really a title, it's the contained text
675		ref.title = raw
676	} else {
677		ref.link = data[linkOffset:linkEnd]
678		ref.title = data[titleOffset:titleEnd]
679	}
680
681	// id matches are case-insensitive
682	id := string(bytes.ToLower(data[idOffset:idEnd]))
683
684	p.refs[id] = ref
685
686	return lineEnd
687}
688
689func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
690	// link: whitespace-free sequence, optionally between angle brackets
691	if data[i] == '<' {
692		i++
693	}
694	linkOffset = i
695	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
696		i++
697	}
698	if i == len(data) {
699		return
700	}
701	linkEnd = i
702	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
703		linkOffset++
704		linkEnd--
705	}
706
707	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
708	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
709		i++
710	}
711	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
712		return
713	}
714
715	// compute end-of-line
716	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
717		lineEnd = i
718	}
719	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
720		lineEnd++
721	}
722
723	// optional (space|tab)* spacer after a newline
724	if lineEnd > 0 {
725		i = lineEnd + 1
726		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
727			i++
728		}
729	}
730
731	// optional title: any non-newline sequence enclosed in '"() alone on its line
732	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
733		i++
734		titleOffset = i
735
736		// look for EOL
737		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
738			i++
739		}
740		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
741			titleEnd = i + 1
742		} else {
743			titleEnd = i
744		}
745
746		// step back
747		i--
748		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
749			i--
750		}
751		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
752			lineEnd = titleEnd
753			titleEnd = i
754		}
755	}
756
757	return
758}
759
760// The first bit of this logic is the same as (*parser).listItem, but the rest
761// is much simpler. This function simply finds the entire block and shifts it
762// over by one tab if it is indeed a block (just returns the line if it's not).
763// blockEnd is the end of the section in the input buffer, and contents is the
764// extracted text that was shifted over one tab. It will need to be rendered at
765// the end of the document.
766func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
767	if i == 0 || len(data) == 0 {
768		return
769	}
770
771	// skip leading whitespace on first line
772	for i < len(data) && data[i] == ' ' {
773		i++
774	}
775
776	blockStart = i
777
778	// find the end of the line
779	blockEnd = i
780	for i < len(data) && data[i-1] != '\n' {
781		i++
782	}
783
784	// get working buffer
785	var raw bytes.Buffer
786
787	// put the first line into the working buffer
788	raw.Write(data[blockEnd:i])
789	blockEnd = i
790
791	// process the following lines
792	containsBlankLine := false
793
794gatherLines:
795	for blockEnd < len(data) {
796		i++
797
798		// find the end of this line
799		for i < len(data) && data[i-1] != '\n' {
800			i++
801		}
802
803		// if it is an empty line, guess that it is part of this item
804		// and move on to the next line
805		if p.isEmpty(data[blockEnd:i]) > 0 {
806			containsBlankLine = true
807			blockEnd = i
808			continue
809		}
810
811		n := 0
812		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
813			// this is the end of the block.
814			// we don't want to include this last line in the index.
815			break gatherLines
816		}
817
818		// if there were blank lines before this one, insert a new one now
819		if containsBlankLine {
820			raw.WriteByte('\n')
821			containsBlankLine = false
822		}
823
824		// get rid of that first tab, write to buffer
825		raw.Write(data[blockEnd+n : i])
826		hasBlock = true
827
828		blockEnd = i
829	}
830
831	if data[blockEnd-1] != '\n' {
832		raw.WriteByte('\n')
833	}
834
835	contents = raw.Bytes()
836
837	return
838}
839
840//
841//
842// Miscellaneous helper functions
843//
844//
845
846// Test if a character is a punctuation symbol.
847// Taken from a private function in regexp in the stdlib.
848func ispunct(c byte) bool {
849	for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
850		if c == r {
851			return true
852		}
853	}
854	return false
855}
856
857// Test if a character is a whitespace character.
858func isspace(c byte) bool {
859	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
860}
861
862// Test if a character is letter.
863func isletter(c byte) bool {
864	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
865}
866
867// Test if a character is a letter or a digit.
868// TODO: check when this is looking for ASCII alnum and when it should use unicode
869func isalnum(c byte) bool {
870	return (c >= '0' && c <= '9') || isletter(c)
871}
872
873// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
874// always ends output with a newline
875func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
876	// first, check for common cases: no tabs, or only tabs at beginning of line
877	i, prefix := 0, 0
878	slowcase := false
879	for i = 0; i < len(line); i++ {
880		if line[i] == '\t' {
881			if prefix == i {
882				prefix++
883			} else {
884				slowcase = true
885				break
886			}
887		}
888	}
889
890	// no need to decode runes if all tabs are at the beginning of the line
891	if !slowcase {
892		for i = 0; i < prefix*tabSize; i++ {
893			out.WriteByte(' ')
894		}
895		out.Write(line[prefix:])
896		return
897	}
898
899	// the slow case: we need to count runes to figure out how
900	// many spaces to insert for each tab
901	column := 0
902	i = 0
903	for i < len(line) {
904		start := i
905		for i < len(line) && line[i] != '\t' {
906			_, size := utf8.DecodeRune(line[i:])
907			i += size
908			column++
909		}
910
911		if i > start {
912			out.Write(line[start:i])
913		}
914
915		if i >= len(line) {
916			break
917		}
918
919		for {
920			out.WriteByte(' ')
921			column++
922			if column%tabSize == 0 {
923				break
924			}
925		}
926
927		i++
928	}
929}
930
931// Find if a line counts as indented or not.
932// Returns number of characters the indent is (0 = not indented).
933func isIndented(data []byte, indentSize int) int {
934	if len(data) == 0 {
935		return 0
936	}
937	if data[0] == '\t' {
938		return 1
939	}
940	if len(data) < indentSize {
941		return 0
942	}
943	for i := 0; i < indentSize; i++ {
944		if data[i] != ' ' {
945			return 0
946		}
947	}
948	return indentSize
949}
950
951// Create a url-safe slug for fragments
952func slugify(in []byte) []byte {
953	if len(in) == 0 {
954		return in
955	}
956	out := make([]byte, 0, len(in))
957	sym := false
958
959	for _, ch := range in {
960		if isalnum(ch) {
961			sym = false
962			out = append(out, ch)
963		} else if sym {
964			continue
965		} else {
966			out = append(out, '-')
967			sym = true
968		}
969	}
970	var a, b int
971	var ch byte
972	for a, ch = range out {
973		if ch != '-' {
974			break
975		}
976	}
977	for b = len(out) - 1; b > 0; b-- {
978		if out[b] != '-' {
979			break
980		}
981	}
982	return out[a : b+1]
983}