all repos — grayfriday @ 14a0c487b886d6ec301a6ae53e15e489e91cde8d

blackfriday fork with a few changes

markdown.go (view raw)

  1// Blackfriday Markdown Processor
  2// Available at http://github.com/russross/blackfriday
  3//
  4// Copyright © 2011 Russ Ross <russ@russross.com>.
  5// Distributed under the Simplified BSD License.
  6// See README.md for details.
  7
  8package blackfriday
  9
 10import (
 11	"bytes"
 12	"fmt"
 13	"io"
 14	"strings"
 15	"unicode/utf8"
 16)
 17
 18//
 19// Markdown parsing and processing
 20//
 21
// Version is the version string of the package.
const Version = "2.0"
 24
// Extensions is a bitwise OR'ed collection of enabled Blackfriday
// extensions. The individual bits are the extension constants declared
// in this file.
type Extensions int
 28
// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// NoExtensions occupies the first slot of the const block (iota == 0), so
// the first real flag, NoIntraEmphasis, is 1<<1 and every following flag is
// the next higher bit.
const (
	NoExtensions           Extensions = 0
	NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
	Tables                                        // Render tables
	FencedCode                                    // Render fenced code blocks
	Autolink                                      // Detect embedded URLs that are not explicitly marked
	Strikethrough                                 // Strikethrough text using ~~test~~
	LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
	SpaceHeaders                                  // Be strict about prefix header rules
	HardLineBreak                                 // Translate newlines into line breaks
	TabSizeEight                                  // Expand tabs to eight spaces instead of four
	Footnotes                                     // Pandoc-style footnotes
	NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
	HeaderIDs                                     // Specify header IDs with {#id}
	Titleblock                                    // Titleblock ala pandoc
	AutoHeaderIDs                                 // Create the header ID from the text
	BackslashLineBreak                            // Translate trailing backslashes into line breaks
	DefinitionLists                               // Render definition lists
	TOC                                           // Generate a table of contents
	OmitContents                                  // Skip the main contents (for a standalone table of contents)

	// CommonHTMLFlags is the set of HTML renderer flags that MarkdownCommon
	// passes to NewHTMLRenderer.
	CommonHTMLFlags HTMLFlags = UseXHTML | Smartypants |
		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes

	// CommonExtensions is the extension set used by DefaultOptions and by
	// the MarkdownBasic/MarkdownCommon convenience functions.
	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
		BackslashLineBreak | DefinitionLists
)
 59
// DefaultOptions is a convenience variable with all the options that are
// enabled by default: the CommonExtensions set and no reference override.
var DefaultOptions = Options{
	Extensions: CommonExtensions,
}
 65
// ListType contains bitwise OR'ed flags for list and list item objects.
type ListType int
 68
// These are the possible flag values for the ListItem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
//
// The blank line inside the block does not reset iota, so all six flags
// occupy consecutive bits starting at 1.
const (
	ListTypeOrdered ListType = 1 << iota
	ListTypeDefinition
	ListTypeTerm

	ListItemContainsBlock
	ListItemBeginningOfList // TODO: figure out if this is of any use now
	ListItemEndOfList
)
 81
// CellAlignFlags holds the alignment of a table cell.
type CellAlignFlags int
 84
// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// NOTE(review): the constants are declared without an explicit type, so
// they are untyped integer constants rather than CellAlignFlags values.
const (
	TableAlignmentLeft = 1 << iota
	TableAlignmentRight
	// TableAlignmentCenter is the combination of the left and right bits.
	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
)
 93
// The size of a tab stop, in columns. preprocess uses TabSizeDouble when the
// TabSizeEight extension is enabled and TabSizeDefault otherwise.
const (
	TabSizeDefault = 4
	TabSizeDouble  = 8
)
 99
// blockTags is the set of tag names that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
// The map is used purely as a set; the values carry no information.
var blockTags = map[string]struct{}{
	"blockquote": {},
	"del":        {},
	"div":        {},
	"dl":         {},
	"fieldset":   {},
	"form":       {},
	"h1":         {},
	"h2":         {},
	"h3":         {},
	"h4":         {},
	"h5":         {},
	"h6":         {},
	"iframe":     {},
	"ins":        {},
	"math":       {},
	"noscript":   {},
	"ol":         {},
	"pre":        {},
	"p":          {},
	"script":     {},
	"style":      {},
	"table":      {},
	"ul":         {},

	// HTML5
	"address":    {},
	"article":    {},
	"aside":      {},
	"canvas":     {},
	"figcaption": {},
	"figure":     {},
	"footer":     {},
	"header":     {},
	"hgroup":     {},
	"main":       {},
	"nav":        {},
	"output":     {},
	"progress":   {},
	"section":    {},
	"video":      {},
}
144
// Renderer is the rendering interface.
// This is mostly of interest if you are implementing a new rendering format.
//
// Render receives the root of a parsed syntax tree and returns the rendered
// document; Markdown passes it the tree produced by Parse and returns its
// result unchanged.
//
// RenderNode writes the output for a single node to w and returns a
// WalkStatus. The entering flag has the same shape as the one passed to
// Node.Walk callbacks; presumably it distinguishes the visit on the way into
// a node from the visit on the way out — confirm against Node.Walk.
//
// NOTE(review): earlier documentation of this interface described methods
// taking pre-rendered byte slices and content callbacks. The current method
// set has neither, so that description was dropped here.
//
// Only an HTML implementation is provided in this repository,
// see the README for external implementations.
type Renderer interface {
	Render(ast *Node) []byte
	RenderNode(w io.Writer, node *Node, entering bool) WalkStatus
}
162
// inlineParser is the callback type for inline parsing. One such function is
// registered in parser.inlineCallback for each byte that triggers a response
// when parsing inline data. It receives the parser, the data being scanned
// and an offset into it, and returns an int and a node.
// NOTE(review): the returned int is presumably the number of bytes consumed;
// confirm against the inline parsing loop.
type inlineParser func(p *parser, data []byte, offset int) (int, *Node)
166
// parser holds runtime state used while parsing a document.
// It is constructed by the Parse function.
type parser struct {
	refOverride    ReferenceOverrideFunc  // optional hook consulted by getRef before refs
	refs           map[string]*reference  // reference definitions keyed by lower-cased id
	inlineCallback [256]inlineParser      // inline handlers indexed by their trigger byte
	flags          Extensions             // enabled extensions
	nesting        int
	maxNesting     int // set to 16 by Parse
	insideLink     bool

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference

	doc                  *Node // root Document node
	tip                  *Node // = doc
	oldTip               *Node
	lastMatchedContainer *Node // = doc
	allClosed            bool
}
189
190func (p *parser) getRef(refid string) (ref *reference, found bool) {
191	if p.refOverride != nil {
192		r, overridden := p.refOverride(refid)
193		if overridden {
194			if r == nil {
195				return nil, false
196			}
197			return &reference{
198				link:     []byte(r.Link),
199				title:    []byte(r.Title),
200				noteID:   0,
201				hasBlock: false,
202				text:     []byte(r.Text)}, true
203		}
204	}
205	// refs are case insensitive
206	ref, found = p.refs[strings.ToLower(refid)]
207	return ref, found
208}
209
210func (p *parser) finalize(block *Node) {
211	above := block.Parent
212	block.open = false
213	p.tip = above
214}
215
216func (p *parser) addChild(node NodeType, offset uint32) *Node {
217	return p.addExistingChild(NewNode(node), offset)
218}
219
220func (p *parser) addExistingChild(node *Node, offset uint32) *Node {
221	for !p.tip.canContain(node.Type) {
222		p.finalize(p.tip)
223	}
224	p.tip.AppendChild(node)
225	p.tip = node
226	return node
227}
228
229func (p *parser) closeUnmatchedBlocks() {
230	if !p.allClosed {
231		for p.oldTip != p.lastMatchedContainer {
232			parent := p.oldTip.Parent
233			p.finalize(p.oldTip)
234			p.oldTip = parent
235		}
236		p.allClosed = true
237	}
238}
239
240//
241//
242// Public interface
243//
244//
245
// Reference represents the details of a link.
// It is the value a ReferenceOverrideFunc returns to supply a link target.
// See the documentation in Options for more details on use-case.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
257
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not
// found and the default logic is skipped.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
263
// Options represents configurable overrides and callbacks (in addition to the
// extension flag set) for configuring a Markdown parse.
// The zero value enables no extensions and installs no override.
type Options struct {
	// Extensions is a flag set of bit-wise ORed extension bits. See the
	// Extensions flags defined in this package.
	Extensions Extensions

	// ReferenceOverride is an optional function callback that is called every
	// time a reference is resolved.
	//
	// In Markdown, the link reference syntax can be made to resolve a link to
	// a reference instead of an inline URL, in one of the following ways:
	//
	//  * [link text][refid]
	//  * [refid][]
	//
	// Usually, the refid is defined at the bottom of the Markdown document. If
	// this override function is provided, the refid is passed to the override
	// function first, before consulting the defined refids at the bottom. If
	// the override function indicates an override did not occur, the refids at
	// the bottom will be used to fill in the link details.
	ReferenceOverride ReferenceOverrideFunc
}
287
288// MarkdownBasic is a convenience function for simple rendering.
289// It processes markdown input with no extensions enabled.
290func MarkdownBasic(input []byte) []byte {
291	// set up the HTML renderer
292	renderer := NewHTMLRenderer(HTMLRendererParameters{
293		Flags:      UseXHTML,
294		Extensions: CommonExtensions,
295	})
296
297	// set up the parser
298	return Markdown(input, renderer, Options{})
299}
300
301// MarkdownCommon is a convenience function for simple rendering. It calls
302// Markdown with most useful extensions enabled, including:
303//
304// * Smartypants processing with smart fractions and LaTeX dashes
305//
306// * Intra-word emphasis suppression
307//
308// * Tables
309//
310// * Fenced code blocks
311//
312// * Autolinking
313//
314// * Strikethrough support
315//
316// * Strict header parsing
317//
318// * Custom Header IDs
319func MarkdownCommon(input []byte) []byte {
320	// set up the HTML renderer
321	renderer := NewHTMLRenderer(HTMLRendererParameters{
322		Flags:      CommonHTMLFlags,
323		Extensions: CommonExtensions,
324	})
325	return Markdown(input, renderer, DefaultOptions)
326}
327
328// Markdown is the main rendering function.
329// It parses and renders a block of markdown-encoded text.
330// The supplied Renderer is used to format the output, and extensions dictates
331// which non-standard extensions are enabled.
332//
333// To use the supplied HTML renderer, see NewHTMLRenderer.
334func Markdown(input []byte, renderer Renderer, options Options) []byte {
335	if renderer == nil {
336		return nil
337	}
338	return renderer.Render(Parse(input, options))
339}
340
341// Parse is an entry point to the parsing part of Blackfriday. It takes an
342// input markdown document and produces a syntax tree for its contents. This
343// tree can then be rendered with a default or custom renderer, or
344// analyzed/transformed by the caller to whatever non-standard needs they have.
345func Parse(input []byte, opts Options) *Node {
346	extensions := opts.Extensions
347
348	// fill in the render structure
349	p := new(parser)
350	p.flags = extensions
351	p.refOverride = opts.ReferenceOverride
352	p.refs = make(map[string]*reference)
353	p.maxNesting = 16
354	p.insideLink = false
355
356	docNode := NewNode(Document)
357	p.doc = docNode
358	p.tip = docNode
359	p.oldTip = docNode
360	p.lastMatchedContainer = docNode
361	p.allClosed = true
362
363	// register inline parsers
364	p.inlineCallback[' '] = maybeLineBreak
365	p.inlineCallback['*'] = emphasis
366	p.inlineCallback['_'] = emphasis
367	if extensions&Strikethrough != 0 {
368		p.inlineCallback['~'] = emphasis
369	}
370	p.inlineCallback['`'] = codeSpan
371	p.inlineCallback['\n'] = lineBreak
372	p.inlineCallback['['] = link
373	p.inlineCallback['<'] = leftAngle
374	p.inlineCallback['\\'] = escape
375	p.inlineCallback['&'] = entity
376	p.inlineCallback['!'] = maybeImage
377	p.inlineCallback['^'] = maybeInlineFootnote
378
379	if extensions&Autolink != 0 {
380		p.inlineCallback['h'] = maybeAutoLink
381		p.inlineCallback['m'] = maybeAutoLink
382		p.inlineCallback['f'] = maybeAutoLink
383		p.inlineCallback['H'] = maybeAutoLink
384		p.inlineCallback['M'] = maybeAutoLink
385		p.inlineCallback['F'] = maybeAutoLink
386	}
387
388	if extensions&Footnotes != 0 {
389		p.notes = make([]*reference, 0)
390	}
391
392	p.block(preprocess(p, input))
393	// Walk the tree and finish up some of unfinished blocks
394	for p.tip != nil {
395		p.finalize(p.tip)
396	}
397	// Walk the tree again and process inline markdown in each block
398	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
399		if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
400			p.inline(node, node.content)
401			node.content = nil
402		}
403		return GoToNext
404	})
405	p.parseRefsToAST()
406	return p.doc
407}
408
409func (p *parser) parseRefsToAST() {
410	if p.flags&Footnotes == 0 || len(p.notes) == 0 {
411		return
412	}
413	p.tip = p.doc
414	block := p.addBlock(List, nil)
415	block.IsFootnotesList = true
416	block.ListFlags = ListTypeOrdered
417	flags := ListItemBeginningOfList
418	// Note: this loop is intentionally explicit, not range-form. This is
419	// because the body of the loop will append nested footnotes to p.notes and
420	// we need to process those late additions. Range form would only walk over
421	// the fixed initial set.
422	for i := 0; i < len(p.notes); i++ {
423		ref := p.notes[i]
424		p.addExistingChild(ref.footnote, 0)
425		block := ref.footnote
426		block.ListFlags = flags | ListTypeOrdered
427		block.RefLink = ref.link
428		if ref.hasBlock {
429			flags |= ListItemContainsBlock
430			p.block(ref.title)
431		} else {
432			p.inline(block, ref.title)
433		}
434		flags &^= ListItemBeginningOfList | ListItemContainsBlock
435	}
436	above := block.Parent
437	finalizeList(block)
438	p.tip = above
439	block.Walk(func(node *Node, entering bool) WalkStatus {
440		if node.Type == Paragraph || node.Type == Header {
441			p.inline(node, node.content)
442			node.content = nil
443		}
444		return GoToNext
445	})
446}
447
448// preprocess does a preparatory first pass over the input:
449// - normalize newlines
450// - expand tabs (outside of fenced code blocks)
451// - copy everything else
452func preprocess(p *parser, input []byte) []byte {
453	var out bytes.Buffer
454	tabSize := TabSizeDefault
455	if p.flags&TabSizeEight != 0 {
456		tabSize = TabSizeDouble
457	}
458	beg := 0
459	lastFencedCodeBlockEnd := 0
460	for beg < len(input) {
461		// Find end of this line, then process the line.
462		end := beg
463		for end < len(input) && input[end] != '\n' && input[end] != '\r' {
464			end++
465		}
466
467		if p.flags&FencedCode != 0 {
468			// track fenced code block boundaries to suppress tab expansion
469			// and reference extraction inside them:
470			if beg >= lastFencedCodeBlockEnd {
471				if i := p.fencedCodeBlock(input[beg:], false); i > 0 {
472					lastFencedCodeBlockEnd = beg + i
473				}
474			}
475		}
476
477		// add the line body if present
478		if end > beg {
479			if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
480				out.Write(input[beg:end])
481			} else {
482				expandTabs(&out, input[beg:end], tabSize)
483			}
484		}
485
486		if end < len(input) && input[end] == '\r' {
487			end++
488		}
489		if end < len(input) && input[end] == '\n' {
490			end++
491		}
492		out.WriteByte('\n')
493
494		beg = end
495	}
496
497	// empty input?
498	if out.Len() == 0 {
499		out.WriteByte('\n')
500	}
501
502	return out.Bytes()
503}
504
505//
506// Link references
507//
508// This section implements support for references that (usually) appear
509// as footnotes in a document, and can be referenced anywhere in the document.
510// The basic format is:
511//
512//    [1]: http://www.google.com/ "Google"
513//    [2]: http://www.github.com/ "Github"
514//
515// Anywhere in the document, the reference can be linked by referring to its
516// label, i.e., 1 and 2 in this example, as in:
517//
518//    This library is hosted on [Github][2], a git hosting site.
519//
520// Actual footnotes as specified in Pandoc and supported by some other Markdown
521// libraries such as php-markdown are also taken care of. They look like this:
522//
523//    This sentence needs a bit of further explanation.[^note]
524//
525//    [^note]: This is the explanation.
526//
527// Footnotes should be placed at the end of the document in an ordered list.
528// Inline footnotes such as:
529//
530//    Inline footnotes^[Not supported.] also exist.
531//
532// are not yet supported.
533
534// reference holds all information necessary for a reference-style links or
535// footnotes.
536//
537// Consider this markdown with reference-style links:
538//
539//     [link][ref]
540//
541//     [ref]: /url/ "tooltip title"
542//
543// It will be ultimately converted to this HTML:
544//
//     <p><a href=\"/url/\" title=\"tooltip title\">link</a></p>
546//
547// And a reference structure will be populated as follows:
548//
549//     p.refs["ref"] = &reference{
550//         link: "/url/",
551//         title: "tooltip title",
552//     }
553//
554// Alternatively, reference can contain information about a footnote. Consider
555// this markdown:
556//
557//     Text needing a footnote.[^a]
558//
559//     [^a]: This is the note
560//
561// A reference structure will be populated as follows:
562//
563//     p.refs["a"] = &reference{
564//         link: "a",
565//         title: "This is the note",
566//         noteID: <some positive int>,
567//     }
568//
569// TODO: As you can see, it begs for splitting into two dedicated structures
570// for refs and for footnotes.
type reference struct {
	link     []byte // link target; for footnotes, the footnote id instead
	title    []byte // link title; for footnotes, the footnote's text
	noteID   int    // 0 if not a footnote ref
	hasBlock bool   // footnote text continues onto indented lines (set by scanFootnote)
	footnote *Node  // a link to the Item node within a list of footnotes

	text []byte // only gets populated by refOverride feature with Reference.Text
}
580
581func (r *reference) String() string {
582	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
583		r.link, r.title, r.text, r.noteID, r.hasBlock)
584}
585
586// Check whether or not data starts with a reference link.
587// If so, it is parsed and stored in the list of references
588// (in the render struct).
589// Returns the number of bytes to skip to move past it,
590// or zero if the first line is not a reference.
591func isReference(p *parser, data []byte, tabSize int) int {
592	// up to 3 optional leading spaces
593	if len(data) < 4 {
594		return 0
595	}
596	i := 0
597	for i < 3 && data[i] == ' ' {
598		i++
599	}
600
601	noteID := 0
602
603	// id part: anything but a newline between brackets
604	if data[i] != '[' {
605		return 0
606	}
607	i++
608	if p.flags&Footnotes != 0 {
609		if i < len(data) && data[i] == '^' {
610			// we can set it to anything here because the proper noteIds will
611			// be assigned later during the second pass. It just has to be != 0
612			noteID = 1
613			i++
614		}
615	}
616	idOffset := i
617	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
618		i++
619	}
620	if i >= len(data) || data[i] != ']' {
621		return 0
622	}
623	idEnd := i
624	// footnotes can have empty ID, like this: [^], but a reference can not be
625	// empty like this: []. Break early if it's not a footnote and there's no ID
626	if noteID == 0 && idOffset == idEnd {
627		return 0
628	}
629	// spacer: colon (space | tab)* newline? (space | tab)*
630	i++
631	if i >= len(data) || data[i] != ':' {
632		return 0
633	}
634	i++
635	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
636		i++
637	}
638	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
639		i++
640		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
641			i++
642		}
643	}
644	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
645		i++
646	}
647	if i >= len(data) {
648		return 0
649	}
650
651	var (
652		linkOffset, linkEnd   int
653		titleOffset, titleEnd int
654		lineEnd               int
655		raw                   []byte
656		hasBlock              bool
657	)
658
659	if p.flags&Footnotes != 0 && noteID != 0 {
660		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
661		lineEnd = linkEnd
662	} else {
663		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
664	}
665	if lineEnd == 0 {
666		return 0
667	}
668
669	// a valid ref has been found
670
671	ref := &reference{
672		noteID:   noteID,
673		hasBlock: hasBlock,
674	}
675
676	if noteID > 0 {
677		// reusing the link field for the id since footnotes don't have links
678		ref.link = data[idOffset:idEnd]
679		// if footnote, it's not really a title, it's the contained text
680		ref.title = raw
681	} else {
682		ref.link = data[linkOffset:linkEnd]
683		ref.title = data[titleOffset:titleEnd]
684	}
685
686	// id matches are case-insensitive
687	id := string(bytes.ToLower(data[idOffset:idEnd]))
688
689	p.refs[id] = ref
690
691	return lineEnd
692}
693
694func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
695	// link: whitespace-free sequence, optionally between angle brackets
696	if data[i] == '<' {
697		i++
698	}
699	linkOffset = i
700	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
701		i++
702	}
703	if i == len(data) {
704		return
705	}
706	linkEnd = i
707	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
708		linkOffset++
709		linkEnd--
710	}
711
712	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
713	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
714		i++
715	}
716	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
717		return
718	}
719
720	// compute end-of-line
721	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
722		lineEnd = i
723	}
724	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
725		lineEnd++
726	}
727
728	// optional (space|tab)* spacer after a newline
729	if lineEnd > 0 {
730		i = lineEnd + 1
731		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
732			i++
733		}
734	}
735
736	// optional title: any non-newline sequence enclosed in '"() alone on its line
737	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
738		i++
739		titleOffset = i
740
741		// look for EOL
742		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
743			i++
744		}
745		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
746			titleEnd = i + 1
747		} else {
748			titleEnd = i
749		}
750
751		// step back
752		i--
753		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
754			i--
755		}
756		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
757			lineEnd = titleEnd
758			titleEnd = i
759		}
760	}
761
762	return
763}
764
765// The first bit of this logic is the same as (*parser).listItem, but the rest
766// is much simpler. This function simply finds the entire block and shifts it
767// over by one tab if it is indeed a block (just returns the line if it's not).
768// blockEnd is the end of the section in the input buffer, and contents is the
769// extracted text that was shifted over one tab. It will need to be rendered at
770// the end of the document.
771func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
772	if i == 0 || len(data) == 0 {
773		return
774	}
775
776	// skip leading whitespace on first line
777	for i < len(data) && data[i] == ' ' {
778		i++
779	}
780
781	blockStart = i
782
783	// find the end of the line
784	blockEnd = i
785	for i < len(data) && data[i-1] != '\n' {
786		i++
787	}
788
789	// get working buffer
790	var raw bytes.Buffer
791
792	// put the first line into the working buffer
793	raw.Write(data[blockEnd:i])
794	blockEnd = i
795
796	// process the following lines
797	containsBlankLine := false
798
799gatherLines:
800	for blockEnd < len(data) {
801		i++
802
803		// find the end of this line
804		for i < len(data) && data[i-1] != '\n' {
805			i++
806		}
807
808		// if it is an empty line, guess that it is part of this item
809		// and move on to the next line
810		if p.isEmpty(data[blockEnd:i]) > 0 {
811			containsBlankLine = true
812			blockEnd = i
813			continue
814		}
815
816		n := 0
817		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
818			// this is the end of the block.
819			// we don't want to include this last line in the index.
820			break gatherLines
821		}
822
823		// if there were blank lines before this one, insert a new one now
824		if containsBlankLine {
825			raw.WriteByte('\n')
826			containsBlankLine = false
827		}
828
829		// get rid of that first tab, write to buffer
830		raw.Write(data[blockEnd+n : i])
831		hasBlock = true
832
833		blockEnd = i
834	}
835
836	if data[blockEnd-1] != '\n' {
837		raw.WriteByte('\n')
838	}
839
840	contents = raw.Bytes()
841
842	return
843}
844
845//
846//
847// Miscellaneous helper functions
848//
849//
850
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// These characters form four contiguous ranges of the ASCII table.
func ispunct(c byte) bool {
	switch {
	case '!' <= c && c <= '/',
		':' <= c && c <= '@',
		'[' <= c && c <= '`',
		'{' <= c && c <= '~':
		return true
	}
	return false
}
861
// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
866
// isletter reports whether c is an ASCII letter (a-z or A-Z).
func isletter(c byte) bool {
	// Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and maps no other byte into
	// that range, so a single range check covers both cases.
	lower := c | 0x20
	return 'a' <= lower && lower <= 'z'
}
871
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9',
		'a' <= c && c <= 'z',
		'A' <= c && c <= 'Z':
		return true
	}
	return false
}
877
// expandTabs writes line to out with every tab character replaced by
// spaces, aligning to the next multiple of tabSize columns. Columns are
// counted in runes, not bytes. Nothing is appended after the line; in
// particular no newline is written.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// Common cases first: no tabs at all, or tabs only at the beginning of
	// the line. Each leading tab is then exactly tabSize spaces and the rest
	// can be copied without decoding any runes.
	leading := 0
	for leading < len(line) && line[leading] == '\t' {
		leading++
	}
	if bytes.IndexByte(line[leading:], '\t') < 0 {
		for n := leading * tabSize; n > 0; n-- {
			out.WriteByte(' ')
		}
		out.Write(line[leading:])
		return
	}

	// The slow case: a tab follows other text, so runes have to be counted
	// to figure out how many spaces each tab stands for.
	col := 0
	for pos := 0; pos < len(line); pos++ {
		// copy the run of non-tab runes, advancing the column per rune
		chunk := pos
		for pos < len(line) && line[pos] != '\t' {
			_, width := utf8.DecodeRune(line[pos:])
			pos += width
			col++
		}
		if pos > chunk {
			out.Write(line[chunk:pos])
		}
		if pos >= len(line) {
			break
		}

		// line[pos] is a tab: pad with at least one space up to the next
		// tab stop; the loop's post statement then steps over the tab
		for {
			out.WriteByte(' ')
			col++
			if col%tabSize == 0 {
				break
			}
		}
	}
}
935
// isIndented reports whether a line counts as indented.
// It returns the number of bytes the indent occupies: 1 for a leading tab,
// indentSize for a run of indentSize leading spaces, and 0 if the line is
// not indented (or empty).
func isIndented(data []byte, indentSize int) int {
	switch {
	case len(data) == 0:
		return 0
	case data[0] == '\t':
		return 1
	case len(data) < indentSize:
		return 0
	}
	// every one of the first indentSize bytes has to be a space
	for k := indentSize - 1; k >= 0; k-- {
		if data[k] != ' ' {
			return 0
		}
	}
	return indentSize
}
955
// slugify creates a URL-safe slug for fragments.
//
// ASCII letters and digits are kept as they are. Every run of other bytes
// (including each byte of a multi-byte UTF-8 sequence) collapses into a
// single '-', and a dash at the very beginning or end of the result is
// trimmed. Input made up solely of such bytes yields "-". Empty input is
// returned unchanged.
func slugify(in []byte) []byte {
	if len(in) == 0 {
		return in
	}

	slug := make([]byte, 0, len(in))
	lastWasDash := false
	for _, c := range in {
		alnum := ('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
		switch {
		case alnum:
			slug = append(slug, c)
			lastWasDash = false
		case !lastWasDash:
			slug = append(slug, '-')
			lastWasDash = true
		}
	}

	// Trim the dashes at both ends. Neither index moves past the other end
	// of the slice, so at least one byte always remains.
	first := 0
	for first < len(slug)-1 && slug[first] == '-' {
		first++
	}
	last := len(slug) - 1
	for last > 0 && slug[last] == '-' {
		last--
	}
	return slug[first : last+1]
}