markdown.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// Markdown parsing and processing
13//
14//
15
16// Blackfriday markdown processor.
17//
18// Translates plain text with simple formatting rules into HTML or LaTeX.
19package blackfriday
20
21import (
22 "bytes"
23 "fmt"
24 "strings"
25 "unicode/utf8"
26)
27
28const VERSION = "1.4"
29
30type Extensions int
31
32// These are the supported markdown parsing extensions.
33// OR these values together to select multiple extensions.
34const (
35 NoExtensions Extensions = 0
36 NoIntraEmphasis Extensions = 1 << iota // Ignore emphasis markers inside words
37 Tables // Render tables
38 FencedCode // Render fenced code blocks
39 Autolink // Detect embedded URLs that are not explicitly marked
40 Strikethrough // Strikethrough text using ~~test~~
41 LaxHTMLBlocks // Loosen up HTML block parsing rules
42 SpaceHeaders // Be strict about prefix header rules
43 HardLineBreak // Translate newlines into line breaks
44 TabSizeEight // Expand tabs to eight spaces instead of four
45 Footnotes // Pandoc-style footnotes
46 NoEmptyLineBeforeBlock // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
47 HeaderIDs // specify header IDs with {#id}
48 Titleblock // Titleblock ala pandoc
49 AutoHeaderIDs // Create the header ID from the text
50 BackslashLineBreak // Translate trailing backslashes into line breaks
51 DefinitionLists // Render definition lists
52 Smartypants // Enable smart punctuation substitutions
53 SmartypantsFractions // Enable smart fractions (with Smartypants)
54 SmartypantsDashes // Enable smart dashes (with Smartypants)
55 SmartypantsLatexDashes // Enable LaTeX-style dashes (with Smartypants)
56 SmartypantsAngledQuotes // Enable angled double quotes (with Smartypants) for double quotes rendering
57 TOC // Generate a table of contents
58 OmitContents // Skip the main contents (for a standalone table of contents)
59
60 CommonHtmlFlags HTMLFlags = UseXHTML
61
62 CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
63 Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
64 BackslashLineBreak | DefinitionLists | Smartypants |
65 SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
66)
67
68var DefaultOptions = Options{
69 Extensions: CommonExtensions,
70}
71
72type LinkType int
73
74// These are the possible flag values for the link renderer.
75// Only a single one of these values will be used; they are not ORed together.
76// These are mostly of interest if you are writing a new output format.
77const (
78 LinkTypeNotAutolink LinkType = iota
79 LinkTypeNormal
80 LinkTypeEmail
81)
82
83type ListType int
84
85// These are the possible flag values for the ListItem renderer.
86// Multiple flag values may be ORed together.
87// These are mostly of interest if you are writing a new output format.
88const (
89 ListTypeOrdered ListType = 1 << iota
90 ListTypeDefinition
91 ListTypeTerm
92
93 ListItemContainsBlock
94 ListItemBeginningOfList
95 ListItemEndOfList
96)
97
98type CellAlignFlags int
99
100// These are the possible flag values for the table cell renderer.
101// Only a single one of these values will be used; they are not ORed together.
102// These are mostly of interest if you are writing a new output format.
103const (
104 TableAlignmentLeft = 1 << iota
105 TableAlignmentRight
106 TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
107)
108
109// The size of a tab stop.
110const (
111 TabSizeDefault = 4
112 TabSizeDouble = 8
113)
114
115// blockTags is a set of tags that are recognized as HTML block tags.
116// Any of these can be included in markdown text without special escaping.
117var blockTags = map[string]struct{}{
118 "blockquote": struct{}{},
119 "del": struct{}{},
120 "div": struct{}{},
121 "dl": struct{}{},
122 "fieldset": struct{}{},
123 "form": struct{}{},
124 "h1": struct{}{},
125 "h2": struct{}{},
126 "h3": struct{}{},
127 "h4": struct{}{},
128 "h5": struct{}{},
129 "h6": struct{}{},
130 "iframe": struct{}{},
131 "ins": struct{}{},
132 "math": struct{}{},
133 "noscript": struct{}{},
134 "ol": struct{}{},
135 "pre": struct{}{},
136 "p": struct{}{},
137 "script": struct{}{},
138 "style": struct{}{},
139 "table": struct{}{},
140 "ul": struct{}{},
141
142 // HTML5
143 "address": struct{}{},
144 "article": struct{}{},
145 "aside": struct{}{},
146 "canvas": struct{}{},
147 "figcaption": struct{}{},
148 "figure": struct{}{},
149 "footer": struct{}{},
150 "header": struct{}{},
151 "hgroup": struct{}{},
152 "main": struct{}{},
153 "nav": struct{}{},
154 "output": struct{}{},
155 "progress": struct{}{},
156 "section": struct{}{},
157 "video": struct{}{},
158}
159
160// Renderer is the rendering interface.
161// This is mostly of interest if you are implementing a new rendering format.
162//
163// When a byte slice is provided, it contains the (rendered) contents of the
164// element.
165//
166// When a callback is provided instead, it will write the contents of the
167// respective element directly to the output buffer and return true on success.
168// If the callback returns false, the rendering function should reset the
169// output buffer as though it had never been called.
170//
171// Currently Html and Latex implementations are provided
172type Renderer interface {
173 // block-level callbacks
174 BlockCode(text []byte, lang string)
175 BlockQuote(text []byte)
176 BlockHtml(text []byte)
177 BeginHeader(level int, id string)
178 EndHeader(level int, id string, header []byte)
179 HRule()
180 BeginList(flags ListType)
181 EndList(flags ListType)
182 ListItem(text []byte, flags ListType)
183 BeginParagraph()
184 EndParagraph()
185 Table(header []byte, body []byte, columnData []CellAlignFlags)
186 TableRow(text []byte)
187 TableHeaderCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
188 TableCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
189 BeginFootnotes()
190 EndFootnotes()
191 FootnoteItem(name, text []byte, flags ListType)
192 TitleBlock(text []byte)
193
194 // Span-level callbacks
195 AutoLink(link []byte, kind LinkType)
196 CodeSpan(text []byte)
197 DoubleEmphasis(text []byte)
198 Emphasis(text []byte)
199 Image(link []byte, title []byte, alt []byte)
200 LineBreak()
201 Link(link []byte, title []byte, content []byte)
202 RawHtmlTag(tag []byte)
203 TripleEmphasis(text []byte)
204 StrikeThrough(text []byte)
205 FootnoteRef(ref []byte, id int)
206
207 // Low-level callbacks
208 Entity(entity []byte)
209 NormalText(text []byte)
210
211 // Header and footer
212 DocumentHeader()
213 DocumentFooter()
214
215 Write(b []byte) (int, error)
216
217 Render(ast *Node) []byte
218}
219
220// Callback functions for inline parsing. One such function is defined
221// for each character that triggers a response when parsing inline data.
222type inlineParser func(p *parser, data []byte, offset int) int
223
224// Parser holds runtime state used by the parser.
225// This is constructed by the Markdown function.
226type parser struct {
227 refOverride ReferenceOverrideFunc
228 refs map[string]*reference
229 inlineCallback [256]inlineParser
230 flags Extensions
231 nesting int
232 maxNesting int
233 insideLink bool
234
235 // Footnotes need to be ordered as well as available to quickly check for
236 // presence. If a ref is also a footnote, it's stored both in refs and here
237 // in notes. Slice is nil if footnotes not enabled.
238 notes []*reference
239
240 doc *Node
241 tip *Node // = doc
242 oldTip *Node
243 lastMatchedContainer *Node // = doc
244 allClosed bool
245 currBlock *Node // a block node currently being parsed by inline parser
246}
247
248func (p *parser) getRef(refid string) (ref *reference, found bool) {
249 if p.refOverride != nil {
250 r, overridden := p.refOverride(refid)
251 if overridden {
252 if r == nil {
253 return nil, false
254 }
255 return &reference{
256 link: []byte(r.Link),
257 title: []byte(r.Title),
258 noteId: 0,
259 hasBlock: false,
260 text: []byte(r.Text)}, true
261 }
262 }
263 // refs are case insensitive
264 ref, found = p.refs[strings.ToLower(refid)]
265 return ref, found
266}
267
268func (p *parser) finalize(block *Node) {
269 above := block.Parent
270 block.open = false
271 p.tip = above
272}
273
274func (p *parser) addChild(node NodeType, offset uint32) *Node {
275 for !p.tip.canContain(node) {
276 p.finalize(p.tip)
277 }
278 newNode := NewNode(node)
279 newNode.content = []byte{}
280 p.tip.appendChild(newNode)
281 p.tip = newNode
282 return newNode
283}
284
285func (p *parser) closeUnmatchedBlocks() {
286 if !p.allClosed {
287 for p.oldTip != p.lastMatchedContainer {
288 parent := p.oldTip.Parent
289 p.finalize(p.oldTip)
290 p.oldTip = parent
291 }
292 p.allClosed = true
293 }
294}
295
296//
297//
298// Public interface
299//
300//
301
302// Reference represents the details of a link.
303// See the documentation in Options for more details on use-case.
304type Reference struct {
305 // Link is usually the URL the reference points to.
306 Link string
307 // Title is the alternate text describing the link in more detail.
308 Title string
309 // Text is the optional text to override the ref with if the syntax used was
310 // [refid][]
311 Text string
312}
313
314// ReferenceOverrideFunc is expected to be called with a reference string and
315// return either a valid Reference type that the reference string maps to or
316// nil. If overridden is false, the default reference logic will be executed.
317// See the documentation in Options for more details on use-case.
318type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
319
320// Options represents configurable overrides and callbacks (in addition to the
321// extension flag set) for configuring a Markdown parse.
322type Options struct {
323 // Extensions is a flag set of bit-wise ORed extension bits. See the
324 // Extensions flags defined in this package.
325 Extensions Extensions
326
327 // ReferenceOverride is an optional function callback that is called every
328 // time a reference is resolved.
329 //
330 // In Markdown, the link reference syntax can be made to resolve a link to
331 // a reference instead of an inline URL, in one of the following ways:
332 //
333 // * [link text][refid]
334 // * [refid][]
335 //
336 // Usually, the refid is defined at the bottom of the Markdown document. If
337 // this override function is provided, the refid is passed to the override
338 // function first, before consulting the defined refids at the bottom. If
339 // the override function indicates an override did not occur, the refids at
340 // the bottom will be used to fill in the link details.
341 ReferenceOverride ReferenceOverrideFunc
342}
343
344// MarkdownBasic is a convenience function for simple rendering.
345// It processes markdown input with no extensions enabled.
346func MarkdownBasic(input []byte) []byte {
347 // set up the HTML renderer
348 htmlFlags := UseXHTML
349 renderer := HTMLRenderer(htmlFlags, CommonExtensions, "", "")
350
351 // set up the parser
352 return MarkdownOptions(input, renderer, Options{Extensions: 0})
353}
354
355// Call Markdown with most useful extensions enabled
356// MarkdownCommon is a convenience function for simple rendering.
357// It processes markdown input with common extensions enabled, including:
358//
359// * Smartypants processing with smart fractions and LaTeX dashes
360//
361// * Intra-word emphasis suppression
362//
363// * Tables
364//
365// * Fenced code blocks
366//
367// * Autolinking
368//
369// * Strikethrough support
370//
371// * Strict header parsing
372//
373// * Custom Header IDs
374func MarkdownCommon(input []byte) []byte {
375 // set up the HTML renderer
376 renderer := HTMLRenderer(CommonHtmlFlags, CommonExtensions, "", "")
377 return MarkdownOptions(input, renderer, DefaultOptions)
378}
379
380// Markdown is the main rendering function.
381// It parses and renders a block of markdown-encoded text.
382// The supplied Renderer is used to format the output, and extensions dictates
383// which non-standard extensions are enabled.
384//
385// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
386// LatexRenderer, respectively.
387func Markdown(input []byte, renderer Renderer, extensions Extensions) []byte {
388 return MarkdownOptions(input, renderer, Options{
389 Extensions: extensions})
390}
391
392// MarkdownOptions is just like Markdown but takes additional options through
393// the Options struct.
394func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
395 // no point in parsing if we can't render
396 if renderer == nil {
397 return nil
398 }
399 return renderer.Render(Parse(input, opts))
400}
401
402func Parse(input []byte, opts Options) *Node {
403 extensions := opts.Extensions
404
405 // fill in the render structure
406 p := new(parser)
407 p.flags = extensions
408 p.refOverride = opts.ReferenceOverride
409 p.refs = make(map[string]*reference)
410 p.maxNesting = 16
411 p.insideLink = false
412
413 docNode := NewNode(Document)
414 p.doc = docNode
415 p.tip = docNode
416 p.oldTip = docNode
417 p.lastMatchedContainer = docNode
418 p.allClosed = true
419
420 // register inline parsers
421 p.inlineCallback['*'] = emphasis
422 p.inlineCallback['_'] = emphasis
423 if extensions&Strikethrough != 0 {
424 p.inlineCallback['~'] = emphasis
425 }
426 p.inlineCallback['`'] = codeSpan
427 p.inlineCallback['\n'] = lineBreak
428 p.inlineCallback['['] = link
429 p.inlineCallback['<'] = leftAngle
430 p.inlineCallback['\\'] = escape
431 p.inlineCallback['&'] = entity
432 p.inlineCallback['!'] = maybeImage
433 p.inlineCallback['^'] = maybeInlineFootnote
434
435 if extensions&Autolink != 0 {
436 p.inlineCallback['h'] = maybeAutoLink
437 p.inlineCallback['m'] = maybeAutoLink
438 p.inlineCallback['f'] = maybeAutoLink
439 p.inlineCallback['H'] = maybeAutoLink
440 p.inlineCallback['M'] = maybeAutoLink
441 p.inlineCallback['F'] = maybeAutoLink
442 }
443
444 if extensions&Footnotes != 0 {
445 p.notes = make([]*reference, 0)
446 }
447
448 first := firstPass(p, input)
449 secondPass(p, first)
450 // Walk the tree and finish up some of unfinished blocks
451 for p.tip != nil {
452 p.finalize(p.tip)
453 }
454 // Walk the tree again and process inline markdown in each block
455 p.doc.Walk(func(node *Node, entering bool) WalkStatus {
456 if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
457 p.currBlock = node
458 p.inline(node.content)
459 node.content = nil
460 }
461 return GoToNext
462 })
463 p.parseRefsToAST()
464 p.generateTOC()
465 return p.doc
466}
467
468func (p *parser) generateTOC() {
469 if p.flags&TOC == 0 && p.flags&OmitContents == 0 {
470 return
471 }
472 navNode := NewNode(HTMLBlock)
473 navNode.Literal = []byte("<nav>")
474 navNode.open = false
475
476 var topList *Node
477 var listNode *Node
478 var lastItem *Node
479 headerCount := 0
480 var currentLevel uint32
481 p.doc.Walk(func(node *Node, entering bool) WalkStatus {
482 if entering && node.Type == Header {
483 if node.Level > currentLevel {
484 currentLevel++
485 newList := NewNode(List)
486 if lastItem != nil {
487 lastItem.appendChild(newList)
488 listNode = newList
489 } else {
490 listNode = newList
491 topList = listNode
492 }
493 }
494 if node.Level < currentLevel {
495 finalizeList(listNode)
496 lastItem = listNode.Parent
497 listNode = lastItem.Parent
498 }
499 node.HeaderID = fmt.Sprintf("toc_%d", headerCount)
500 headerCount++
501 lastItem = NewNode(Item)
502 listNode.appendChild(lastItem)
503 anchorNode := NewNode(Link)
504 anchorNode.Destination = []byte("#" + node.HeaderID)
505 lastItem.appendChild(anchorNode)
506 anchorNode.appendChild(text(node.FirstChild.Literal))
507 }
508 return GoToNext
509 })
510 firstChild := p.doc.FirstChild
511 // Insert TOC only if there is anything to insert
512 if topList != nil {
513 finalizeList(topList)
514 firstChild.insertBefore(navNode)
515 firstChild.insertBefore(topList)
516 navCloseNode := NewNode(HTMLBlock)
517 navCloseNode.Literal = []byte("</nav>")
518 navCloseNode.open = false
519 firstChild.insertBefore(navCloseNode)
520 }
521 // Drop everything after the TOC if OmitContents was requested
522 if p.flags&OmitContents != 0 {
523 for firstChild != nil {
524 next := firstChild.Next
525 firstChild.unlink()
526 firstChild = next
527 }
528 }
529}
530
531func (p *parser) parseRefsToAST() {
532 if p.flags&Footnotes == 0 || len(p.notes) == 0 {
533 return
534 }
535 p.tip = p.doc
536 finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte(`<div class="footnotes">`)))
537 p.addBlock(HorizontalRule, nil)
538 block := p.addBlock(List, nil)
539 block.ListFlags = ListTypeOrdered
540 flags := ListItemBeginningOfList
541 // Note: this loop is intentionally explicit, not range-form. This is
542 // because the body of the loop will append nested footnotes to p.notes and
543 // we need to process those late additions. Range form would only walk over
544 // the fixed initial set.
545 for i := 0; i < len(p.notes); i++ {
546 ref := p.notes[i]
547 block := p.addBlock(Item, nil)
548 block.ListFlags = ListTypeOrdered
549 block.RefLink = ref.link
550 if ref.hasBlock {
551 flags |= ListItemContainsBlock
552 p.block(ref.title)
553 } else {
554 p.currBlock = block
555 p.inline(ref.title)
556 }
557 flags &^= ListItemBeginningOfList | ListItemContainsBlock
558 }
559 above := block.Parent
560 finalizeList(block)
561 p.tip = above
562 finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte("</div>")))
563 block.Walk(func(node *Node, entering bool) WalkStatus {
564 if node.Type == Paragraph || node.Type == Header {
565 p.currBlock = node
566 p.inline(node.content)
567 node.content = nil
568 }
569 return GoToNext
570 })
571}
572
573// first pass:
574// - extract references
575// - expand tabs
576// - normalize newlines
577// - copy everything else
578func firstPass(p *parser, input []byte) []byte {
579 var out bytes.Buffer
580 tabSize := TabSizeDefault
581 if p.flags&TabSizeEight != 0 {
582 tabSize = TabSizeDouble
583 }
584 beg, end := 0, 0
585 lastFencedCodeBlockEnd := 0
586 for beg < len(input) { // iterate over lines
587 if end = isReference(p, input[beg:], tabSize); end > 0 {
588 beg += end
589 } else { // skip to the next line
590 end = beg
591 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
592 end++
593 }
594
595 if p.flags&FencedCode != 0 {
596 // track fenced code block boundaries to suppress tab expansion
597 // inside them:
598 if beg >= lastFencedCodeBlockEnd {
599 if i := p.fencedCode(input[beg:], false); i > 0 {
600 lastFencedCodeBlockEnd = beg + i
601 }
602 }
603 }
604
605 // add the line body if present
606 if end > beg {
607 if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
608 out.Write(input[beg:end])
609 } else {
610 expandTabs(&out, input[beg:end], tabSize)
611 }
612 }
613 out.WriteByte('\n')
614
615 if end < len(input) && input[end] == '\r' {
616 end++
617 }
618 if end < len(input) && input[end] == '\n' {
619 end++
620 }
621
622 beg = end
623 }
624 }
625
626 // empty input?
627 if out.Len() == 0 {
628 out.WriteByte('\n')
629 }
630
631 return out.Bytes()
632}
633
634// second pass: actual rendering
635func secondPass(p *parser, input []byte) {
636 p.block(input)
637
638 if p.flags&Footnotes != 0 && len(p.notes) > 0 {
639 flags := ListItemBeginningOfList
640 for i := 0; i < len(p.notes); i += 1 {
641 ref := p.notes[i]
642 if ref.hasBlock {
643 flags |= ListItemContainsBlock
644 p.block(ref.title)
645 } else {
646 p.inline(ref.title)
647 }
648 flags &^= ListItemBeginningOfList | ListItemContainsBlock
649 }
650 }
651
652 if p.nesting != 0 {
653 panic("Nesting level did not end at zero")
654 }
655}
656
657//
658// Link references
659//
660// This section implements support for references that (usually) appear
661// as footnotes in a document, and can be referenced anywhere in the document.
662// The basic format is:
663//
664// [1]: http://www.google.com/ "Google"
665// [2]: http://www.github.com/ "Github"
666//
667// Anywhere in the document, the reference can be linked by referring to its
668// label, i.e., 1 and 2 in this example, as in:
669//
670// This library is hosted on [Github][2], a git hosting site.
671//
672// Actual footnotes as specified in Pandoc and supported by some other Markdown
673// libraries such as php-markdown are also taken care of. They look like this:
674//
675// This sentence needs a bit of further explanation.[^note]
676//
677// [^note]: This is the explanation.
678//
679// Footnotes should be placed at the end of the document in an ordered list.
680// Inline footnotes such as:
681//
682// Inline footnotes^[Not supported.] also exist.
683//
684// are not yet supported.
685
686// References are parsed and stored in this struct.
687type reference struct {
688 link []byte
689 title []byte
690 noteId int // 0 if not a footnote ref
691 hasBlock bool
692 text []byte
693}
694
695func (r *reference) String() string {
696 return fmt.Sprintf("{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}",
697 r.link, r.title, r.text, r.noteId, r.hasBlock)
698}
699
700// Check whether or not data starts with a reference link.
701// If so, it is parsed and stored in the list of references
702// (in the render struct).
703// Returns the number of bytes to skip to move past it,
704// or zero if the first line is not a reference.
705func isReference(p *parser, data []byte, tabSize int) int {
706 // up to 3 optional leading spaces
707 if len(data) < 4 {
708 return 0
709 }
710 i := 0
711 for i < 3 && data[i] == ' ' {
712 i++
713 }
714
715 noteId := 0
716
717 // id part: anything but a newline between brackets
718 if data[i] != '[' {
719 return 0
720 }
721 i++
722 if p.flags&Footnotes != 0 {
723 if i < len(data) && data[i] == '^' {
724 // we can set it to anything here because the proper noteIds will
725 // be assigned later during the second pass. It just has to be != 0
726 noteId = 1
727 i++
728 }
729 }
730 idOffset := i
731 for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
732 i++
733 }
734 if i >= len(data) || data[i] != ']' {
735 return 0
736 }
737 idEnd := i
738
739 // spacer: colon (space | tab)* newline? (space | tab)*
740 i++
741 if i >= len(data) || data[i] != ':' {
742 return 0
743 }
744 i++
745 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
746 i++
747 }
748 if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
749 i++
750 if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
751 i++
752 }
753 }
754 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
755 i++
756 }
757 if i >= len(data) {
758 return 0
759 }
760
761 var (
762 linkOffset, linkEnd int
763 titleOffset, titleEnd int
764 lineEnd int
765 raw []byte
766 hasBlock bool
767 )
768
769 if p.flags&Footnotes != 0 && noteId != 0 {
770 linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
771 lineEnd = linkEnd
772 } else {
773 linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
774 }
775 if lineEnd == 0 {
776 return 0
777 }
778
779 // a valid ref has been found
780
781 ref := &reference{
782 noteId: noteId,
783 hasBlock: hasBlock,
784 }
785
786 if noteId > 0 {
787 // reusing the link field for the id since footnotes don't have links
788 ref.link = data[idOffset:idEnd]
789 // if footnote, it's not really a title, it's the contained text
790 ref.title = raw
791 } else {
792 ref.link = data[linkOffset:linkEnd]
793 ref.title = data[titleOffset:titleEnd]
794 }
795
796 // id matches are case-insensitive
797 id := string(bytes.ToLower(data[idOffset:idEnd]))
798
799 p.refs[id] = ref
800
801 return lineEnd
802}
803
804func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
805 // link: whitespace-free sequence, optionally between angle brackets
806 if data[i] == '<' {
807 i++
808 }
809 linkOffset = i
810 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
811 i++
812 }
813 if i == len(data) {
814 return
815 }
816 linkEnd = i
817 if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
818 linkOffset++
819 linkEnd--
820 }
821
822 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
823 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
824 i++
825 }
826 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
827 return
828 }
829
830 // compute end-of-line
831 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
832 lineEnd = i
833 }
834 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
835 lineEnd++
836 }
837
838 // optional (space|tab)* spacer after a newline
839 if lineEnd > 0 {
840 i = lineEnd + 1
841 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
842 i++
843 }
844 }
845
846 // optional title: any non-newline sequence enclosed in '"() alone on its line
847 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
848 i++
849 titleOffset = i
850
851 // look for EOL
852 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
853 i++
854 }
855 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
856 titleEnd = i + 1
857 } else {
858 titleEnd = i
859 }
860
861 // step back
862 i--
863 for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
864 i--
865 }
866 if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
867 lineEnd = titleEnd
868 titleEnd = i
869 }
870 }
871
872 return
873}
874
875// The first bit of this logic is the same as (*parser).listItem, but the rest
876// is much simpler. This function simply finds the entire block and shifts it
877// over by one tab if it is indeed a block (just returns the line if it's not).
878// blockEnd is the end of the section in the input buffer, and contents is the
879// extracted text that was shifted over one tab. It will need to be rendered at
880// the end of the document.
881func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
882 if i == 0 || len(data) == 0 {
883 return
884 }
885
886 // skip leading whitespace on first line
887 for i < len(data) && data[i] == ' ' {
888 i++
889 }
890
891 blockStart = i
892
893 // find the end of the line
894 blockEnd = i
895 for i < len(data) && data[i-1] != '\n' {
896 i++
897 }
898
899 // get working buffer
900 var raw bytes.Buffer
901
902 // put the first line into the working buffer
903 raw.Write(data[blockEnd:i])
904 blockEnd = i
905
906 // process the following lines
907 containsBlankLine := false
908
909gatherLines:
910 for blockEnd < len(data) {
911 i++
912
913 // find the end of this line
914 for i < len(data) && data[i-1] != '\n' {
915 i++
916 }
917
918 // if it is an empty line, guess that it is part of this item
919 // and move on to the next line
920 if p.isEmpty(data[blockEnd:i]) > 0 {
921 containsBlankLine = true
922 blockEnd = i
923 continue
924 }
925
926 n := 0
927 if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
928 // this is the end of the block.
929 // we don't want to include this last line in the index.
930 break gatherLines
931 }
932
933 // if there were blank lines before this one, insert a new one now
934 if containsBlankLine {
935 raw.WriteByte('\n')
936 containsBlankLine = false
937 }
938
939 // get rid of that first tab, write to buffer
940 raw.Write(data[blockEnd+n : i])
941 hasBlock = true
942
943 blockEnd = i
944 }
945
946 if data[blockEnd-1] != '\n' {
947 raw.WriteByte('\n')
948 }
949
950 contents = raw.Bytes()
951
952 return
953}
954
955//
956//
957// Miscellaneous helper functions
958//
959//
960
961// Test if a character is a punctuation symbol.
962// Taken from a private function in regexp in the stdlib.
963func ispunct(c byte) bool {
964 for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
965 if c == r {
966 return true
967 }
968 }
969 return false
970}
971
972// Test if a character is a whitespace character.
973func isspace(c byte) bool {
974 return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
975}
976
977// Test if a character is letter.
978func isletter(c byte) bool {
979 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
980}
981
982// Test if a character is a letter or a digit.
983// TODO: check when this is looking for ASCII alnum and when it should use unicode
984func isalnum(c byte) bool {
985 return (c >= '0' && c <= '9') || isletter(c)
986}
987
988// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
989// always ends output with a newline
990func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
991 // first, check for common cases: no tabs, or only tabs at beginning of line
992 i, prefix := 0, 0
993 slowcase := false
994 for i = 0; i < len(line); i++ {
995 if line[i] == '\t' {
996 if prefix == i {
997 prefix++
998 } else {
999 slowcase = true
1000 break
1001 }
1002 }
1003 }
1004
1005 // no need to decode runes if all tabs are at the beginning of the line
1006 if !slowcase {
1007 for i = 0; i < prefix*tabSize; i++ {
1008 out.WriteByte(' ')
1009 }
1010 out.Write(line[prefix:])
1011 return
1012 }
1013
1014 // the slow case: we need to count runes to figure out how
1015 // many spaces to insert for each tab
1016 column := 0
1017 i = 0
1018 for i < len(line) {
1019 start := i
1020 for i < len(line) && line[i] != '\t' {
1021 _, size := utf8.DecodeRune(line[i:])
1022 i += size
1023 column++
1024 }
1025
1026 if i > start {
1027 out.Write(line[start:i])
1028 }
1029
1030 if i >= len(line) {
1031 break
1032 }
1033
1034 for {
1035 out.WriteByte(' ')
1036 column++
1037 if column%tabSize == 0 {
1038 break
1039 }
1040 }
1041
1042 i++
1043 }
1044}
1045
1046// Find if a line counts as indented or not.
1047// Returns number of characters the indent is (0 = not indented).
1048func isIndented(data []byte, indentSize int) int {
1049 if len(data) == 0 {
1050 return 0
1051 }
1052 if data[0] == '\t' {
1053 return 1
1054 }
1055 if len(data) < indentSize {
1056 return 0
1057 }
1058 for i := 0; i < indentSize; i++ {
1059 if data[i] != ' ' {
1060 return 0
1061 }
1062 }
1063 return indentSize
1064}
1065
1066// Create a url-safe slug for fragments
1067func slugify(in []byte) []byte {
1068 if len(in) == 0 {
1069 return in
1070 }
1071 out := make([]byte, 0, len(in))
1072 sym := false
1073
1074 for _, ch := range in {
1075 if isalnum(ch) {
1076 sym = false
1077 out = append(out, ch)
1078 } else if sym {
1079 continue
1080 } else {
1081 out = append(out, '-')
1082 sym = true
1083 }
1084 }
1085 var a, b int
1086 var ch byte
1087 for a, ch = range out {
1088 if ch != '-' {
1089 break
1090 }
1091 }
1092 for b = len(out) - 1; b > 0; b-- {
1093 if out[b] != '-' {
1094 break
1095 }
1096 }
1097 return out[a : b+1]
1098}