markdown.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// Markdown parsing and processing
13//
14//
15
// Package blackfriday is a markdown processor.
//
// It translates plain text with simple formatting rules into HTML or LaTeX.
19package blackfriday
20
21import (
22 "bytes"
23 "fmt"
24 "strings"
25 "unicode/utf8"
26)
27
// VERSION is the version string exposed by this package.
const VERSION = "1.4"
29
// Extensions is a bit set of markdown parsing extensions. It is stored in
// Options.Extensions and tested with bitwise AND by the parser.
type Extensions int

// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// NoExtensions occupies the first slot of the block, so iota is already 1
// when NoIntraEmphasis is declared: the flag bits start at 1<<1 and bit 0 is
// left unused.
const (
	NoExtensions            Extensions = 0
	NoIntraEmphasis         Extensions = 1 << iota // Ignore emphasis markers inside words
	Tables                                         // Render tables
	FencedCode                                     // Render fenced code blocks
	Autolink                                       // Detect embedded URLs that are not explicitly marked
	Strikethrough                                  // Strikethrough text using ~~test~~
	LaxHTMLBlocks                                  // Loosen up HTML block parsing rules
	SpaceHeaders                                   // Be strict about prefix header rules
	HardLineBreak                                  // Translate newlines into line breaks
	TabSizeEight                                   // Expand tabs to eight spaces instead of four
	Footnotes                                      // Pandoc-style footnotes
	NoEmptyLineBeforeBlock                         // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
	HeaderIDs                                      // specify header IDs with {#id}
	Titleblock                                     // Titleblock ala pandoc
	AutoHeaderIDs                                  // Create the header ID from the text
	BackslashLineBreak                             // Translate trailing backslashes into line breaks
	DefinitionLists                                // Render definition lists
	Smartypants                                    // Enable smart punctuation substitutions
	SmartypantsFractions                           // Enable smart fractions (with Smartypants)
	SmartypantsDashes                              // Enable smart dashes (with Smartypants)
	SmartypantsLatexDashes                         // Enable LaTeX-style dashes (with Smartypants)
	SmartypantsAngledQuotes                        // Enable angled double quotes (with Smartypants) for double quotes rendering
	TOC                                            // Generate a table of contents
	OmitContents                                   // Skip the main contents (for a standalone table of contents)

	// CommonHtmlFlags is the renderer flag set that MarkdownCommon passes to
	// HTMLRenderer.
	CommonHtmlFlags HTMLFlags = UseXHTML

	// CommonExtensions is the extension set used by MarkdownCommon and by
	// DefaultOptions.
	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
		BackslashLineBreak | DefinitionLists | Smartypants |
		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
)
67
// DefaultOptions is the Options value MarkdownCommon parses with; it enables
// CommonExtensions and sets no reference override.
var DefaultOptions = Options{
	Extensions: CommonExtensions,
}
71
// LinkType identifies the kind of link handed to Renderer.AutoLink.
type LinkType int

// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	LinkTypeNotAutolink LinkType = iota
	LinkTypeNormal
	LinkTypeEmail
)
82
// ListType is a set of flags describing a list or a list item. It is passed
// to the BeginList, EndList, ListItem and FootnoteItem renderer callbacks.
type ListType int

// These are the possible flag values for the ListItem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	ListTypeOrdered ListType = 1 << iota
	ListTypeDefinition
	ListTypeTerm

	ListItemContainsBlock
	ListItemBeginningOfList
	ListItemEndOfList
)
97
// CellAlignFlags describes the alignment of a table cell. It is the flag type
// taken by the Table, TableHeaderCell and TableCell renderer callbacks.
type CellAlignFlags int

// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// The constants are declared without an explicit type, so they are untyped
// integer constants rather than CellAlignFlags values. TableAlignmentCenter
// is defined as the combination of the left and right bits.
const (
	TableAlignmentLeft = 1 << iota
	TableAlignmentRight
	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
)
108
// The size of a tab stop, in columns. firstPass uses TabSizeDefault unless
// the TabSizeEight extension is set, in which case it uses TabSizeDouble.
const (
	TabSizeDefault = 4
	TabSizeDouble  = 8
)
114
// blockTags is the set of tag names treated as HTML block-level tags.
// Any of these can be included in markdown text without special escaping.
// The map is used purely for membership tests, hence the empty-struct values.
var blockTags = map[string]struct{}{
	"blockquote": {},
	"del":        {},
	"div":        {},
	"dl":         {},
	"fieldset":   {},
	"form":       {},
	"h1":         {},
	"h2":         {},
	"h3":         {},
	"h4":         {},
	"h5":         {},
	"h6":         {},
	"iframe":     {},
	"ins":        {},
	"math":       {},
	"noscript":   {},
	"ol":         {},
	"pre":        {},
	"p":          {},
	"script":     {},
	"style":      {},
	"table":      {},
	"ul":         {},

	// HTML5
	"address":    {},
	"article":    {},
	"aside":      {},
	"canvas":     {},
	"figcaption": {},
	"figure":     {},
	"footer":     {},
	"header":     {},
	"hgroup":     {},
	"main":       {},
	"nav":        {},
	"output":     {},
	"progress":   {},
	"section":    {},
	"video":      {},
}
159
// Renderer is the rendering interface.
// This is mostly of interest if you are implementing a new rendering format.
//
// When a byte slice is provided, it contains the (rendered) contents of the
// element.
//
// When a callback is provided instead, it will write the contents of the
// respective element directly to the output buffer and return true on success.
// If the callback returns false, the rendering function should reset the
// output buffer as though it had never been called.
//
// Currently Html and Latex implementations are provided
type Renderer interface {
	// block-level callbacks
	BlockCode(text []byte, lang string)
	BlockQuote(text []byte)
	BlockHtml(text []byte)
	BeginHeader(level int, id string)
	EndHeader(level int, id string, header []byte)
	HRule()
	BeginList(flags ListType)
	EndList(flags ListType)
	ListItem(text []byte, flags ListType)
	BeginParagraph()
	EndParagraph()
	Table(header []byte, body []byte, columnData []CellAlignFlags)
	TableRow(text []byte)
	TableHeaderCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
	TableCell(out *bytes.Buffer, text []byte, flags CellAlignFlags)
	BeginFootnotes()
	EndFootnotes()
	FootnoteItem(name, text []byte, flags ListType)
	TitleBlock(text []byte)

	// Span-level callbacks
	AutoLink(link []byte, kind LinkType)
	CodeSpan(text []byte)
	DoubleEmphasis(text []byte)
	Emphasis(text []byte)
	Image(link []byte, title []byte, alt []byte)
	LineBreak()
	Link(link []byte, title []byte, content []byte)
	RawHtmlTag(tag []byte)
	TripleEmphasis(text []byte)
	StrikeThrough(text []byte)
	FootnoteRef(ref []byte, id int)

	// Low-level callbacks
	Entity(entity []byte)
	NormalText(text []byte)

	// Header and footer
	DocumentHeader()
	DocumentFooter()

	// Write has the same signature as io.Writer's Write method.
	Write(b []byte) (int, error)

	// Render turns a parsed document tree into output bytes. MarkdownOptions
	// calls it with the tree returned by Parse.
	Render(ast *Node) []byte
}
219
// inlineParser is the signature of the callback functions used for inline
// parsing. One such function is registered in parser.inlineCallback for each
// byte value that triggers a response when parsing inline data.
// NOTE(review): the int result is presumably the number of bytes consumed;
// confirm against the inline parser implementation.
type inlineParser func(p *parser, data []byte, offset int) int
223
// parser holds runtime state used by the parser.
// It is constructed and initialised by the Parse function.
type parser struct {
	refOverride    ReferenceOverrideFunc // optional callback consulted first by getRef
	refs           map[string]*reference // reference definitions keyed by lower-cased id
	inlineCallback [256]inlineParser     // inline handlers indexed by trigger byte
	flags          Extensions            // enabled extensions
	nesting        int                   // secondPass panics if this is non-zero when it finishes
	maxNesting     int                   // set to 16 by Parse
	insideLink     bool

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference

	doc                  *Node // root Document node
	tip                  *Node // = doc
	oldTip               *Node
	lastMatchedContainer *Node // = doc
	allClosed            bool
	currBlock            *Node // a block node currently being parsed by inline parser
}
247
248func (p *parser) getRef(refid string) (ref *reference, found bool) {
249 if p.refOverride != nil {
250 r, overridden := p.refOverride(refid)
251 if overridden {
252 if r == nil {
253 return nil, false
254 }
255 return &reference{
256 link: []byte(r.Link),
257 title: []byte(r.Title),
258 noteId: 0,
259 hasBlock: false,
260 text: []byte(r.Text)}, true
261 }
262 }
263 // refs are case insensitive
264 ref, found = p.refs[strings.ToLower(refid)]
265 return ref, found
266}
267
268func (p *parser) finalize(block *Node) {
269 above := block.Parent
270 block.open = false
271 p.tip = above
272}
273
274func (p *parser) addChild(node NodeType, offset uint32) *Node {
275 for !p.tip.canContain(node) {
276 p.finalize(p.tip)
277 }
278 newNode := NewNode(node)
279 newNode.content = []byte{}
280 p.tip.appendChild(newNode)
281 p.tip = newNode
282 return newNode
283}
284
285func (p *parser) closeUnmatchedBlocks() {
286 if !p.allClosed {
287 for p.oldTip != p.lastMatchedContainer {
288 parent := p.oldTip.Parent
289 p.finalize(p.oldTip)
290 p.oldTip = parent
291 }
292 p.allClosed = true
293 }
294}
295
296//
297//
298// Public interface
299//
300//
301
// Reference represents the details of a link.
// See the documentation in Options for more details on use-case.
//
// A Reference returned by a ReferenceOverrideFunc is copied field by field
// into the parser's internal reference representation.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
313
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not
// found and the default logic is skipped.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
319
// Options represents configurable overrides and callbacks (in addition to the
// extension flag set) for configuring a Markdown parse.
type Options struct {
	// Extensions is a flag set of bit-wise ORed extension bits. See the
	// Extensions flags defined in this package.
	Extensions Extensions

	// ReferenceOverride is an optional function callback that is called every
	// time a reference is resolved.
	//
	// In Markdown, the link reference syntax can be made to resolve a link to
	// a reference instead of an inline URL, in one of the following ways:
	//
	//  * [link text][refid]
	//  * [refid][]
	//
	// Usually, the refid is defined at the bottom of the Markdown document. If
	// this override function is provided, the refid is passed to the override
	// function first, before consulting the defined refids at the bottom. If
	// the override function indicates an override did not occur, the refids at
	// the bottom will be used to fill in the link details.
	//
	// A nil ReferenceOverride disables the callback.
	ReferenceOverride ReferenceOverrideFunc
}
343
344// MarkdownBasic is a convenience function for simple rendering.
345// It processes markdown input with no extensions enabled.
346func MarkdownBasic(input []byte) []byte {
347 // set up the HTML renderer
348 htmlFlags := UseXHTML
349 renderer := HTMLRenderer(htmlFlags, CommonExtensions, "", "")
350
351 // set up the parser
352 return MarkdownOptions(input, renderer, Options{Extensions: 0})
353}
354
355// Call Markdown with most useful extensions enabled
356// MarkdownCommon is a convenience function for simple rendering.
357// It processes markdown input with common extensions enabled, including:
358//
359// * Smartypants processing with smart fractions and LaTeX dashes
360//
361// * Intra-word emphasis suppression
362//
363// * Tables
364//
365// * Fenced code blocks
366//
367// * Autolinking
368//
369// * Strikethrough support
370//
371// * Strict header parsing
372//
373// * Custom Header IDs
374func MarkdownCommon(input []byte) []byte {
375 // set up the HTML renderer
376 renderer := HTMLRenderer(CommonHtmlFlags, CommonExtensions, "", "")
377 return MarkdownOptions(input, renderer, DefaultOptions)
378}
379
380// Markdown is the main rendering function.
381// It parses and renders a block of markdown-encoded text.
382// The supplied Renderer is used to format the output, and extensions dictates
383// which non-standard extensions are enabled.
384//
385// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
386// LatexRenderer, respectively.
387func Markdown(input []byte, renderer Renderer, extensions Extensions) []byte {
388 return MarkdownOptions(input, renderer, Options{
389 Extensions: extensions})
390}
391
392// MarkdownOptions is just like Markdown but takes additional options through
393// the Options struct.
394func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
395 // no point in parsing if we can't render
396 if renderer == nil {
397 return nil
398 }
399 return renderer.Render(Parse(input, opts))
400}
401
402func Parse(input []byte, opts Options) *Node {
403 extensions := opts.Extensions
404
405 // fill in the render structure
406 p := new(parser)
407 p.flags = extensions
408 p.refOverride = opts.ReferenceOverride
409 p.refs = make(map[string]*reference)
410 p.maxNesting = 16
411 p.insideLink = false
412
413 docNode := NewNode(Document)
414 p.doc = docNode
415 p.tip = docNode
416 p.oldTip = docNode
417 p.lastMatchedContainer = docNode
418 p.allClosed = true
419
420 // register inline parsers
421 p.inlineCallback['*'] = emphasis
422 p.inlineCallback['_'] = emphasis
423 if extensions&Strikethrough != 0 {
424 p.inlineCallback['~'] = emphasis
425 }
426 p.inlineCallback['`'] = codeSpan
427 p.inlineCallback['\n'] = lineBreak
428 p.inlineCallback['['] = link
429 p.inlineCallback['<'] = leftAngle
430 p.inlineCallback['\\'] = escape
431 p.inlineCallback['&'] = entity
432 p.inlineCallback['!'] = maybeImage
433 p.inlineCallback['^'] = maybeInlineFootnote
434
435 if extensions&Autolink != 0 {
436 p.inlineCallback['h'] = maybeAutoLink
437 p.inlineCallback['m'] = maybeAutoLink
438 p.inlineCallback['f'] = maybeAutoLink
439 p.inlineCallback['H'] = maybeAutoLink
440 p.inlineCallback['M'] = maybeAutoLink
441 p.inlineCallback['F'] = maybeAutoLink
442 }
443
444 if extensions&Footnotes != 0 {
445 p.notes = make([]*reference, 0)
446 }
447
448 first := firstPass(p, input)
449 secondPass(p, first)
450 // Walk the tree and finish up some of unfinished blocks
451 for p.tip != nil {
452 p.finalize(p.tip)
453 }
454 // Walk the tree again and process inline markdown in each block
455 p.doc.Walk(func(node *Node, entering bool) {
456 if node.Type == Paragraph || node.Type == Header || node.Type == TableCell {
457 p.currBlock = node
458 p.inline(node.content)
459 node.content = nil
460 }
461 })
462 p.parseRefsToAST()
463 p.generateTOC()
464 return p.doc
465}
466
467func (p *parser) generateTOC() {
468 if p.flags&TOC == 0 && p.flags&OmitContents == 0 {
469 return
470 }
471 navNode := NewNode(HTMLBlock)
472 navNode.Literal = []byte("<nav>")
473 navNode.open = false
474
475 var topList *Node
476 var listNode *Node
477 var lastItem *Node
478 headerCount := 0
479 var currentLevel uint32
480 p.doc.Walk(func(node *Node, entering bool) {
481 if entering && node.Type == Header {
482 if node.Level > currentLevel {
483 currentLevel++
484 newList := NewNode(List)
485 if lastItem != nil {
486 lastItem.appendChild(newList)
487 listNode = newList
488 } else {
489 listNode = newList
490 topList = listNode
491 }
492 }
493 if node.Level < currentLevel {
494 finalizeList(listNode)
495 lastItem = listNode.Parent
496 listNode = lastItem.Parent
497 }
498 node.HeaderID = fmt.Sprintf("toc_%d", headerCount)
499 headerCount++
500 lastItem = NewNode(Item)
501 listNode.appendChild(lastItem)
502 anchorNode := NewNode(Link)
503 anchorNode.Destination = []byte("#" + node.HeaderID)
504 lastItem.appendChild(anchorNode)
505 anchorNode.appendChild(text(node.FirstChild.Literal))
506 }
507 })
508 firstChild := p.doc.FirstChild
509 // Insert TOC only if there is anything to insert
510 if topList != nil {
511 finalizeList(topList)
512 firstChild.insertBefore(navNode)
513 firstChild.insertBefore(topList)
514 navCloseNode := NewNode(HTMLBlock)
515 navCloseNode.Literal = []byte("</nav>")
516 navCloseNode.open = false
517 firstChild.insertBefore(navCloseNode)
518 }
519 // Drop everything after the TOC if OmitContents was requested
520 if p.flags&OmitContents != 0 {
521 for firstChild != nil {
522 next := firstChild.Next
523 firstChild.unlink()
524 firstChild = next
525 }
526 }
527}
528
529func (p *parser) parseRefsToAST() {
530 if p.flags&Footnotes == 0 || len(p.notes) == 0 {
531 return
532 }
533 p.tip = p.doc
534 finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte(`<div class="footnotes">`)))
535 p.addBlock(HorizontalRule, nil)
536 block := p.addBlock(List, nil)
537 block.ListFlags = ListTypeOrdered
538 flags := ListItemBeginningOfList
539 // Note: this loop is intentionally explicit, not range-form. This is
540 // because the body of the loop will append nested footnotes to p.notes and
541 // we need to process those late additions. Range form would only walk over
542 // the fixed initial set.
543 for i := 0; i < len(p.notes); i++ {
544 ref := p.notes[i]
545 block := p.addBlock(Item, nil)
546 block.ListFlags = ListTypeOrdered
547 block.RefLink = ref.link
548 if ref.hasBlock {
549 flags |= ListItemContainsBlock
550 p.block(ref.title)
551 } else {
552 p.currBlock = block
553 p.inline(ref.title)
554 }
555 flags &^= ListItemBeginningOfList | ListItemContainsBlock
556 }
557 above := block.Parent
558 finalizeList(block)
559 p.tip = above
560 finalizeHtmlBlock(p.addBlock(HTMLBlock, []byte("</div>")))
561 block.Walk(func(node *Node, entering bool) {
562 if node.Type == Paragraph || node.Type == Header {
563 p.currBlock = node
564 p.inline(node.content)
565 node.content = nil
566 }
567 })
568}
569
570// first pass:
571// - extract references
572// - expand tabs
573// - normalize newlines
574// - copy everything else
575func firstPass(p *parser, input []byte) []byte {
576 var out bytes.Buffer
577 tabSize := TabSizeDefault
578 if p.flags&TabSizeEight != 0 {
579 tabSize = TabSizeDouble
580 }
581 beg, end := 0, 0
582 lastFencedCodeBlockEnd := 0
583 for beg < len(input) { // iterate over lines
584 if end = isReference(p, input[beg:], tabSize); end > 0 {
585 beg += end
586 } else { // skip to the next line
587 end = beg
588 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
589 end++
590 }
591
592 if p.flags&FencedCode != 0 {
593 // track fenced code block boundaries to suppress tab expansion
594 // inside them:
595 if beg >= lastFencedCodeBlockEnd {
596 if i := p.fencedCode(input[beg:], false); i > 0 {
597 lastFencedCodeBlockEnd = beg + i
598 }
599 }
600 }
601
602 // add the line body if present
603 if end > beg {
604 if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
605 out.Write(input[beg:end])
606 } else {
607 expandTabs(&out, input[beg:end], tabSize)
608 }
609 }
610 out.WriteByte('\n')
611
612 if end < len(input) && input[end] == '\r' {
613 end++
614 }
615 if end < len(input) && input[end] == '\n' {
616 end++
617 }
618
619 beg = end
620 }
621 }
622
623 // empty input?
624 if out.Len() == 0 {
625 out.WriteByte('\n')
626 }
627
628 return out.Bytes()
629}
630
631// second pass: actual rendering
632func secondPass(p *parser, input []byte) {
633 p.block(input)
634
635 if p.flags&Footnotes != 0 && len(p.notes) > 0 {
636 flags := ListItemBeginningOfList
637 for i := 0; i < len(p.notes); i += 1 {
638 ref := p.notes[i]
639 if ref.hasBlock {
640 flags |= ListItemContainsBlock
641 p.block(ref.title)
642 } else {
643 p.inline(ref.title)
644 }
645 flags &^= ListItemBeginningOfList | ListItemContainsBlock
646 }
647 }
648
649 if p.nesting != 0 {
650 panic("Nesting level did not end at zero")
651 }
652}
653
654//
655// Link references
656//
657// This section implements support for references that (usually) appear
658// as footnotes in a document, and can be referenced anywhere in the document.
659// The basic format is:
660//
661// [1]: http://www.google.com/ "Google"
662// [2]: http://www.github.com/ "Github"
663//
664// Anywhere in the document, the reference can be linked by referring to its
665// label, i.e., 1 and 2 in this example, as in:
666//
667// This library is hosted on [Github][2], a git hosting site.
668//
669// Actual footnotes as specified in Pandoc and supported by some other Markdown
670// libraries such as php-markdown are also taken care of. They look like this:
671//
672// This sentence needs a bit of further explanation.[^note]
673//
674// [^note]: This is the explanation.
675//
676// Footnotes should be placed at the end of the document in an ordered list.
677// Inline footnotes such as:
678//
679// Inline footnotes^[Not supported.] also exist.
680//
681// are not yet supported.
682
// reference holds one parsed reference definition: either a link reference
// or, when noteId is non-zero, a footnote.
type reference struct {
	link     []byte // link destination; for footnotes, the footnote id instead
	title    []byte // link title; for footnotes, the footnote's text
	noteId   int    // 0 if not a footnote ref
	hasBlock bool   // set for footnotes whose text continues on indented lines
	text     []byte // replacement text supplied by a ReferenceOverrideFunc
}
691
692func (r *reference) String() string {
693 return fmt.Sprintf("{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}",
694 r.link, r.title, r.text, r.noteId, r.hasBlock)
695}
696
// isReference checks whether or not data starts with a reference definition
// ("[id]: link" or, with the Footnotes extension, "[^id]: text").
// If so, it is parsed and stored in the parser's refs map, keyed by the
// lower-cased id.
// tabSize is passed on to scanFootnote as the indent width of footnote
// continuation lines.
// Returns the number of bytes to skip to move past it,
// or zero if the first line is not a reference.
func isReference(p *parser, data []byte, tabSize int) int {
	// up to 3 optional leading spaces
	// (the length check also keeps data[i] in range below, since i <= 3)
	if len(data) < 4 {
		return 0
	}
	i := 0
	for i < 3 && data[i] == ' ' {
		i++
	}

	noteId := 0

	// id part: anything but a newline between brackets
	if data[i] != '[' {
		return 0
	}
	i++
	if p.flags&Footnotes != 0 {
		if i < len(data) && data[i] == '^' {
			// we can set it to anything here because the proper noteIds will
			// be assigned later during the second pass. It just has to be != 0
			noteId = 1
			i++
		}
	}
	// the id starts after the optional '^' marker
	idOffset := i
	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
		i++
	}
	if i >= len(data) || data[i] != ']' {
		return 0
	}
	idEnd := i

	// spacer: colon (space | tab)* newline? (space | tab)*
	i++
	if i >= len(data) || data[i] != ':' {
		return 0
	}
	i++
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
		i++
		// treat "\r\n" as a single newline
		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
			i++
		}
	}
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	// nothing follows the spacer: not a reference
	if i >= len(data) {
		return 0
	}

	var (
		linkOffset, linkEnd   int
		titleOffset, titleEnd int
		lineEnd               int
		raw                   []byte
		hasBlock              bool
	)

	// footnotes and link references have different bodies
	if p.flags&Footnotes != 0 && noteId != 0 {
		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
		lineEnd = linkEnd
	} else {
		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
	}
	// a zero lineEnd means the body scanner rejected the input
	if lineEnd == 0 {
		return 0
	}

	// a valid ref has been found

	ref := &reference{
		noteId:   noteId,
		hasBlock: hasBlock,
	}

	if noteId > 0 {
		// reusing the link field for the id since footnotes don't have links
		ref.link = data[idOffset:idEnd]
		// if footnote, it's not really a title, it's the contained text
		ref.title = raw
	} else {
		ref.link = data[linkOffset:linkEnd]
		ref.title = data[titleOffset:titleEnd]
	}

	// id matches are case-insensitive
	id := string(bytes.ToLower(data[idOffset:idEnd]))

	// a later definition with the same id replaces an earlier one
	p.refs[id] = ref

	return lineEnd
}
800
801func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
802 // link: whitespace-free sequence, optionally between angle brackets
803 if data[i] == '<' {
804 i++
805 }
806 linkOffset = i
807 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
808 i++
809 }
810 if i == len(data) {
811 return
812 }
813 linkEnd = i
814 if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
815 linkOffset++
816 linkEnd--
817 }
818
819 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
820 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
821 i++
822 }
823 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
824 return
825 }
826
827 // compute end-of-line
828 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
829 lineEnd = i
830 }
831 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
832 lineEnd++
833 }
834
835 // optional (space|tab)* spacer after a newline
836 if lineEnd > 0 {
837 i = lineEnd + 1
838 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
839 i++
840 }
841 }
842
843 // optional title: any non-newline sequence enclosed in '"() alone on its line
844 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
845 i++
846 titleOffset = i
847
848 // look for EOL
849 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
850 i++
851 }
852 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
853 titleEnd = i + 1
854 } else {
855 titleEnd = i
856 }
857
858 // step back
859 i--
860 for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
861 i--
862 }
863 if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
864 lineEnd = titleEnd
865 titleEnd = i
866 }
867 }
868
869 return
870}
871
// The first bit of this logic is the same as (*parser).listItem, but the rest
// is much simpler. This function simply finds the entire block and shifts it
// over by one tab if it is indeed a block (just returns the line if it's not).
// blockEnd is the end of the section in the input buffer, and contents is the
// extracted text that was shifted over one tab. It will need to be rendered at
// the end of the document.
//
// i is the offset in data at which the footnote text starts; indentSize is
// the number of leading spaces that counts as one level of indentation (a
// single leading tab always counts). blockStart is the offset of the first
// non-space byte of the text, and hasBlock reports whether at least one
// indented continuation line was gathered. All results are zero values when
// i is 0 or data is empty.
func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
	if i == 0 || len(data) == 0 {
		return
	}

	// skip leading whitespace on first line
	for i < len(data) && data[i] == ' ' {
		i++
	}

	blockStart = i

	// find the end of the line
	// (i stops just past the newline, or at the end of the data)
	blockEnd = i
	for i < len(data) && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[blockEnd:i])
	blockEnd = i

	// process the following lines
	containsBlankLine := false

gatherLines:
	for blockEnd < len(data) {
		i++

		// find the end of this line
		for i < len(data) && data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[blockEnd:i]) > 0 {
			containsBlankLine = true
			blockEnd = i
			continue
		}

		// n is the width of the indent to strip from this line
		n := 0
		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
			// this is the end of the block.
			// we don't want to include this last line in the index.
			break gatherLines
		}

		// if there were blank lines before this one, insert a new one now
		// (a run of blank lines collapses into a single one)
		if containsBlankLine {
			raw.WriteByte('\n')
			containsBlankLine = false
		}

		// get rid of that first tab, write to buffer
		raw.Write(data[blockEnd+n : i])
		hasBlock = true

		blockEnd = i
	}

	// make sure the extracted text ends with a newline
	if data[blockEnd-1] != '\n' {
		raw.WriteByte('\n')
	}

	contents = raw.Bytes()

	return
}
951
952//
953//
954// Miscellaneous helper functions
955//
956//
957
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// Those characters form four contiguous ASCII ranges, tested directly here.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/',
		c >= ':' && c <= '@',
		c >= '[' && c <= '`',
		c >= '{' && c <= '~':
		return true
	}
	return false
}
968
// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
973
// isletter reports whether c is an ASCII letter (a-z or A-Z).
func isletter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
978
979// Test if a character is a letter or a digit.
980// TODO: check when this is looking for ASCII alnum and when it should use unicode
981func isalnum(c byte) bool {
982 return (c >= '0' && c <= '9') || isletter(c)
983}
984
// expandTabs writes line to out with every tab replaced by spaces up to the
// next multiple of tabSize columns. Columns are counted in runes, not bytes.
// Nothing else is appended; in particular no newline is written.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// Common cases first: no tabs at all, or tabs only at the very start of
	// the line. Each leading tab is then exactly tabSize spaces and there is
	// no need to decode runes.
	lead := 0
	for lead < len(line) && line[lead] == '\t' {
		lead++
	}
	if bytes.IndexByte(line[lead:], '\t') < 0 {
		for n := lead * tabSize; n > 0; n-- {
			out.WriteByte(' ')
		}
		out.Write(line[lead:])
		return
	}

	// The slow case: a tab follows other text, so the current column has to
	// be tracked rune by rune to know how far each tab reaches.
	col := 0
	for pos := 0; pos < len(line); {
		if line[pos] == '\t' {
			// a tab always produces at least one space
			out.WriteByte(' ')
			col++
			for col%tabSize != 0 {
				out.WriteByte(' ')
				col++
			}
			pos++
			continue
		}
		_, width := utf8.DecodeRune(line[pos:])
		out.Write(line[pos : pos+width])
		pos += width
		col++
	}
}
1042
// isIndented reports how many leading bytes of data make up one level of
// indentation: 1 when the line starts with a tab, indentSize when it starts
// with at least indentSize spaces, and 0 when the line is not indented.
func isIndented(data []byte, indentSize int) int {
	switch {
	case len(data) == 0:
		return 0
	case data[0] == '\t':
		return 1
	case len(data) < indentSize:
		return 0
	}
	// every one of the first indentSize bytes has to be a space
	for k := indentSize - 1; k >= 0; k-- {
		if data[k] != ' ' {
			return 0
		}
	}
	return indentSize
}
1062
1063// Create a url-safe slug for fragments
1064func slugify(in []byte) []byte {
1065 if len(in) == 0 {
1066 return in
1067 }
1068 out := make([]byte, 0, len(in))
1069 sym := false
1070
1071 for _, ch := range in {
1072 if isalnum(ch) {
1073 sym = false
1074 out = append(out, ch)
1075 } else if sym {
1076 continue
1077 } else {
1078 out = append(out, '-')
1079 sym = true
1080 }
1081 }
1082 var a, b int
1083 var ch byte
1084 for a, ch = range out {
1085 if ch != '-' {
1086 break
1087 }
1088 }
1089 for b = len(out) - 1; b > 0; b-- {
1090 if out[b] != '-' {
1091 break
1092 }
1093 }
1094 return out[a : b+1]
1095}