all repos — grayfriday @ 427717f9916ccbf0bea4cecc949b42355bf318f9

blackfriday fork with a few changes

markdown.go (view raw)

  1// Blackfriday Markdown Processor
  2// Available at http://github.com/russross/blackfriday
  3//
  4// Copyright © 2011 Russ Ross <russ@russross.com>.
  5// Distributed under the Simplified BSD License.
  6// See README.md for details.
  7
  8package blackfriday
  9
 10import (
 11	"bytes"
 12	"fmt"
 13	"io"
 14	"strings"
 15	"unicode/utf8"
 16)
 17
 18//
 19// Markdown parsing and processing
 20//
 21
 22// Version string of the package.
 23const Version = "2.0"
 24
 25// Extensions is a bitwise or'ed collection of enabled Blackfriday's
 26// extensions.
 27type Extensions int
 28
 29// These are the supported markdown parsing extensions.
 30// OR these values together to select multiple extensions.
 31const (
 32	NoExtensions           Extensions = 0
 33	NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
 34	Tables                                        // Render tables
 35	FencedCode                                    // Render fenced code blocks
 36	Autolink                                      // Detect embedded URLs that are not explicitly marked
 37	Strikethrough                                 // Strikethrough text using ~~test~~
 38	LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
 39	SpaceHeadings                                 // Be strict about prefix heading rules
 40	HardLineBreak                                 // Translate newlines into line breaks
 41	TabSizeEight                                  // Expand tabs to eight spaces instead of four
 42	Footnotes                                     // Pandoc-style footnotes
 43	NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
 44	HeadingIDs                                    // specify heading IDs  with {#id}
 45	Titleblock                                    // Titleblock ala pandoc
 46	AutoHeadingIDs                                // Create the heading ID from the text
 47	BackslashLineBreak                            // Translate trailing backslashes into line breaks
 48	DefinitionLists                               // Render definition lists
 49
 50	CommonHTMLFlags HTMLFlags = UseXHTML | Smartypants |
 51		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes
 52
 53	CommonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
 54		Autolink | Strikethrough | SpaceHeadings | HeadingIDs |
 55		BackslashLineBreak | DefinitionLists
 56)
 57
 58// ListType contains bitwise or'ed flags for list and list item objects.
 59type ListType int
 60
 61// These are the possible flag values for the ListItem renderer.
 62// Multiple flag values may be ORed together.
 63// These are mostly of interest if you are writing a new output format.
 64const (
 65	ListTypeOrdered ListType = 1 << iota
 66	ListTypeDefinition
 67	ListTypeTerm
 68
 69	ListItemContainsBlock
 70	ListItemBeginningOfList // TODO: figure out if this is of any use now
 71	ListItemEndOfList
 72)
 73
 74// CellAlignFlags holds a type of alignment in a table cell.
 75type CellAlignFlags int
 76
 77// These are the possible flag values for the table cell renderer.
 78// Only a single one of these values will be used; they are not ORed together.
 79// These are mostly of interest if you are writing a new output format.
 80const (
 81	TableAlignmentLeft CellAlignFlags = 1 << iota
 82	TableAlignmentRight
 83	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
 84)
 85
 86// The size of a tab stop.
 87const (
 88	TabSizeDefault = 4
 89	TabSizeDouble  = 8
 90)
 91
 92// blockTags is a set of tags that are recognized as HTML block tags.
 93// Any of these can be included in markdown text without special escaping.
 94var blockTags = map[string]struct{}{
 95	"blockquote": struct{}{},
 96	"del":        struct{}{},
 97	"div":        struct{}{},
 98	"dl":         struct{}{},
 99	"fieldset":   struct{}{},
100	"form":       struct{}{},
101	"h1":         struct{}{},
102	"h2":         struct{}{},
103	"h3":         struct{}{},
104	"h4":         struct{}{},
105	"h5":         struct{}{},
106	"h6":         struct{}{},
107	"iframe":     struct{}{},
108	"ins":        struct{}{},
109	"math":       struct{}{},
110	"noscript":   struct{}{},
111	"ol":         struct{}{},
112	"pre":        struct{}{},
113	"p":          struct{}{},
114	"script":     struct{}{},
115	"style":      struct{}{},
116	"table":      struct{}{},
117	"ul":         struct{}{},
118
119	// HTML5
120	"address":    struct{}{},
121	"article":    struct{}{},
122	"aside":      struct{}{},
123	"canvas":     struct{}{},
124	"figcaption": struct{}{},
125	"figure":     struct{}{},
126	"footer":     struct{}{},
127	"header":     struct{}{},
128	"hgroup":     struct{}{},
129	"main":       struct{}{},
130	"nav":        struct{}{},
131	"output":     struct{}{},
132	"progress":   struct{}{},
133	"section":    struct{}{},
134	"video":      struct{}{},
135}
136
137// Renderer is the rendering interface. This is mostly of interest if you are
138// implementing a new rendering format.
139//
140// Only an HTML implementation is provided in this repository, see the README
141// for external implementations.
142type Renderer interface {
143	// RenderNode is the main rendering method. It will be called once for
144	// every leaf node and twice for every non-leaf node (first with
145	// entering=true, then with entering=false). The method should write its
146	// rendition of the node to the supplied writer w.
147	RenderNode(w io.Writer, node *Node, entering bool) WalkStatus
148
149	// RenderHeader is a method that allows the renderer to produce some
150	// content preceding the main body of the output document. The header is
151	// understood in the broad sense here. For example, the default HTML
152	// renderer will write not only the HTML document preamble, but also the
153	// table of contents if it was requested.
154	//
155	// The method will be passed an entire document tree, in case a particular
156	// implementation needs to inspect it to produce output.
157	//
158	// The output should be written to the supplied writer w. If your
159	// implementation has no header to write, supply an empty implementation.
160	RenderHeader(w io.Writer, ast *Node)
161
162	// RenderFooter is a symmetric counterpart of RenderHeader.
163	RenderFooter(w io.Writer, ast *Node)
164}
165
166// Callback functions for inline parsing. One such function is defined
167// for each character that triggers a response when parsing inline data.
168type inlineParser func(p *Markdown, data []byte, offset int) (int, *Node)
169
170// Markdown is a type that holds:
171// - extensions and the runtime state used by Parse,
172// - the renderer.
173type Markdown struct {
174	renderer          Renderer
175	referenceOverride ReferenceOverrideFunc
176	refs              map[string]*reference
177	inlineCallback    [256]inlineParser
178	extensions        Extensions
179	nesting           int
180	maxNesting        int
181	insideLink        bool
182
183	// Footnotes need to be ordered as well as available to quickly check for
184	// presence. If a ref is also a footnote, it's stored both in refs and here
185	// in notes. Slice is nil if footnotes not enabled.
186	notes []*reference
187
188	doc                  *Node
189	tip                  *Node // = doc
190	oldTip               *Node
191	lastMatchedContainer *Node // = doc
192	allClosed            bool
193}
194
195func (p *Markdown) getRef(refid string) (ref *reference, found bool) {
196	if p.referenceOverride != nil {
197		r, overridden := p.referenceOverride(refid)
198		if overridden {
199			if r == nil {
200				return nil, false
201			}
202			return &reference{
203				link:     []byte(r.Link),
204				title:    []byte(r.Title),
205				noteID:   0,
206				hasBlock: false,
207				text:     []byte(r.Text)}, true
208		}
209	}
210	// refs are case insensitive
211	ref, found = p.refs[strings.ToLower(refid)]
212	return ref, found
213}
214
215func (p *Markdown) finalize(block *Node) {
216	above := block.Parent
217	block.open = false
218	p.tip = above
219}
220
221func (p *Markdown) addChild(node NodeType, offset uint32) *Node {
222	return p.addExistingChild(NewNode(node), offset)
223}
224
225func (p *Markdown) addExistingChild(node *Node, offset uint32) *Node {
226	for !p.tip.canContain(node.Type) {
227		p.finalize(p.tip)
228	}
229	p.tip.AppendChild(node)
230	p.tip = node
231	return node
232}
233
234func (p *Markdown) closeUnmatchedBlocks() {
235	if !p.allClosed {
236		for p.oldTip != p.lastMatchedContainer {
237			parent := p.oldTip.Parent
238			p.finalize(p.oldTip)
239			p.oldTip = parent
240		}
241		p.allClosed = true
242	}
243}
244
245//
246//
247// Public interface
248//
249//
250
251// Reference represents the details of a link.
252// See the documentation in Options for more details on use-case.
253type Reference struct {
254	// Link is usually the URL the reference points to.
255	Link string
256	// Title is the alternate text describing the link in more detail.
257	Title string
258	// Text is the optional text to override the ref with if the syntax used was
259	// [refid][]
260	Text string
261}
262
263// ReferenceOverrideFunc is expected to be called with a reference string and
264// return either a valid Reference type that the reference string maps to or
265// nil. If overridden is false, the default reference logic will be executed.
266// See the documentation in Options for more details on use-case.
267type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
268
269// New constructs a Markdown processor. You can use the same With* functions as
270// for Run() to customize parser's behavior and the renderer.
271func New(opts ...Option) *Markdown {
272	var p Markdown
273	for _, opt := range opts {
274		opt(&p)
275	}
276	p.refs = make(map[string]*reference)
277	p.maxNesting = 16
278	p.insideLink = false
279	docNode := NewNode(Document)
280	p.doc = docNode
281	p.tip = docNode
282	p.oldTip = docNode
283	p.lastMatchedContainer = docNode
284	p.allClosed = true
285	// register inline parsers
286	p.inlineCallback[' '] = maybeLineBreak
287	p.inlineCallback['*'] = emphasis
288	p.inlineCallback['_'] = emphasis
289	if p.extensions&Strikethrough != 0 {
290		p.inlineCallback['~'] = emphasis
291	}
292	p.inlineCallback['`'] = codeSpan
293	p.inlineCallback['\n'] = lineBreak
294	p.inlineCallback['['] = link
295	p.inlineCallback['<'] = leftAngle
296	p.inlineCallback['\\'] = escape
297	p.inlineCallback['&'] = entity
298	p.inlineCallback['!'] = maybeImage
299	p.inlineCallback['^'] = maybeInlineFootnote
300	if p.extensions&Autolink != 0 {
301		p.inlineCallback['h'] = maybeAutoLink
302		p.inlineCallback['m'] = maybeAutoLink
303		p.inlineCallback['f'] = maybeAutoLink
304		p.inlineCallback['H'] = maybeAutoLink
305		p.inlineCallback['M'] = maybeAutoLink
306		p.inlineCallback['F'] = maybeAutoLink
307	}
308	if p.extensions&Footnotes != 0 {
309		p.notes = make([]*reference, 0)
310	}
311	return &p
312}
313
314// Option customizes the Markdown processor's default behavior.
315type Option func(*Markdown)
316
317// WithRenderer allows you to override the default renderer.
318func WithRenderer(r Renderer) Option {
319	return func(p *Markdown) {
320		p.renderer = r
321	}
322}
323
324// WithExtensions allows you to pick some of the many extensions provided by
325// Blackfriday. You can bitwise OR them.
326func WithExtensions(e Extensions) Option {
327	return func(p *Markdown) {
328		p.extensions = e
329	}
330}
331
332// WithNoExtensions turns off all extensions and custom behavior.
333func WithNoExtensions() Option {
334	return func(p *Markdown) {
335		p.extensions = NoExtensions
336		p.renderer = NewHTMLRenderer(HTMLRendererParameters{
337			Flags: HTMLFlagsNone,
338		})
339	}
340}
341
342// WithRefOverride sets an optional function callback that is called every
343// time a reference is resolved.
344//
345// In Markdown, the link reference syntax can be made to resolve a link to
346// a reference instead of an inline URL, in one of the following ways:
347//
348//  * [link text][refid]
349//  * [refid][]
350//
351// Usually, the refid is defined at the bottom of the Markdown document. If
352// this override function is provided, the refid is passed to the override
353// function first, before consulting the defined refids at the bottom. If
354// the override function indicates an override did not occur, the refids at
355// the bottom will be used to fill in the link details.
356func WithRefOverride(o ReferenceOverrideFunc) Option {
357	return func(p *Markdown) {
358		p.referenceOverride = o
359	}
360}
361
362// Run is the main entry point to Blackfriday. It parses and renders a
363// block of markdown-encoded text.
364//
365// The simplest invocation of Run takes one argument, input:
366//     output := Run(input)
367// This will parse the input with CommonExtensions enabled and render it with
368// the default HTMLRenderer (with CommonHTMLFlags).
369//
370// Variadic arguments opts can customize the default behavior. Since Markdown
371// type does not contain exported fields, you can not use it directly. Instead,
372// use the With* functions. For example, this will call the most basic
373// functionality, with no extensions:
374//     output := Run(input, WithNoExtensions())
375//
376// You can use any number of With* arguments, even contradicting ones. They
377// will be applied in order of appearance and the latter will override the
378// former:
379//     output := Run(input, WithNoExtensions(), WithExtensions(exts),
380//         WithRenderer(yourRenderer))
381func Run(input []byte, opts ...Option) []byte {
382	r := NewHTMLRenderer(HTMLRendererParameters{
383		Flags: CommonHTMLFlags,
384	})
385	optList := []Option{WithRenderer(r), WithExtensions(CommonExtensions)}
386	optList = append(optList, opts...)
387	parser := New(optList...)
388	ast := parser.Parse(input)
389	var buf bytes.Buffer
390	parser.renderer.RenderHeader(&buf, ast)
391	ast.Walk(func(node *Node, entering bool) WalkStatus {
392		return parser.renderer.RenderNode(&buf, node, entering)
393	})
394	parser.renderer.RenderFooter(&buf, ast)
395	return buf.Bytes()
396}
397
398// Parse is an entry point to the parsing part of Blackfriday. It takes an
399// input markdown document and produces a syntax tree for its contents. This
400// tree can then be rendered with a default or custom renderer, or
401// analyzed/transformed by the caller to whatever non-standard needs they have.
402func (p *Markdown) Parse(input []byte) *Node {
403	p.block(input)
404	// Walk the tree and finish up some of unfinished blocks
405	for p.tip != nil {
406		p.finalize(p.tip)
407	}
408	// Walk the tree again and process inline markdown in each block
409	p.doc.Walk(func(node *Node, entering bool) WalkStatus {
410		if node.Type == Paragraph || node.Type == Heading || node.Type == TableCell {
411			p.inline(node, node.content)
412			node.content = nil
413		}
414		return GoToNext
415	})
416	p.parseRefsToAST()
417	return p.doc
418}
419
420func (p *Markdown) parseRefsToAST() {
421	if p.extensions&Footnotes == 0 || len(p.notes) == 0 {
422		return
423	}
424	p.tip = p.doc
425	block := p.addBlock(List, nil)
426	block.IsFootnotesList = true
427	block.ListFlags = ListTypeOrdered
428	flags := ListItemBeginningOfList
429	// Note: this loop is intentionally explicit, not range-form. This is
430	// because the body of the loop will append nested footnotes to p.notes and
431	// we need to process those late additions. Range form would only walk over
432	// the fixed initial set.
433	for i := 0; i < len(p.notes); i++ {
434		ref := p.notes[i]
435		p.addExistingChild(ref.footnote, 0)
436		block := ref.footnote
437		block.ListFlags = flags | ListTypeOrdered
438		block.RefLink = ref.link
439		if ref.hasBlock {
440			flags |= ListItemContainsBlock
441			p.block(ref.title)
442		} else {
443			p.inline(block, ref.title)
444		}
445		flags &^= ListItemBeginningOfList | ListItemContainsBlock
446	}
447	above := block.Parent
448	finalizeList(block)
449	p.tip = above
450	block.Walk(func(node *Node, entering bool) WalkStatus {
451		if node.Type == Paragraph || node.Type == Heading {
452			p.inline(node, node.content)
453			node.content = nil
454		}
455		return GoToNext
456	})
457}
458
459//
460// Link references
461//
462// This section implements support for references that (usually) appear
463// as footnotes in a document, and can be referenced anywhere in the document.
464// The basic format is:
465//
466//    [1]: http://www.google.com/ "Google"
467//    [2]: http://www.github.com/ "Github"
468//
469// Anywhere in the document, the reference can be linked by referring to its
470// label, i.e., 1 and 2 in this example, as in:
471//
472//    This library is hosted on [Github][2], a git hosting site.
473//
474// Actual footnotes as specified in Pandoc and supported by some other Markdown
475// libraries such as php-markdown are also taken care of. They look like this:
476//
477//    This sentence needs a bit of further explanation.[^note]
478//
479//    [^note]: This is the explanation.
480//
481// Footnotes should be placed at the end of the document in an ordered list.
482// Inline footnotes such as:
483//
484//    Inline footnotes^[Not supported.] also exist.
485//
486// are not yet supported.
487
488// reference holds all information necessary for a reference-style links or
489// footnotes.
490//
491// Consider this markdown with reference-style links:
492//
493//     [link][ref]
494//
495//     [ref]: /url/ "tooltip title"
496//
497// It will be ultimately converted to this HTML:
498//
499//     <p><a href=\"/url/\" title=\"title\">link</a></p>
500//
501// And a reference structure will be populated as follows:
502//
503//     p.refs["ref"] = &reference{
504//         link: "/url/",
505//         title: "tooltip title",
506//     }
507//
508// Alternatively, reference can contain information about a footnote. Consider
509// this markdown:
510//
511//     Text needing a footnote.[^a]
512//
513//     [^a]: This is the note
514//
515// A reference structure will be populated as follows:
516//
517//     p.refs["a"] = &reference{
518//         link: "a",
519//         title: "This is the note",
520//         noteID: <some positive int>,
521//     }
522//
523// TODO: As you can see, it begs for splitting into two dedicated structures
524// for refs and for footnotes.
525type reference struct {
526	link     []byte
527	title    []byte
528	noteID   int // 0 if not a footnote ref
529	hasBlock bool
530	footnote *Node // a link to the Item node within a list of footnotes
531
532	text []byte // only gets populated by refOverride feature with Reference.Text
533}
534
535func (r *reference) String() string {
536	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteID: %d, hasBlock: %v}",
537		r.link, r.title, r.text, r.noteID, r.hasBlock)
538}
539
540// Check whether or not data starts with a reference link.
541// If so, it is parsed and stored in the list of references
542// (in the render struct).
543// Returns the number of bytes to skip to move past it,
544// or zero if the first line is not a reference.
545func isReference(p *Markdown, data []byte, tabSize int) int {
546	// up to 3 optional leading spaces
547	if len(data) < 4 {
548		return 0
549	}
550	i := 0
551	for i < 3 && data[i] == ' ' {
552		i++
553	}
554
555	noteID := 0
556
557	// id part: anything but a newline between brackets
558	if data[i] != '[' {
559		return 0
560	}
561	i++
562	if p.extensions&Footnotes != 0 {
563		if i < len(data) && data[i] == '^' {
564			// we can set it to anything here because the proper noteIds will
565			// be assigned later during the second pass. It just has to be != 0
566			noteID = 1
567			i++
568		}
569	}
570	idOffset := i
571	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
572		i++
573	}
574	if i >= len(data) || data[i] != ']' {
575		return 0
576	}
577	idEnd := i
578	// footnotes can have empty ID, like this: [^], but a reference can not be
579	// empty like this: []. Break early if it's not a footnote and there's no ID
580	if noteID == 0 && idOffset == idEnd {
581		return 0
582	}
583	// spacer: colon (space | tab)* newline? (space | tab)*
584	i++
585	if i >= len(data) || data[i] != ':' {
586		return 0
587	}
588	i++
589	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
590		i++
591	}
592	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
593		i++
594		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
595			i++
596		}
597	}
598	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
599		i++
600	}
601	if i >= len(data) {
602		return 0
603	}
604
605	var (
606		linkOffset, linkEnd   int
607		titleOffset, titleEnd int
608		lineEnd               int
609		raw                   []byte
610		hasBlock              bool
611	)
612
613	if p.extensions&Footnotes != 0 && noteID != 0 {
614		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
615		lineEnd = linkEnd
616	} else {
617		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
618	}
619	if lineEnd == 0 {
620		return 0
621	}
622
623	// a valid ref has been found
624
625	ref := &reference{
626		noteID:   noteID,
627		hasBlock: hasBlock,
628	}
629
630	if noteID > 0 {
631		// reusing the link field for the id since footnotes don't have links
632		ref.link = data[idOffset:idEnd]
633		// if footnote, it's not really a title, it's the contained text
634		ref.title = raw
635	} else {
636		ref.link = data[linkOffset:linkEnd]
637		ref.title = data[titleOffset:titleEnd]
638	}
639
640	// id matches are case-insensitive
641	id := string(bytes.ToLower(data[idOffset:idEnd]))
642
643	p.refs[id] = ref
644
645	return lineEnd
646}
647
648func scanLinkRef(p *Markdown, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
649	// link: whitespace-free sequence, optionally between angle brackets
650	if data[i] == '<' {
651		i++
652	}
653	linkOffset = i
654	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
655		i++
656	}
657	linkEnd = i
658	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
659		linkOffset++
660		linkEnd--
661	}
662
663	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
664	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
665		i++
666	}
667	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
668		return
669	}
670
671	// compute end-of-line
672	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
673		lineEnd = i
674	}
675	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
676		lineEnd++
677	}
678
679	// optional (space|tab)* spacer after a newline
680	if lineEnd > 0 {
681		i = lineEnd + 1
682		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
683			i++
684		}
685	}
686
687	// optional title: any non-newline sequence enclosed in '"() alone on its line
688	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
689		i++
690		titleOffset = i
691
692		// look for EOL
693		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
694			i++
695		}
696		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
697			titleEnd = i + 1
698		} else {
699			titleEnd = i
700		}
701
702		// step back
703		i--
704		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
705			i--
706		}
707		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
708			lineEnd = titleEnd
709			titleEnd = i
710		}
711	}
712
713	return
714}
715
716// The first bit of this logic is the same as Parser.listItem, but the rest
717// is much simpler. This function simply finds the entire block and shifts it
718// over by one tab if it is indeed a block (just returns the line if it's not).
719// blockEnd is the end of the section in the input buffer, and contents is the
720// extracted text that was shifted over one tab. It will need to be rendered at
721// the end of the document.
722func scanFootnote(p *Markdown, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
723	if i == 0 || len(data) == 0 {
724		return
725	}
726
727	// skip leading whitespace on first line
728	for i < len(data) && data[i] == ' ' {
729		i++
730	}
731
732	blockStart = i
733
734	// find the end of the line
735	blockEnd = i
736	for i < len(data) && data[i-1] != '\n' {
737		i++
738	}
739
740	// get working buffer
741	var raw bytes.Buffer
742
743	// put the first line into the working buffer
744	raw.Write(data[blockEnd:i])
745	blockEnd = i
746
747	// process the following lines
748	containsBlankLine := false
749
750gatherLines:
751	for blockEnd < len(data) {
752		i++
753
754		// find the end of this line
755		for i < len(data) && data[i-1] != '\n' {
756			i++
757		}
758
759		// if it is an empty line, guess that it is part of this item
760		// and move on to the next line
761		if p.isEmpty(data[blockEnd:i]) > 0 {
762			containsBlankLine = true
763			blockEnd = i
764			continue
765		}
766
767		n := 0
768		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
769			// this is the end of the block.
770			// we don't want to include this last line in the index.
771			break gatherLines
772		}
773
774		// if there were blank lines before this one, insert a new one now
775		if containsBlankLine {
776			raw.WriteByte('\n')
777			containsBlankLine = false
778		}
779
780		// get rid of that first tab, write to buffer
781		raw.Write(data[blockEnd+n : i])
782		hasBlock = true
783
784		blockEnd = i
785	}
786
787	if data[blockEnd-1] != '\n' {
788		raw.WriteByte('\n')
789	}
790
791	contents = raw.Bytes()
792
793	return
794}
795
796//
797//
798// Miscellaneous helper functions
799//
800//
801
802// Test if a character is a punctuation symbol.
803// Taken from a private function in regexp in the stdlib.
804func ispunct(c byte) bool {
805	for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
806		if c == r {
807			return true
808		}
809	}
810	return false
811}
812
813// Test if a character is a whitespace character.
814func isspace(c byte) bool {
815	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
816}
817
818// Test if a character is letter.
819func isletter(c byte) bool {
820	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
821}
822
823// Test if a character is a letter or a digit.
824// TODO: check when this is looking for ASCII alnum and when it should use unicode
825func isalnum(c byte) bool {
826	return (c >= '0' && c <= '9') || isletter(c)
827}
828
829// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
830// always ends output with a newline
831func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
832	// first, check for common cases: no tabs, or only tabs at beginning of line
833	i, prefix := 0, 0
834	slowcase := false
835	for i = 0; i < len(line); i++ {
836		if line[i] == '\t' {
837			if prefix == i {
838				prefix++
839			} else {
840				slowcase = true
841				break
842			}
843		}
844	}
845
846	// no need to decode runes if all tabs are at the beginning of the line
847	if !slowcase {
848		for i = 0; i < prefix*tabSize; i++ {
849			out.WriteByte(' ')
850		}
851		out.Write(line[prefix:])
852		return
853	}
854
855	// the slow case: we need to count runes to figure out how
856	// many spaces to insert for each tab
857	column := 0
858	i = 0
859	for i < len(line) {
860		start := i
861		for i < len(line) && line[i] != '\t' {
862			_, size := utf8.DecodeRune(line[i:])
863			i += size
864			column++
865		}
866
867		if i > start {
868			out.Write(line[start:i])
869		}
870
871		if i >= len(line) {
872			break
873		}
874
875		for {
876			out.WriteByte(' ')
877			column++
878			if column%tabSize == 0 {
879				break
880			}
881		}
882
883		i++
884	}
885}
886
887// Find if a line counts as indented or not.
888// Returns number of characters the indent is (0 = not indented).
889func isIndented(data []byte, indentSize int) int {
890	if len(data) == 0 {
891		return 0
892	}
893	if data[0] == '\t' {
894		return 1
895	}
896	if len(data) < indentSize {
897		return 0
898	}
899	for i := 0; i < indentSize; i++ {
900		if data[i] != ' ' {
901			return 0
902		}
903	}
904	return indentSize
905}
906
907// Create a url-safe slug for fragments
908func slugify(in []byte) []byte {
909	if len(in) == 0 {
910		return in
911	}
912	out := make([]byte, 0, len(in))
913	sym := false
914
915	for _, ch := range in {
916		if isalnum(ch) {
917			sym = false
918			out = append(out, ch)
919		} else if sym {
920			continue
921		} else {
922			out = append(out, '-')
923			sym = true
924		}
925	}
926	var a, b int
927	var ch byte
928	for a, ch = range out {
929		if ch != '-' {
930			break
931		}
932	}
933	for b = len(out) - 1; b > 0; b-- {
934		if out[b] != '-' {
935			break
936		}
937	}
938	return out[a : b+1]
939}