all repos — grayfriday @ c5943e068510d99ad28d07865734667a4b260775

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Distributed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11//
 12// Markdown parsing and processing
 13//
 14//
 15
 16// Blackfriday markdown processor.
 17//
 18// Translates plain text with simple formatting rules into HTML or LaTeX.
 19package blackfriday
 20
 21import (
 22	"bytes"
 23	"strings"
 24	"unicode/utf8"
 25)
 26
 27const VERSION = "1.4"
 28
 29// These are the supported markdown parsing extensions.
 30// OR these values together to select multiple extensions.
 31const (
 32	EXTENSION_NO_INTRA_EMPHASIS          = 1 << iota // ignore emphasis markers inside words
 33	EXTENSION_TABLES                                 // render tables
 34	EXTENSION_FENCED_CODE                            // render fenced code blocks
 35	EXTENSION_AUTOLINK                               // detect embedded URLs that are not explicitly marked
 36	EXTENSION_STRIKETHROUGH                          // strikethrough text using ~~test~~
 37	EXTENSION_LAX_HTML_BLOCKS                        // loosen up HTML block parsing rules
 38	EXTENSION_SPACE_HEADERS                          // be strict about prefix header rules
 39	EXTENSION_HARD_LINE_BREAK                        // translate newlines into line breaks
 40	EXTENSION_TAB_SIZE_EIGHT                         // expand tabs to eight spaces instead of four
 41	EXTENSION_FOOTNOTES                              // Pandoc-style footnotes
 42	EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK             // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
 43	EXTENSION_HEADER_IDS                             // specify header IDs  with {#id}
 44	EXTENSION_TITLEBLOCK                             // Titleblock ala pandoc
 45	EXTENSION_AUTO_HEADER_IDS                        // Create the header ID from the text
 46	EXTENSION_BACKSLASH_LINE_BREAK                   // translate trailing backslashes into line breaks
 47	EXTENSION_DEFINITION_LISTS                       // render definition lists
 48
 49	commonHtmlFlags = 0 |
 50		HTML_USE_XHTML |
 51		HTML_USE_SMARTYPANTS |
 52		HTML_SMARTYPANTS_FRACTIONS |
 53		HTML_SMARTYPANTS_DASHES |
 54		HTML_SMARTYPANTS_LATEX_DASHES
 55
 56	commonExtensions = 0 |
 57		EXTENSION_NO_INTRA_EMPHASIS |
 58		EXTENSION_TABLES |
 59		EXTENSION_FENCED_CODE |
 60		EXTENSION_AUTOLINK |
 61		EXTENSION_STRIKETHROUGH |
 62		EXTENSION_SPACE_HEADERS |
 63		EXTENSION_HEADER_IDS |
 64		EXTENSION_BACKSLASH_LINE_BREAK |
 65		EXTENSION_DEFINITION_LISTS
 66)
 67
 68// These are the possible flag values for the link renderer.
 69// Only a single one of these values will be used; they are not ORed together.
 70// These are mostly of interest if you are writing a new output format.
 71const (
 72	LINK_TYPE_NOT_AUTOLINK = iota
 73	LINK_TYPE_NORMAL
 74	LINK_TYPE_EMAIL
 75)
 76
 77// These are the possible flag values for the ListItem renderer.
 78// Multiple flag values may be ORed together.
 79// These are mostly of interest if you are writing a new output format.
 80const (
 81	LIST_TYPE_ORDERED = 1 << iota
 82	LIST_TYPE_DEFINITION
 83	LIST_TYPE_TERM
 84	LIST_ITEM_CONTAINS_BLOCK
 85	LIST_ITEM_BEGINNING_OF_LIST
 86	LIST_ITEM_END_OF_LIST
 87)
 88
 89// These are the possible flag values for the table cell renderer.
 90// Only a single one of these values will be used; they are not ORed together.
 91// These are mostly of interest if you are writing a new output format.
 92const (
 93	TABLE_ALIGNMENT_LEFT = 1 << iota
 94	TABLE_ALIGNMENT_RIGHT
 95	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
 96)
 97
 98// The size of a tab stop.
 99const (
100	TAB_SIZE_DEFAULT = 4
101	TAB_SIZE_EIGHT   = 8
102)
103
104// These are the tags that are recognized as HTML block tags.
105// Any of these can be included in markdown text without special escaping.
106var blockTags = map[string]bool{
107	"p":          true,
108	"dl":         true,
109	"h1":         true,
110	"h2":         true,
111	"h3":         true,
112	"h4":         true,
113	"h5":         true,
114	"h6":         true,
115	"ol":         true,
116	"ul":         true,
117	"del":        true,
118	"div":        true,
119	"ins":        true,
120	"pre":        true,
121	"form":       true,
122	"math":       true,
123	"table":      true,
124	"iframe":     true,
125	"script":     true,
126	"style":      true,
127	"fieldset":   true,
128	"noscript":   true,
129	"blockquote": true,
130
131	// HTML5
132	"video":      true,
133	"aside":      true,
134	"canvas":     true,
135	"figure":     true,
136	"footer":     true,
137	"header":     true,
138	"hgroup":     true,
139	"output":     true,
140	"article":    true,
141	"section":    true,
142	"progress":   true,
143	"figcaption": true,
144}
145
146// Renderer is the rendering interface.
147// This is mostly of interest if you are implementing a new rendering format.
148//
149// When a byte slice is provided, it contains the (rendered) contents of the
150// element.
151//
152// When a callback is provided instead, it will write the contents of the
153// respective element directly to the output buffer and return true on success.
154// If the callback returns false, the rendering function should reset the
155// output buffer as though it had never been called.
156//
157// Currently Html and Latex implementations are provided
158type Renderer interface {
159	// block-level callbacks
160	BlockCode(out *bytes.Buffer, text []byte, lang string)
161	BlockQuote(out *bytes.Buffer, text []byte)
162	BlockHtml(out *bytes.Buffer, text []byte)
163	Header(out *bytes.Buffer, text func() bool, level int, id string)
164	HRule(out *bytes.Buffer)
165	List(out *bytes.Buffer, text func() bool, flags int)
166	ListItem(out *bytes.Buffer, text []byte, flags int)
167	Paragraph(out *bytes.Buffer, text func() bool)
168	Table(out *bytes.Buffer, header []byte, body []byte, columnData []int)
169	TableRow(out *bytes.Buffer, text []byte)
170	TableHeaderCell(out *bytes.Buffer, text []byte, flags int)
171	TableCell(out *bytes.Buffer, text []byte, flags int)
172	Footnotes(out *bytes.Buffer, text func() bool)
173	FootnoteItem(out *bytes.Buffer, name, text []byte, flags int)
174	TitleBlock(out *bytes.Buffer, text []byte)
175
176	// Span-level callbacks
177	AutoLink(out *bytes.Buffer, link []byte, kind int)
178	CodeSpan(out *bytes.Buffer, text []byte)
179	DoubleEmphasis(out *bytes.Buffer, text []byte)
180	Emphasis(out *bytes.Buffer, text []byte)
181	Image(out *bytes.Buffer, link []byte, title []byte, alt []byte)
182	LineBreak(out *bytes.Buffer)
183	Link(out *bytes.Buffer, link []byte, title []byte, content []byte)
184	RawHtmlTag(out *bytes.Buffer, tag []byte)
185	TripleEmphasis(out *bytes.Buffer, text []byte)
186	StrikeThrough(out *bytes.Buffer, text []byte)
187	FootnoteRef(out *bytes.Buffer, ref []byte, id int)
188
189	// Low-level callbacks
190	Entity(out *bytes.Buffer, entity []byte)
191	NormalText(out *bytes.Buffer, text []byte)
192
193	// Header and footer
194	DocumentHeader(out *bytes.Buffer)
195	DocumentFooter(out *bytes.Buffer)
196
197	GetFlags() int
198}
199
200// Callback functions for inline parsing. One such function is defined
201// for each character that triggers a response when parsing inline data.
202type inlineParser func(p *parser, out *bytes.Buffer, data []byte, offset int) int
203
204// Parser holds runtime state used by the parser.
205// This is constructed by the Markdown function.
206type parser struct {
207	r              Renderer
208	refOverride    ReferenceOverrideFunc
209	refs           map[string]*reference
210	inlineCallback [256]inlineParser
211	flags          int
212	nesting        int
213	maxNesting     int
214	insideLink     bool
215
216	// Footnotes need to be ordered as well as available to quickly check for
217	// presence. If a ref is also a footnote, it's stored both in refs and here
218	// in notes. Slice is nil if footnotes not enabled.
219	notes []*reference
220}
221
222func (p *parser) getRef(refid string) (ref *reference, found bool) {
223	if p.refOverride != nil {
224		r, overridden := p.refOverride(refid)
225		if overridden {
226			if r == nil {
227				return nil, false
228			}
229			return &reference{
230				link:     []byte(r.Link),
231				title:    []byte(r.Title),
232				noteId:   0,
233				hasBlock: false,
234				text:     []byte(r.Text)}, true
235		}
236	}
237	// refs are case insensitive
238	ref, found = p.refs[strings.ToLower(refid)]
239	return ref, found
240}
241
242//
243//
244// Public interface
245//
246//
247
248// Reference represents the details of a link.
249// See the documentation in Options for more details on use-case.
250type Reference struct {
251	// Link is usually the URL the reference points to.
252	Link string
253	// Title is the alternate text describing the link in more detail.
254	Title string
255	// Text is the optional text to override the ref with if the syntax used was
256	// [refid][]
257	Text string
258}
259
260// ReferenceOverrideFunc is expected to be called with a reference string and
261// return either a valid Reference type that the reference string maps to or
262// nil. If overridden is false, the default reference logic will be executed.
263// See the documentation in Options for more details on use-case.
264type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
265
266// Options represents configurable overrides and callbacks (in addition to the
267// extension flag set) for configuring a Markdown parse.
268type Options struct {
269	// Extensions is a flag set of bit-wise ORed extension bits. See the
270	// EXTENSION_* flags defined in this package.
271	Extensions int
272
273	// ReferenceOverride is an optional function callback that is called every
274	// time a reference is resolved.
275	//
276	// In Markdown, the link reference syntax can be made to resolve a link to
277	// a reference instead of an inline URL, in one of the following ways:
278	//
279	//  * [link text][refid]
280	//  * [refid][]
281	//
282	// Usually, the refid is defined at the bottom of the Markdown document. If
283	// this override function is provided, the refid is passed to the override
284	// function first, before consulting the defined refids at the bottom. If
285	// the override function indicates an override did not occur, the refids at
286	// the bottom will be used to fill in the link details.
287	ReferenceOverride ReferenceOverrideFunc
288}
289
290// MarkdownBasic is a convenience function for simple rendering.
291// It processes markdown input with no extensions enabled.
292func MarkdownBasic(input []byte) []byte {
293	// set up the HTML renderer
294	htmlFlags := HTML_USE_XHTML
295	renderer := HtmlRenderer(htmlFlags, "", "")
296
297	// set up the parser
298	return MarkdownOptions(input, renderer, Options{Extensions: 0})
299}
300
301// Call Markdown with most useful extensions enabled
302// MarkdownCommon is a convenience function for simple rendering.
303// It processes markdown input with common extensions enabled, including:
304//
305// * Smartypants processing with smart fractions and LaTeX dashes
306//
307// * Intra-word emphasis suppression
308//
309// * Tables
310//
311// * Fenced code blocks
312//
313// * Autolinking
314//
315// * Strikethrough support
316//
317// * Strict header parsing
318//
319// * Custom Header IDs
320func MarkdownCommon(input []byte) []byte {
321	// set up the HTML renderer
322	renderer := HtmlRenderer(commonHtmlFlags, "", "")
323	return MarkdownOptions(input, renderer, Options{
324		Extensions: commonExtensions})
325}
326
327// Markdown is the main rendering function.
328// It parses and renders a block of markdown-encoded text.
329// The supplied Renderer is used to format the output, and extensions dictates
330// which non-standard extensions are enabled.
331//
332// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
333// LatexRenderer, respectively.
334func Markdown(input []byte, renderer Renderer, extensions int) []byte {
335	return MarkdownOptions(input, renderer, Options{
336		Extensions: extensions})
337}
338
339// MarkdownOptions is just like Markdown but takes additional options through
340// the Options struct.
341func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
342	// no point in parsing if we can't render
343	if renderer == nil {
344		return nil
345	}
346
347	extensions := opts.Extensions
348
349	// fill in the render structure
350	p := new(parser)
351	p.r = renderer
352	p.flags = extensions
353	p.refOverride = opts.ReferenceOverride
354	p.refs = make(map[string]*reference)
355	p.maxNesting = 16
356	p.insideLink = false
357
358	// register inline parsers
359	p.inlineCallback['*'] = emphasis
360	p.inlineCallback['_'] = emphasis
361	if extensions&EXTENSION_STRIKETHROUGH != 0 {
362		p.inlineCallback['~'] = emphasis
363	}
364	p.inlineCallback['`'] = codeSpan
365	p.inlineCallback['\n'] = lineBreak
366	p.inlineCallback['['] = link
367	p.inlineCallback['<'] = leftAngle
368	p.inlineCallback['\\'] = escape
369	p.inlineCallback['&'] = entity
370
371	if extensions&EXTENSION_AUTOLINK != 0 {
372		p.inlineCallback[':'] = autoLink
373	}
374
375	if extensions&EXTENSION_FOOTNOTES != 0 {
376		p.notes = make([]*reference, 0)
377	}
378
379	first := firstPass(p, input)
380	second := secondPass(p, first)
381	return second
382}
383
384// first pass:
385// - extract references
386// - expand tabs
387// - normalize newlines
388// - copy everything else
389func firstPass(p *parser, input []byte) []byte {
390	var out bytes.Buffer
391	tabSize := TAB_SIZE_DEFAULT
392	if p.flags&EXTENSION_TAB_SIZE_EIGHT != 0 {
393		tabSize = TAB_SIZE_EIGHT
394	}
395	beg, end := 0, 0
396	lastFencedCodeBlockEnd := 0
397	for beg < len(input) { // iterate over lines
398		if end = isReference(p, input[beg:], tabSize); end > 0 {
399			beg += end
400		} else { // skip to the next line
401			end = beg
402			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
403				end++
404			}
405
406			if p.flags&EXTENSION_FENCED_CODE != 0 {
407				// track fenced code block boundaries to suppress tab expansion
408				// inside them:
409				if beg >= lastFencedCodeBlockEnd {
410					if i := p.fencedCode(&out, input[beg:], false); i > 0 {
411						lastFencedCodeBlockEnd = beg + i
412					}
413				}
414			}
415
416			// add the line body if present
417			if end > beg {
418				if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
419					out.Write(input[beg:end])
420				} else {
421					expandTabs(&out, input[beg:end], tabSize)
422				}
423			}
424			out.WriteByte('\n')
425
426			if end < len(input) && input[end] == '\r' {
427				end++
428			}
429			if end < len(input) && input[end] == '\n' {
430				end++
431			}
432
433			beg = end
434		}
435	}
436
437	// empty input?
438	if out.Len() == 0 {
439		out.WriteByte('\n')
440	}
441
442	return out.Bytes()
443}
444
445// second pass: actual rendering
446func secondPass(p *parser, input []byte) []byte {
447	var output bytes.Buffer
448
449	p.r.DocumentHeader(&output)
450	p.block(&output, input)
451
452	if p.flags&EXTENSION_FOOTNOTES != 0 && len(p.notes) > 0 {
453		p.r.Footnotes(&output, func() bool {
454			flags := LIST_ITEM_BEGINNING_OF_LIST
455			for _, ref := range p.notes {
456				var buf bytes.Buffer
457				if ref.hasBlock {
458					flags |= LIST_ITEM_CONTAINS_BLOCK
459					p.block(&buf, ref.title)
460				} else {
461					p.inline(&buf, ref.title)
462				}
463				p.r.FootnoteItem(&output, ref.link, buf.Bytes(), flags)
464				flags &^= LIST_ITEM_BEGINNING_OF_LIST | LIST_ITEM_CONTAINS_BLOCK
465			}
466
467			return true
468		})
469	}
470
471	p.r.DocumentFooter(&output)
472
473	if p.nesting != 0 {
474		panic("Nesting level did not end at zero")
475	}
476
477	return output.Bytes()
478}
479
480//
481// Link references
482//
483// This section implements support for references that (usually) appear
484// as footnotes in a document, and can be referenced anywhere in the document.
485// The basic format is:
486//
487//    [1]: http://www.google.com/ "Google"
488//    [2]: http://www.github.com/ "Github"
489//
490// Anywhere in the document, the reference can be linked by referring to its
491// label, i.e., 1 and 2 in this example, as in:
492//
493//    This library is hosted on [Github][2], a git hosting site.
494//
495// Actual footnotes as specified in Pandoc and supported by some other Markdown
496// libraries such as php-markdown are also taken care of. They look like this:
497//
498//    This sentence needs a bit of further explanation.[^note]
499//
500//    [^note]: This is the explanation.
501//
502// Footnotes should be placed at the end of the document in an ordered list.
503// Inline footnotes such as:
504//
505//    Inline footnotes^[Not supported.] also exist.
506//
507// are not yet supported.
508
509// References are parsed and stored in this struct.
510type reference struct {
511	link     []byte
512	title    []byte
513	noteId   int // 0 if not a footnote ref
514	hasBlock bool
515	text     []byte
516}
517
518// Check whether or not data starts with a reference link.
519// If so, it is parsed and stored in the list of references
520// (in the render struct).
521// Returns the number of bytes to skip to move past it,
522// or zero if the first line is not a reference.
523func isReference(p *parser, data []byte, tabSize int) int {
524	// up to 3 optional leading spaces
525	if len(data) < 4 {
526		return 0
527	}
528	i := 0
529	for i < 3 && data[i] == ' ' {
530		i++
531	}
532
533	noteId := 0
534
535	// id part: anything but a newline between brackets
536	if data[i] != '[' {
537		return 0
538	}
539	i++
540	if p.flags&EXTENSION_FOOTNOTES != 0 {
541		if i < len(data) && data[i] == '^' {
542			// we can set it to anything here because the proper noteIds will
543			// be assigned later during the second pass. It just has to be != 0
544			noteId = 1
545			i++
546		}
547	}
548	idOffset := i
549	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
550		i++
551	}
552	if i >= len(data) || data[i] != ']' {
553		return 0
554	}
555	idEnd := i
556
557	// spacer: colon (space | tab)* newline? (space | tab)*
558	i++
559	if i >= len(data) || data[i] != ':' {
560		return 0
561	}
562	i++
563	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
564		i++
565	}
566	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
567		i++
568		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
569			i++
570		}
571	}
572	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
573		i++
574	}
575	if i >= len(data) {
576		return 0
577	}
578
579	var (
580		linkOffset, linkEnd   int
581		titleOffset, titleEnd int
582		lineEnd               int
583		raw                   []byte
584		hasBlock              bool
585	)
586
587	if p.flags&EXTENSION_FOOTNOTES != 0 && noteId != 0 {
588		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
589		lineEnd = linkEnd
590	} else {
591		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
592	}
593	if lineEnd == 0 {
594		return 0
595	}
596
597	// a valid ref has been found
598
599	ref := &reference{
600		noteId:   noteId,
601		hasBlock: hasBlock,
602	}
603
604	if noteId > 0 {
605		// reusing the link field for the id since footnotes don't have links
606		ref.link = data[idOffset:idEnd]
607		// if footnote, it's not really a title, it's the contained text
608		ref.title = raw
609	} else {
610		ref.link = data[linkOffset:linkEnd]
611		ref.title = data[titleOffset:titleEnd]
612	}
613
614	// id matches are case-insensitive
615	id := string(bytes.ToLower(data[idOffset:idEnd]))
616
617	p.refs[id] = ref
618
619	return lineEnd
620}
621
622func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
623	// link: whitespace-free sequence, optionally between angle brackets
624	if data[i] == '<' {
625		i++
626	}
627	linkOffset = i
628	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
629		i++
630	}
631	if i == len(data) {
632		return
633	}
634	linkEnd = i
635	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
636		linkOffset++
637		linkEnd--
638	}
639
640	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
641	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
642		i++
643	}
644	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
645		return
646	}
647
648	// compute end-of-line
649	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
650		lineEnd = i
651	}
652	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
653		lineEnd++
654	}
655
656	// optional (space|tab)* spacer after a newline
657	if lineEnd > 0 {
658		i = lineEnd + 1
659		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
660			i++
661		}
662	}
663
664	// optional title: any non-newline sequence enclosed in '"() alone on its line
665	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
666		i++
667		titleOffset = i
668
669		// look for EOL
670		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
671			i++
672		}
673		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
674			titleEnd = i + 1
675		} else {
676			titleEnd = i
677		}
678
679		// step back
680		i--
681		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
682			i--
683		}
684		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
685			lineEnd = titleEnd
686			titleEnd = i
687		}
688	}
689
690	return
691}
692
693// The first bit of this logic is the same as (*parser).listItem, but the rest
694// is much simpler. This function simply finds the entire block and shifts it
695// over by one tab if it is indeed a block (just returns the line if it's not).
696// blockEnd is the end of the section in the input buffer, and contents is the
697// extracted text that was shifted over one tab. It will need to be rendered at
698// the end of the document.
699func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
700	if i == 0 || len(data) == 0 {
701		return
702	}
703
704	// skip leading whitespace on first line
705	for i < len(data) && data[i] == ' ' {
706		i++
707	}
708
709	blockStart = i
710
711	// find the end of the line
712	blockEnd = i
713	for i < len(data) && data[i-1] != '\n' {
714		i++
715	}
716
717	// get working buffer
718	var raw bytes.Buffer
719
720	// put the first line into the working buffer
721	raw.Write(data[blockEnd:i])
722	blockEnd = i
723
724	// process the following lines
725	containsBlankLine := false
726
727gatherLines:
728	for blockEnd < len(data) {
729		i++
730
731		// find the end of this line
732		for i < len(data) && data[i-1] != '\n' {
733			i++
734		}
735
736		// if it is an empty line, guess that it is part of this item
737		// and move on to the next line
738		if p.isEmpty(data[blockEnd:i]) > 0 {
739			containsBlankLine = true
740			blockEnd = i
741			continue
742		}
743
744		n := 0
745		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
746			// this is the end of the block.
747			// we don't want to include this last line in the index.
748			break gatherLines
749		}
750
751		// if there were blank lines before this one, insert a new one now
752		if containsBlankLine {
753			raw.WriteByte('\n')
754			containsBlankLine = false
755		}
756
757		// get rid of that first tab, write to buffer
758		raw.Write(data[blockEnd+n : i])
759		hasBlock = true
760
761		blockEnd = i
762	}
763
764	if data[blockEnd-1] != '\n' {
765		raw.WriteByte('\n')
766	}
767
768	contents = raw.Bytes()
769
770	return
771}
772
773//
774//
775// Miscellaneous helper functions
776//
777//
778
779// Test if a character is a punctuation symbol.
780// Taken from a private function in regexp in the stdlib.
781func ispunct(c byte) bool {
782	for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
783		if c == r {
784			return true
785		}
786	}
787	return false
788}
789
790// Test if a character is a whitespace character.
791func isspace(c byte) bool {
792	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
793}
794
795// Test if a character is letter.
796func isletter(c byte) bool {
797	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
798}
799
800// Test if a character is a letter or a digit.
801// TODO: check when this is looking for ASCII alnum and when it should use unicode
802func isalnum(c byte) bool {
803	return (c >= '0' && c <= '9') || isletter(c)
804}
805
806// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
807// always ends output with a newline
808func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
809	// first, check for common cases: no tabs, or only tabs at beginning of line
810	i, prefix := 0, 0
811	slowcase := false
812	for i = 0; i < len(line); i++ {
813		if line[i] == '\t' {
814			if prefix == i {
815				prefix++
816			} else {
817				slowcase = true
818				break
819			}
820		}
821	}
822
823	// no need to decode runes if all tabs are at the beginning of the line
824	if !slowcase {
825		for i = 0; i < prefix*tabSize; i++ {
826			out.WriteByte(' ')
827		}
828		out.Write(line[prefix:])
829		return
830	}
831
832	// the slow case: we need to count runes to figure out how
833	// many spaces to insert for each tab
834	column := 0
835	i = 0
836	for i < len(line) {
837		start := i
838		for i < len(line) && line[i] != '\t' {
839			_, size := utf8.DecodeRune(line[i:])
840			i += size
841			column++
842		}
843
844		if i > start {
845			out.Write(line[start:i])
846		}
847
848		if i >= len(line) {
849			break
850		}
851
852		for {
853			out.WriteByte(' ')
854			column++
855			if column%tabSize == 0 {
856				break
857			}
858		}
859
860		i++
861	}
862}
863
864// Find if a line counts as indented or not.
865// Returns number of characters the indent is (0 = not indented).
866func isIndented(data []byte, indentSize int) int {
867	if len(data) == 0 {
868		return 0
869	}
870	if data[0] == '\t' {
871		return 1
872	}
873	if len(data) < indentSize {
874		return 0
875	}
876	for i := 0; i < indentSize; i++ {
877		if data[i] != ' ' {
878			return 0
879		}
880	}
881	return indentSize
882}
883
884// Create a url-safe slug for fragments
885func slugify(in []byte) []byte {
886	if len(in) == 0 {
887		return in
888	}
889	out := make([]byte, 0, len(in))
890	sym := false
891
892	for _, ch := range in {
893		if isalnum(ch) {
894			sym = false
895			out = append(out, ch)
896		} else if sym {
897			continue
898		} else {
899			out = append(out, '-')
900			sym = true
901		}
902	}
903	var a, b int
904	var ch byte
905	for a, ch = range out {
906		if ch != '-' {
907			break
908		}
909	}
910	for b = len(out) - 1; b > 0; b-- {
911		if out[b] != '-' {
912			break
913		}
914	}
915	return out[a : b+1]
916}