all repos — grayfriday @ c29209fcdacb19244702838b1b5b5a4cb6337ffd

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Distributed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11//
 12// Markdown parsing and processing
 13//
 14//
 15
// Package blackfriday is a Markdown processor.
//
// It translates plain text with simple formatting rules into HTML or LaTeX.
 19package blackfriday
 20
 21import (
 22	"bytes"
 23	"fmt"
 24	"strings"
 25	"unicode/utf8"
 26)
 27
// VERSION is the version string declared by this package
// (presumably the release version of the library).
const VERSION = "1.4"
 29
 30// These are the supported markdown parsing extensions.
 31// OR these values together to select multiple extensions.
 32const (
 33	EXTENSION_NO_INTRA_EMPHASIS          = 1 << iota // ignore emphasis markers inside words
 34	EXTENSION_TABLES                                 // render tables
 35	EXTENSION_FENCED_CODE                            // render fenced code blocks
 36	EXTENSION_AUTOLINK                               // detect embedded URLs that are not explicitly marked
 37	EXTENSION_STRIKETHROUGH                          // strikethrough text using ~~test~~
 38	EXTENSION_LAX_HTML_BLOCKS                        // loosen up HTML block parsing rules
 39	EXTENSION_SPACE_HEADERS                          // be strict about prefix header rules
 40	EXTENSION_HARD_LINE_BREAK                        // translate newlines into line breaks
 41	EXTENSION_TAB_SIZE_EIGHT                         // expand tabs to eight spaces instead of four
 42	EXTENSION_FOOTNOTES                              // Pandoc-style footnotes
 43	EXTENSION_NO_EMPTY_LINE_BEFORE_BLOCK             // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
 44	EXTENSION_HEADER_IDS                             // specify header IDs  with {#id}
 45	EXTENSION_TITLEBLOCK                             // Titleblock ala pandoc
 46	EXTENSION_AUTO_HEADER_IDS                        // Create the header ID from the text
 47	EXTENSION_BACKSLASH_LINE_BREAK                   // translate trailing backslashes into line breaks
 48	EXTENSION_DEFINITION_LISTS                       // render definition lists
 49
 50	commonHtmlFlags = 0 |
 51		HTML_USE_XHTML |
 52		HTML_USE_SMARTYPANTS |
 53		HTML_SMARTYPANTS_FRACTIONS |
 54		HTML_SMARTYPANTS_DASHES |
 55		HTML_SMARTYPANTS_LATEX_DASHES
 56
 57	commonExtensions = 0 |
 58		EXTENSION_NO_INTRA_EMPHASIS |
 59		EXTENSION_TABLES |
 60		EXTENSION_FENCED_CODE |
 61		EXTENSION_AUTOLINK |
 62		EXTENSION_STRIKETHROUGH |
 63		EXTENSION_SPACE_HEADERS |
 64		EXTENSION_HEADER_IDS |
 65		EXTENSION_BACKSLASH_LINE_BREAK |
 66		EXTENSION_DEFINITION_LISTS
 67)
 68
// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
// The values are consecutive integers starting at zero.
const (
	LINK_TYPE_NOT_AUTOLINK = iota // presumably: the text is not an autolink
	LINK_TYPE_NORMAL              // presumably: an ordinary (URL) autolink
	LINK_TYPE_EMAIL               // presumably: an e-mail address autolink
)
 77
// These are the possible flag values for the ListItem renderer.
// Each value is a distinct bit, so multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	LIST_TYPE_ORDERED = 1 << iota
	LIST_TYPE_DEFINITION
	LIST_TYPE_TERM
	// LIST_ITEM_CONTAINS_BLOCK is set by secondPass for footnotes whose
	// reference has hasBlock set.
	LIST_ITEM_CONTAINS_BLOCK
	// LIST_ITEM_BEGINNING_OF_LIST is set by secondPass on the first footnote
	// item only.
	LIST_ITEM_BEGINNING_OF_LIST
	LIST_ITEM_END_OF_LIST
)
 89
 90// These are the possible flag values for the table cell renderer.
 91// Only a single one of these values will be used; they are not ORed together.
 92// These are mostly of interest if you are writing a new output format.
 93const (
 94	TABLE_ALIGNMENT_LEFT = 1 << iota
 95	TABLE_ALIGNMENT_RIGHT
 96	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
 97)
 98
// The size of a tab stop, in columns.
// firstPass uses TAB_SIZE_DEFAULT unless EXTENSION_TAB_SIZE_EIGHT is set,
// in which case it uses TAB_SIZE_EIGHT.
const (
	TAB_SIZE_DEFAULT = 4
	TAB_SIZE_EIGHT   = 8
)
104
// blockTags is the set of tag names recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
// Entries are sorted alphabetically within each group.
var blockTags = map[string]bool{
	"blockquote": true,
	"del":        true,
	"div":        true,
	"dl":         true,
	"fieldset":   true,
	"form":       true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"iframe":     true,
	"ins":        true,
	"math":       true,
	"noscript":   true,
	"ol":         true,
	"p":          true,
	"pre":        true,
	"script":     true,
	"style":      true,
	"table":      true,
	"ul":         true,

	// HTML5
	"article":    true,
	"aside":      true,
	"canvas":     true,
	"figcaption": true,
	"figure":     true,
	"footer":     true,
	"header":     true,
	"hgroup":     true,
	"output":     true,
	"progress":   true,
	"section":    true,
	"video":      true,
}
146
// Renderer is the rendering interface.
// It is mostly of interest when implementing a new output format;
// Html and Latex implementations are currently provided.
//
// Methods that receive a byte slice are given the (already rendered)
// contents of the element.
//
// Methods that receive a callback instead must invoke it to have the
// contents of the element written directly to the output buffer. The
// callback reports success by returning true; when it returns false, the
// rendering method should reset the output buffer as though it had never
// been called.
type Renderer interface {
	// Block-level callbacks.
	BlockCode(out *bytes.Buffer, text []byte, lang string)
	BlockQuote(out *bytes.Buffer, text []byte)
	BlockHtml(out *bytes.Buffer, text []byte)
	Header(out *bytes.Buffer, text func() bool, level int, id string)
	HRule(out *bytes.Buffer)
	List(out *bytes.Buffer, text func() bool, flags int)
	ListItem(out *bytes.Buffer, text []byte, flags int)
	Paragraph(out *bytes.Buffer, text func() bool)
	Table(out *bytes.Buffer, header, body []byte, columnData []int)
	TableRow(out *bytes.Buffer, text []byte)
	TableHeaderCell(out *bytes.Buffer, text []byte, flags int)
	TableCell(out *bytes.Buffer, text []byte, flags int)
	Footnotes(out *bytes.Buffer, text func() bool)
	FootnoteItem(out *bytes.Buffer, name, text []byte, flags int)
	TitleBlock(out *bytes.Buffer, text []byte)

	// Span-level callbacks.
	AutoLink(out *bytes.Buffer, link []byte, kind int)
	CodeSpan(out *bytes.Buffer, text []byte)
	DoubleEmphasis(out *bytes.Buffer, text []byte)
	Emphasis(out *bytes.Buffer, text []byte)
	Image(out *bytes.Buffer, link, title, alt []byte)
	LineBreak(out *bytes.Buffer)
	Link(out *bytes.Buffer, link, title, content []byte)
	RawHtmlTag(out *bytes.Buffer, tag []byte)
	TripleEmphasis(out *bytes.Buffer, text []byte)
	StrikeThrough(out *bytes.Buffer, text []byte)
	FootnoteRef(out *bytes.Buffer, ref []byte, id int)

	// Low-level callbacks.
	Entity(out *bytes.Buffer, entity []byte)
	NormalText(out *bytes.Buffer, text []byte)

	// Document header and footer, written by secondPass before and after
	// the rendered body.
	DocumentHeader(out *bytes.Buffer)
	DocumentFooter(out *bytes.Buffer)

	// GetFlags returns the renderer's flag set.
	GetFlags() int
}
200
// inlineParser is the signature of the callback functions used for inline
// parsing. One such function is registered in parser.inlineCallback for each
// character that triggers a response when parsing inline data.
// NOTE(review): the int result is presumably the number of bytes consumed
// from data — confirm against the inline parsing code.
type inlineParser func(p *parser, out *bytes.Buffer, data []byte, offset int) int
204
// parser holds runtime state used by the parser.
// It is constructed by MarkdownOptions (and therefore by every Markdown*
// convenience function).
type parser struct {
	r              Renderer              // output renderer supplied by the caller
	refOverride    ReferenceOverrideFunc // optional; consulted first by getRef
	refs           map[string]*reference // references found by isReference, keyed by lower-cased id
	inlineCallback [256]inlineParser     // inline parser per trigger byte; nil when none is registered
	flags          int                   // EXTENSION_* bits in effect
	nesting        int                   // must be back at zero when secondPass finishes
	maxNesting     int                   // set to 16 by MarkdownOptions
	insideLink     bool                  // starts false; presumably set while link text is being parsed

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference
}
222
223func (p *parser) getRef(refid string) (ref *reference, found bool) {
224	if p.refOverride != nil {
225		r, overridden := p.refOverride(refid)
226		if overridden {
227			if r == nil {
228				return nil, false
229			}
230			return &reference{
231				link:     []byte(r.Link),
232				title:    []byte(r.Title),
233				noteId:   0,
234				hasBlock: false,
235				text:     []byte(r.Text)}, true
236		}
237	}
238	// refs are case insensitive
239	ref, found = p.refs[strings.ToLower(refid)]
240	return ref, found
241}
242
243//
244//
245// Public interface
246//
247//
248
// Reference represents the details of a link.
// It is the value returned by a ReferenceOverrideFunc; see the documentation
// in Options for more details on the use-case.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
260
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not found.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
266
// Options represents configurable overrides and callbacks (in addition to the
// extension flag set) for configuring a Markdown parse.
// The zero value enables no extensions and installs no override.
type Options struct {
	// Extensions is a flag set of bit-wise ORed extension bits. See the
	// EXTENSION_* flags defined in this package.
	Extensions int

	// ReferenceOverride is an optional function callback that is called every
	// time a reference is resolved.
	//
	// In Markdown, the link reference syntax can be made to resolve a link to
	// a reference instead of an inline URL, in one of the following ways:
	//
	//  * [link text][refid]
	//  * [refid][]
	//
	// Usually, the refid is defined at the bottom of the Markdown document. If
	// this override function is provided, the refid is passed to the override
	// function first, before consulting the defined refids at the bottom. If
	// the override function indicates an override did not occur, the refids at
	// the bottom will be used to fill in the link details.
	ReferenceOverride ReferenceOverrideFunc
}
290
291// MarkdownBasic is a convenience function for simple rendering.
292// It processes markdown input with no extensions enabled.
293func MarkdownBasic(input []byte) []byte {
294	// set up the HTML renderer
295	htmlFlags := HTML_USE_XHTML
296	renderer := HtmlRenderer(htmlFlags, "", "")
297
298	// set up the parser
299	return MarkdownOptions(input, renderer, Options{Extensions: 0})
300}
301
302// Call Markdown with most useful extensions enabled
303// MarkdownCommon is a convenience function for simple rendering.
304// It processes markdown input with common extensions enabled, including:
305//
306// * Smartypants processing with smart fractions and LaTeX dashes
307//
308// * Intra-word emphasis suppression
309//
310// * Tables
311//
312// * Fenced code blocks
313//
314// * Autolinking
315//
316// * Strikethrough support
317//
318// * Strict header parsing
319//
320// * Custom Header IDs
321func MarkdownCommon(input []byte) []byte {
322	// set up the HTML renderer
323	renderer := HtmlRenderer(commonHtmlFlags, "", "")
324	return MarkdownOptions(input, renderer, Options{
325		Extensions: commonExtensions})
326}
327
328// Markdown is the main rendering function.
329// It parses and renders a block of markdown-encoded text.
330// The supplied Renderer is used to format the output, and extensions dictates
331// which non-standard extensions are enabled.
332//
333// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
334// LatexRenderer, respectively.
335func Markdown(input []byte, renderer Renderer, extensions int) []byte {
336	return MarkdownOptions(input, renderer, Options{
337		Extensions: extensions})
338}
339
340// MarkdownOptions is just like Markdown but takes additional options through
341// the Options struct.
342func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
343	// no point in parsing if we can't render
344	if renderer == nil {
345		return nil
346	}
347
348	extensions := opts.Extensions
349
350	// fill in the render structure
351	p := new(parser)
352	p.r = renderer
353	p.flags = extensions
354	p.refOverride = opts.ReferenceOverride
355	p.refs = make(map[string]*reference)
356	p.maxNesting = 16
357	p.insideLink = false
358
359	// register inline parsers
360	p.inlineCallback['*'] = emphasis
361	p.inlineCallback['_'] = emphasis
362	if extensions&EXTENSION_STRIKETHROUGH != 0 {
363		p.inlineCallback['~'] = emphasis
364	}
365	p.inlineCallback['`'] = codeSpan
366	p.inlineCallback['\n'] = lineBreak
367	p.inlineCallback['['] = link
368	p.inlineCallback['<'] = leftAngle
369	p.inlineCallback['\\'] = escape
370	p.inlineCallback['&'] = entity
371
372	if extensions&EXTENSION_AUTOLINK != 0 {
373		p.inlineCallback[':'] = autoLink
374	}
375
376	if extensions&EXTENSION_FOOTNOTES != 0 {
377		p.notes = make([]*reference, 0)
378	}
379
380	first := firstPass(p, input)
381	second := secondPass(p, first)
382	return second
383}
384
385// first pass:
386// - extract references
387// - expand tabs
388// - normalize newlines
389// - copy everything else
390func firstPass(p *parser, input []byte) []byte {
391	var out bytes.Buffer
392	tabSize := TAB_SIZE_DEFAULT
393	if p.flags&EXTENSION_TAB_SIZE_EIGHT != 0 {
394		tabSize = TAB_SIZE_EIGHT
395	}
396	beg, end := 0, 0
397	lastFencedCodeBlockEnd := 0
398	for beg < len(input) { // iterate over lines
399		if end = isReference(p, input[beg:], tabSize); end > 0 {
400			beg += end
401		} else { // skip to the next line
402			end = beg
403			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
404				end++
405			}
406
407			if p.flags&EXTENSION_FENCED_CODE != 0 {
408				// track fenced code block boundaries to suppress tab expansion
409				// inside them:
410				if beg >= lastFencedCodeBlockEnd {
411					if i := p.fencedCode(&out, input[beg:], false); i > 0 {
412						lastFencedCodeBlockEnd = beg + i
413					}
414				}
415			}
416
417			// add the line body if present
418			if end > beg {
419				if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
420					out.Write(input[beg:end])
421				} else {
422					expandTabs(&out, input[beg:end], tabSize)
423				}
424			}
425			out.WriteByte('\n')
426
427			if end < len(input) && input[end] == '\r' {
428				end++
429			}
430			if end < len(input) && input[end] == '\n' {
431				end++
432			}
433
434			beg = end
435		}
436	}
437
438	// empty input?
439	if out.Len() == 0 {
440		out.WriteByte('\n')
441	}
442
443	return out.Bytes()
444}
445
446// second pass: actual rendering
447func secondPass(p *parser, input []byte) []byte {
448	var output bytes.Buffer
449
450	p.r.DocumentHeader(&output)
451	p.block(&output, input)
452
453	if p.flags&EXTENSION_FOOTNOTES != 0 && len(p.notes) > 0 {
454		p.r.Footnotes(&output, func() bool {
455			flags := LIST_ITEM_BEGINNING_OF_LIST
456			for i := 0; i < len(p.notes); i += 1 {
457				var buf bytes.Buffer
458				ref := p.notes[i]
459				if ref.hasBlock {
460					flags |= LIST_ITEM_CONTAINS_BLOCK
461					p.block(&buf, ref.title)
462				} else {
463					p.inline(&buf, ref.title)
464				}
465				p.r.FootnoteItem(&output, ref.link, buf.Bytes(), flags)
466				flags &^= LIST_ITEM_BEGINNING_OF_LIST | LIST_ITEM_CONTAINS_BLOCK
467			}
468
469			return true
470		})
471	}
472
473	p.r.DocumentFooter(&output)
474
475	if p.nesting != 0 {
476		panic("Nesting level did not end at zero")
477	}
478
479	return output.Bytes()
480}
481
482//
483// Link references
484//
485// This section implements support for references that (usually) appear
486// as footnotes in a document, and can be referenced anywhere in the document.
487// The basic format is:
488//
489//    [1]: http://www.google.com/ "Google"
490//    [2]: http://www.github.com/ "Github"
491//
492// Anywhere in the document, the reference can be linked by referring to its
493// label, i.e., 1 and 2 in this example, as in:
494//
495//    This library is hosted on [Github][2], a git hosting site.
496//
497// Actual footnotes as specified in Pandoc and supported by some other Markdown
498// libraries such as php-markdown are also taken care of. They look like this:
499//
500//    This sentence needs a bit of further explanation.[^note]
501//
502//    [^note]: This is the explanation.
503//
504// Footnotes should be placed at the end of the document in an ordered list.
505// Inline footnotes such as:
506//
507//    Inline footnotes^[Not supported.] also exist.
508//
509// are not yet supported.
510
// References are parsed and stored in this struct.
// It is used both for link references and for footnotes.
type reference struct {
	link     []byte // link target; for footnotes, the footnote id (see isReference)
	title    []byte // link title; for footnotes, the footnote's text
	noteId   int    // 0 if not a footnote ref
	hasBlock bool   // footnote spans indented continuation lines (set by scanFootnote)
	text     []byte // override text; only filled in by getRef from a ReferenceOverrideFunc result
}
519
520func (r *reference) String() string {
521	return fmt.Sprintf("{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}",
522		r.link, r.title, r.text, r.noteId, r.hasBlock)
523}
524
525// Check whether or not data starts with a reference link.
526// If so, it is parsed and stored in the list of references
527// (in the render struct).
528// Returns the number of bytes to skip to move past it,
529// or zero if the first line is not a reference.
530func isReference(p *parser, data []byte, tabSize int) int {
531	// up to 3 optional leading spaces
532	if len(data) < 4 {
533		return 0
534	}
535	i := 0
536	for i < 3 && data[i] == ' ' {
537		i++
538	}
539
540	noteId := 0
541
542	// id part: anything but a newline between brackets
543	if data[i] != '[' {
544		return 0
545	}
546	i++
547	if p.flags&EXTENSION_FOOTNOTES != 0 {
548		if i < len(data) && data[i] == '^' {
549			// we can set it to anything here because the proper noteIds will
550			// be assigned later during the second pass. It just has to be != 0
551			noteId = 1
552			i++
553		}
554	}
555	idOffset := i
556	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
557		i++
558	}
559	if i >= len(data) || data[i] != ']' {
560		return 0
561	}
562	idEnd := i
563
564	// spacer: colon (space | tab)* newline? (space | tab)*
565	i++
566	if i >= len(data) || data[i] != ':' {
567		return 0
568	}
569	i++
570	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
571		i++
572	}
573	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
574		i++
575		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
576			i++
577		}
578	}
579	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
580		i++
581	}
582	if i >= len(data) {
583		return 0
584	}
585
586	var (
587		linkOffset, linkEnd   int
588		titleOffset, titleEnd int
589		lineEnd               int
590		raw                   []byte
591		hasBlock              bool
592	)
593
594	if p.flags&EXTENSION_FOOTNOTES != 0 && noteId != 0 {
595		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
596		lineEnd = linkEnd
597	} else {
598		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
599	}
600	if lineEnd == 0 {
601		return 0
602	}
603
604	// a valid ref has been found
605
606	ref := &reference{
607		noteId:   noteId,
608		hasBlock: hasBlock,
609	}
610
611	if noteId > 0 {
612		// reusing the link field for the id since footnotes don't have links
613		ref.link = data[idOffset:idEnd]
614		// if footnote, it's not really a title, it's the contained text
615		ref.title = raw
616	} else {
617		ref.link = data[linkOffset:linkEnd]
618		ref.title = data[titleOffset:titleEnd]
619	}
620
621	// id matches are case-insensitive
622	id := string(bytes.ToLower(data[idOffset:idEnd]))
623
624	p.refs[id] = ref
625
626	return lineEnd
627}
628
629func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
630	// link: whitespace-free sequence, optionally between angle brackets
631	if data[i] == '<' {
632		i++
633	}
634	linkOffset = i
635	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
636		i++
637	}
638	if i == len(data) {
639		return
640	}
641	linkEnd = i
642	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
643		linkOffset++
644		linkEnd--
645	}
646
647	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
648	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
649		i++
650	}
651	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
652		return
653	}
654
655	// compute end-of-line
656	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
657		lineEnd = i
658	}
659	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
660		lineEnd++
661	}
662
663	// optional (space|tab)* spacer after a newline
664	if lineEnd > 0 {
665		i = lineEnd + 1
666		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
667			i++
668		}
669	}
670
671	// optional title: any non-newline sequence enclosed in '"() alone on its line
672	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
673		i++
674		titleOffset = i
675
676		// look for EOL
677		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
678			i++
679		}
680		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
681			titleEnd = i + 1
682		} else {
683			titleEnd = i
684		}
685
686		// step back
687		i--
688		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
689			i--
690		}
691		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
692			lineEnd = titleEnd
693			titleEnd = i
694		}
695	}
696
697	return
698}
699
700// The first bit of this logic is the same as (*parser).listItem, but the rest
701// is much simpler. This function simply finds the entire block and shifts it
702// over by one tab if it is indeed a block (just returns the line if it's not).
703// blockEnd is the end of the section in the input buffer, and contents is the
704// extracted text that was shifted over one tab. It will need to be rendered at
705// the end of the document.
706func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
707	if i == 0 || len(data) == 0 {
708		return
709	}
710
711	// skip leading whitespace on first line
712	for i < len(data) && data[i] == ' ' {
713		i++
714	}
715
716	blockStart = i
717
718	// find the end of the line
719	blockEnd = i
720	for i < len(data) && data[i-1] != '\n' {
721		i++
722	}
723
724	// get working buffer
725	var raw bytes.Buffer
726
727	// put the first line into the working buffer
728	raw.Write(data[blockEnd:i])
729	blockEnd = i
730
731	// process the following lines
732	containsBlankLine := false
733
734gatherLines:
735	for blockEnd < len(data) {
736		i++
737
738		// find the end of this line
739		for i < len(data) && data[i-1] != '\n' {
740			i++
741		}
742
743		// if it is an empty line, guess that it is part of this item
744		// and move on to the next line
745		if p.isEmpty(data[blockEnd:i]) > 0 {
746			containsBlankLine = true
747			blockEnd = i
748			continue
749		}
750
751		n := 0
752		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
753			// this is the end of the block.
754			// we don't want to include this last line in the index.
755			break gatherLines
756		}
757
758		// if there were blank lines before this one, insert a new one now
759		if containsBlankLine {
760			raw.WriteByte('\n')
761			containsBlankLine = false
762		}
763
764		// get rid of that first tab, write to buffer
765		raw.Write(data[blockEnd+n : i])
766		hasBlock = true
767
768		blockEnd = i
769	}
770
771	if data[blockEnd-1] != '\n' {
772		raw.WriteByte('\n')
773	}
774
775	contents = raw.Bytes()
776
777	return
778}
779
780//
781//
782// Miscellaneous helper functions
783//
784//
785
// ispunct reports whether c is an ASCII punctuation symbol.
// The character set is taken from a private function in regexp in the stdlib.
func ispunct(c byte) bool {
	const punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
	return strings.IndexByte(punctuation, c) >= 0
}
796
// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
801
// isletter reports whether c is an ASCII letter (a-z or A-Z).
func isletter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
806
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
812
// expandTabs writes line to out with every tab character replaced by spaces,
// aligning to the next multiple of tabSize columns. Columns are counted in
// runes, not bytes. Nothing is appended after the line: the caller is
// responsible for writing the line terminator.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// Common cases first: no tabs at all, or tabs only at the very beginning
	// of the line. Neither requires decoding runes.
	leading := 0
	for leading < len(line) && line[leading] == '\t' {
		leading++
	}
	if bytes.IndexByte(line[leading:], '\t') < 0 {
		for n := leading * tabSize; n > 0; n-- {
			out.WriteByte(' ')
		}
		out.Write(line[leading:])
		return
	}

	// The slow case: runes have to be counted to know how many spaces each
	// tab stands for.
	column := 0
	for i := 0; i < len(line); {
		if line[i] == '\t' {
			// pad up to the next tab stop (always at least one space)
			for {
				out.WriteByte(' ')
				column++
				if column%tabSize == 0 {
					break
				}
			}
			i++
			continue
		}

		// copy the run of text up to the next tab, one column per rune
		start := i
		for i < len(line) && line[i] != '\t' {
			_, size := utf8.DecodeRune(line[i:])
			i += size
			column++
		}
		out.Write(line[start:i])
	}
}
870
// isIndented reports whether a line counts as indented: it starts either
// with a tab or with indentSize spaces.
// It returns the length of the indent in bytes (1 for a tab, indentSize for
// spaces), or 0 when the line is not indented.
func isIndented(data []byte, indentSize int) int {
	switch {
	case len(data) == 0:
		return 0
	case data[0] == '\t':
		return 1
	case len(data) < indentSize:
		return 0
	}

	// every one of the first indentSize bytes has to be a space
	for pos := 0; pos < indentSize; pos++ {
		if data[pos] != ' ' {
			return 0
		}
	}
	return indentSize
}
890
891// Create a url-safe slug for fragments
892func slugify(in []byte) []byte {
893	if len(in) == 0 {
894		return in
895	}
896	out := make([]byte, 0, len(in))
897	sym := false
898
899	for _, ch := range in {
900		if isalnum(ch) {
901			sym = false
902			out = append(out, ch)
903		} else if sym {
904			continue
905		} else {
906			out = append(out, '-')
907			sym = true
908		}
909	}
910	var a, b int
911	var ch byte
912	for a, ch = range out {
913		if ch != '-' {
914			break
915		}
916	}
917	for b = len(out) - 1; b > 0; b-- {
918		if out[b] != '-' {
919			break
920		}
921	}
922	return out[a : b+1]
923}