all repos — grayfriday @ 2bbed304fbd05594bb4dd76154add618f70d1e7c

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Distributed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11//
 12// Markdown parsing and processing
 13//
 14//
 15
 16// Blackfriday markdown processor.
 17//
 18// Translates plain text with simple formatting rules into HTML or LaTeX.
 19package blackfriday
 20
 21import (
 22	"bytes"
 23	"fmt"
 24	"strings"
 25	"unicode/utf8"
 26)
 27
// VERSION is the version string of this package.
const VERSION = "1.4"
 29
// Extensions is a bit set selecting optional parsing features. The parser
// tests individual flags with a bitwise AND (for example
// extensions&Strikethrough != 0).
type Extensions int

// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// Note: NoExtensions occupies position 0 of this const block, so iota is
// already 1 on the NoIntraEmphasis line. The first real flag therefore has
// the value 2 and bit 0 is never used.
const (
	NoExtensions           Extensions = 0
	NoIntraEmphasis        Extensions = 1 << iota // Ignore emphasis markers inside words
	Tables                                        // Render tables
	FencedCode                                    // Render fenced code blocks
	Autolink                                      // Detect embedded URLs that are not explicitly marked
	Strikethrough                                 // Strikethrough text using ~~test~~
	LaxHTMLBlocks                                 // Loosen up HTML block parsing rules
	SpaceHeaders                                  // Be strict about prefix header rules
	HardLineBreak                                 // Translate newlines into line breaks
	TabSizeEight                                  // Expand tabs to eight spaces instead of four
	Footnotes                                     // Pandoc-style footnotes
	NoEmptyLineBeforeBlock                        // No need to insert an empty line to start a (code, quote, ordered list, unordered list) block
	HeaderIDs                                     // specify header IDs  with {#id}
	Titleblock                                    // Titleblock ala pandoc
	AutoHeaderIDs                                 // Create the header ID from the text
	BackslashLineBreak                            // Translate trailing backslashes into line breaks
	DefinitionLists                               // Render definition lists

	// commonHtmlFlags is the renderer flag set used by MarkdownCommon.
	commonHtmlFlags HtmlFlags = UseXHTML | UseSmartypants |
		SmartypantsFractions | SmartypantsDashes | SmartypantsLatexDashes

	// commonExtensions is the parser extension set used by MarkdownCommon.
	commonExtensions Extensions = NoIntraEmphasis | Tables | FencedCode |
		Autolink | Strikethrough | SpaceHeaders | HeaderIDs |
		BackslashLineBreak | DefinitionLists
)
 60
// LinkType tells Renderer.AutoLink what kind of link it is given.
type LinkType int

// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	LinkTypeNotAutolink LinkType = iota
	LinkTypeNormal
	LinkTypeEmail
)
 71
// ListType is a bit set describing a list or a list item; it is passed to the
// BeginList, EndList, ListItem and FootnoteItem renderer callbacks.
type ListType int

// These are the possible flag values for the ListItem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
//
// All six constants share the ListType type and the 1 << iota expression;
// the blank line only groups them visually.
const (
	ListTypeOrdered ListType = 1 << iota
	ListTypeDefinition
	ListTypeTerm

	ListItemContainsBlock
	ListItemBeginningOfList
	ListItemEndOfList
)
 86
// TableFlags names the table cell alignment flag set. Note that the constants
// below are declared without a type, so they are untyped integer constants
// rather than TableFlags values; the table cell callbacks of Renderer take
// their flags as a plain int.
type TableFlags int

// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// TableAlignmentCenter is defined as the combination of the left and right
// bits, so it has the value 3.
const (
	TableAlignmentLeft = 1 << iota
	TableAlignmentRight
	TableAlignmentCenter = (TableAlignmentLeft | TableAlignmentRight)
)
 97
// The size of a tab stop.
// firstPass uses TabSizeDefault unless the TabSizeEight extension is set, in
// which case it uses TabSizeDouble.
const (
	TabSizeDefault = 4
	TabSizeDouble  = 8
)
103
// blockTags is the set of tag names treated as HTML block-level tags.
// Any of these can be included in markdown text without special escaping.
// The map is used purely as a set: only key presence matters.
var blockTags = map[string]struct{}{
	"blockquote": {},
	"del":        {},
	"div":        {},
	"dl":         {},
	"fieldset":   {},
	"form":       {},
	"h1":         {},
	"h2":         {},
	"h3":         {},
	"h4":         {},
	"h5":         {},
	"h6":         {},
	"iframe":     {},
	"ins":        {},
	"math":       {},
	"noscript":   {},
	"ol":         {},
	"pre":        {},
	"p":          {},
	"script":     {},
	"style":      {},
	"table":      {},
	"ul":         {},

	// HTML5
	"address":    {},
	"article":    {},
	"aside":      {},
	"canvas":     {},
	"figcaption": {},
	"figure":     {},
	"footer":     {},
	"header":     {},
	"hgroup":     {},
	"main":       {},
	"nav":        {},
	"output":     {},
	"progress":   {},
	"section":    {},
	"video":      {},
}
148
// Renderer is the rendering interface.
// This is mostly of interest if you are implementing a new rendering format.
//
// When a byte slice is provided, it contains the (rendered) contents of the
// element.
//
// When a callback is provided instead, it will write the contents of the
// respective element directly to the output buffer and return true on success.
// If the callback returns false, the rendering function should reset the
// output buffer as though it had never been called.
//
// Currently Html and Latex implementations are provided
type Renderer interface {
	// block-level callbacks
	BlockCode(text []byte, lang string)
	BlockQuote(text []byte)
	BlockHtml(text []byte)
	BeginHeader(level int, id string)
	EndHeader(level int, id string, header []byte)
	HRule()
	BeginList(flags ListType)
	EndList(flags ListType)
	ListItem(text []byte, flags ListType)
	BeginParagraph()
	EndParagraph()
	Table(header []byte, body []byte, columnData []int)
	TableRow(text []byte)
	TableHeaderCell(out *bytes.Buffer, text []byte, flags int)
	TableCell(out *bytes.Buffer, text []byte, flags int)
	// BeginFootnotes and EndFootnotes bracket the FootnoteItem calls that
	// secondPass makes, one per collected footnote.
	BeginFootnotes()
	EndFootnotes()
	FootnoteItem(name, text []byte, flags ListType)
	TitleBlock(text []byte)

	// Span-level callbacks
	AutoLink(link []byte, kind LinkType)
	CodeSpan(text []byte)
	DoubleEmphasis(text []byte)
	Emphasis(text []byte)
	Image(link []byte, title []byte, alt []byte)
	LineBreak()
	Link(link []byte, title []byte, content []byte)
	RawHtmlTag(tag []byte)
	TripleEmphasis(text []byte)
	StrikeThrough(text []byte)
	FootnoteRef(ref []byte, id int)

	// Low-level callbacks
	Entity(entity []byte)
	NormalText(text []byte)

	// Header and footer.
	// secondPass calls DocumentHeader before rendering any block and
	// DocumentFooter after the footnotes (if any) have been emitted.
	DocumentHeader()
	DocumentFooter()

	GetFlags() HtmlFlags
	// CaptureWrites runs processor and returns bytes for it; secondPass uses
	// the result as the rendered text of a footnote.
	// NOTE(review): presumably the captured output is withheld from the main
	// result while CopyWrites also lets it through — confirm against the
	// implementations.
	CaptureWrites(processor func()) []byte
	CopyWrites(processor func()) []byte
	Write(b []byte) (int, error)
	// GetResult returns the finished document; secondPass returns it to the
	// caller as the output of the whole conversion.
	GetResult() []byte
}
210
// Callback functions for inline parsing. One such function is defined
// for each character that triggers a response when parsing inline data.
// The functions are stored in parser.inlineCallback, indexed by the
// triggering byte.
// NOTE(review): the int result is presumably the number of bytes consumed —
// confirm against the inline parsing code.
type inlineParser func(p *parser, data []byte, offset int) int
214
// parser holds runtime state used by the parser.
// This is constructed by the MarkdownOptions function.
type parser struct {
	r              Renderer              // output renderer; never nil once constructed
	refOverride    ReferenceOverrideFunc // optional hook consulted first by getRef
	refs           map[string]*reference // references found by isReference, keyed by lower-cased id
	inlineCallback [256]inlineParser     // inline handlers indexed by triggering byte; nil means no handler
	flags          Extensions            // enabled parsing extensions
	nesting        int                   // must be back at zero when secondPass finishes
	maxNesting     int                   // set to 16 by MarkdownOptions
	insideLink     bool                  // starts out false

	// Footnotes need to be ordered as well as available to quickly check for
	// presence. If a ref is also a footnote, it's stored both in refs and here
	// in notes. Slice is nil if footnotes not enabled.
	notes []*reference
}
232
233func (p *parser) getRef(refid string) (ref *reference, found bool) {
234	if p.refOverride != nil {
235		r, overridden := p.refOverride(refid)
236		if overridden {
237			if r == nil {
238				return nil, false
239			}
240			return &reference{
241				link:     []byte(r.Link),
242				title:    []byte(r.Title),
243				noteId:   0,
244				hasBlock: false,
245				text:     []byte(r.Text)}, true
246		}
247	}
248	// refs are case insensitive
249	ref, found = p.refs[strings.ToLower(refid)]
250	return ref, found
251}
252
253//
254//
255// Public interface
256//
257//
258
// Reference represents the details of a link.
// See the documentation in Options for more details on use-case.
// Values of this type are produced by a ReferenceOverrideFunc; the parser
// copies the three fields into its internal representation.
type Reference struct {
	// Link is usually the URL the reference points to.
	Link string
	// Title is the alternate text describing the link in more detail.
	Title string
	// Text is the optional text to override the ref with if the syntax used was
	// [refid][]
	Text string
}
270
// ReferenceOverrideFunc is expected to be called with a reference string and
// return either a valid Reference type that the reference string maps to or
// nil. If overridden is false, the default reference logic will be executed.
// If overridden is true and ref is nil, the reference is treated as not
// found and the default logic is not consulted.
// See the documentation in Options for more details on use-case.
type ReferenceOverrideFunc func(reference string) (ref *Reference, overridden bool)
276
// Options represents configurable overrides and callbacks (in addition to the
// extension flag set) for configuring a Markdown parse.
// The zero value selects no extensions and no reference override.
type Options struct {
	// Extensions is a flag set of bit-wise ORed extension bits. See the
	// Extensions flags defined in this package.
	Extensions Extensions

	// ReferenceOverride is an optional function callback that is called every
	// time a reference is resolved.
	//
	// In Markdown, the link reference syntax can be made to resolve a link to
	// a reference instead of an inline URL, in one of the following ways:
	//
	//  * [link text][refid]
	//  * [refid][]
	//
	// Usually, the refid is defined at the bottom of the Markdown document. If
	// this override function is provided, the refid is passed to the override
	// function first, before consulting the defined refids at the bottom. If
	// the override function indicates an override did not occur, the refids at
	// the bottom will be used to fill in the link details.
	ReferenceOverride ReferenceOverrideFunc
}
300
301// MarkdownBasic is a convenience function for simple rendering.
302// It processes markdown input with no extensions enabled.
303func MarkdownBasic(input []byte) []byte {
304	// set up the HTML renderer
305	htmlFlags := UseXHTML
306	renderer := HtmlRenderer(htmlFlags, "", "")
307
308	// set up the parser
309	return MarkdownOptions(input, renderer, Options{Extensions: 0})
310}
311
312// Call Markdown with most useful extensions enabled
313// MarkdownCommon is a convenience function for simple rendering.
314// It processes markdown input with common extensions enabled, including:
315//
316// * Smartypants processing with smart fractions and LaTeX dashes
317//
318// * Intra-word emphasis suppression
319//
320// * Tables
321//
322// * Fenced code blocks
323//
324// * Autolinking
325//
326// * Strikethrough support
327//
328// * Strict header parsing
329//
330// * Custom Header IDs
331func MarkdownCommon(input []byte) []byte {
332	// set up the HTML renderer
333	renderer := HtmlRenderer(commonHtmlFlags, "", "")
334	return MarkdownOptions(input, renderer, Options{
335		Extensions: commonExtensions})
336}
337
338// Markdown is the main rendering function.
339// It parses and renders a block of markdown-encoded text.
340// The supplied Renderer is used to format the output, and extensions dictates
341// which non-standard extensions are enabled.
342//
343// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
344// LatexRenderer, respectively.
345func Markdown(input []byte, renderer Renderer, extensions Extensions) []byte {
346	return MarkdownOptions(input, renderer, Options{
347		Extensions: extensions})
348}
349
350// MarkdownOptions is just like Markdown but takes additional options through
351// the Options struct.
352func MarkdownOptions(input []byte, renderer Renderer, opts Options) []byte {
353	// no point in parsing if we can't render
354	if renderer == nil {
355		return nil
356	}
357
358	extensions := opts.Extensions
359
360	// fill in the render structure
361	p := new(parser)
362	p.r = renderer
363	p.flags = extensions
364	p.refOverride = opts.ReferenceOverride
365	p.refs = make(map[string]*reference)
366	p.maxNesting = 16
367	p.insideLink = false
368
369	// register inline parsers
370	p.inlineCallback['*'] = emphasis
371	p.inlineCallback['_'] = emphasis
372	if extensions&Strikethrough != 0 {
373		p.inlineCallback['~'] = emphasis
374	}
375	p.inlineCallback['`'] = codeSpan
376	p.inlineCallback['\n'] = lineBreak
377	p.inlineCallback['['] = link
378	p.inlineCallback['<'] = leftAngle
379	p.inlineCallback['\\'] = escape
380	p.inlineCallback['&'] = entity
381	p.inlineCallback['!'] = maybeImage
382	p.inlineCallback['^'] = maybeInlineFootnote
383
384	if extensions&Autolink != 0 {
385		p.inlineCallback['h'] = maybeAutoLink
386		p.inlineCallback['m'] = maybeAutoLink
387		p.inlineCallback['f'] = maybeAutoLink
388		p.inlineCallback['H'] = maybeAutoLink
389		p.inlineCallback['M'] = maybeAutoLink
390		p.inlineCallback['F'] = maybeAutoLink
391	}
392
393	if extensions&Footnotes != 0 {
394		p.notes = make([]*reference, 0)
395	}
396
397	first := firstPass(p, input)
398	second := secondPass(p, first)
399	return second
400}
401
402// first pass:
403// - extract references
404// - expand tabs
405// - normalize newlines
406// - copy everything else
407func firstPass(p *parser, input []byte) []byte {
408	var out bytes.Buffer
409	tabSize := TabSizeDefault
410	if p.flags&TabSizeEight != 0 {
411		tabSize = TabSizeDouble
412	}
413	beg, end := 0, 0
414	lastFencedCodeBlockEnd := 0
415	for beg < len(input) { // iterate over lines
416		if end = isReference(p, input[beg:], tabSize); end > 0 {
417			beg += end
418		} else { // skip to the next line
419			end = beg
420			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
421				end++
422			}
423
424			if p.flags&FencedCode != 0 {
425				// track fenced code block boundaries to suppress tab expansion
426				// inside them:
427				if beg >= lastFencedCodeBlockEnd {
428					if i := p.fencedCode(input[beg:], false); i > 0 {
429						lastFencedCodeBlockEnd = beg + i
430					}
431				}
432			}
433
434			// add the line body if present
435			if end > beg {
436				if end < lastFencedCodeBlockEnd { // Do not expand tabs while inside fenced code blocks.
437					out.Write(input[beg:end])
438				} else {
439					expandTabs(&out, input[beg:end], tabSize)
440				}
441			}
442			out.WriteByte('\n')
443
444			if end < len(input) && input[end] == '\r' {
445				end++
446			}
447			if end < len(input) && input[end] == '\n' {
448				end++
449			}
450
451			beg = end
452		}
453	}
454
455	// empty input?
456	if out.Len() == 0 {
457		out.WriteByte('\n')
458	}
459
460	return out.Bytes()
461}
462
463// second pass: actual rendering
464func secondPass(p *parser, input []byte) []byte {
465	p.r.DocumentHeader()
466	p.block(input)
467
468	if p.flags&Footnotes != 0 && len(p.notes) > 0 {
469		p.r.BeginFootnotes()
470		flags := ListItemBeginningOfList
471		for i := 0; i < len(p.notes); i += 1 {
472			ref := p.notes[i]
473			var buf bytes.Buffer
474			if ref.hasBlock {
475				flags |= ListItemContainsBlock
476				buf.Write(p.r.CaptureWrites(func() {
477					p.block(ref.title)
478				}))
479			} else {
480				buf.Write(p.r.CaptureWrites(func() {
481					p.inline(ref.title)
482				}))
483			}
484			p.r.FootnoteItem(ref.link, buf.Bytes(), flags)
485			flags &^= ListItemBeginningOfList | ListItemContainsBlock
486		}
487		p.r.EndFootnotes()
488	}
489
490	p.r.DocumentFooter()
491
492	if p.nesting != 0 {
493		panic("Nesting level did not end at zero")
494	}
495
496	return p.r.GetResult()
497}
498
499//
500// Link references
501//
502// This section implements support for references that (usually) appear
503// as footnotes in a document, and can be referenced anywhere in the document.
504// The basic format is:
505//
506//    [1]: http://www.google.com/ "Google"
507//    [2]: http://www.github.com/ "Github"
508//
509// Anywhere in the document, the reference can be linked by referring to its
510// label, i.e., 1 and 2 in this example, as in:
511//
512//    This library is hosted on [Github][2], a git hosting site.
513//
514// Actual footnotes as specified in Pandoc and supported by some other Markdown
515// libraries such as php-markdown are also taken care of. They look like this:
516//
517//    This sentence needs a bit of further explanation.[^note]
518//
519//    [^note]: This is the explanation.
520//
521// Footnotes should be placed at the end of the document in an ordered list.
522// Inline footnotes such as:
523//
524//    Inline footnotes^[Not supported.] also exist.
525//
526// are not yet supported.
527
// reference is the parsed form of a link reference or footnote definition.
type reference struct {
	link     []byte // link target; for footnotes, the footnote id
	title    []byte // link title; for footnotes, the footnote text
	noteId   int    // 0 if not a footnote ref
	hasBlock bool   // set for footnotes whose text spans indented lines
	text     []byte // replacement text supplied by a reference override
}

// String formats the reference for debugging output, quoting the byte-slice
// fields.
func (r *reference) String() string {
	const layout = "{link: %q, title: %q, text: %q, noteId: %d, hasBlock: %v}"
	return fmt.Sprintf(layout, r.link, r.title, r.text, r.noteId, r.hasBlock)
}
541
// Check whether or not data starts with a reference link.
// If so, it is parsed and stored in the list of references
// (in the render struct).
// Returns the number of bytes to skip to move past it,
// or zero if the first line is not a reference.
//
// When the Footnotes extension is enabled, an id starting with '^' marks a
// footnote definition; its body is gathered by scanFootnote (which is where
// tabSize is used). Everything else is parsed by scanLinkRef.
func isReference(p *parser, data []byte, tabSize int) int {
	// up to 3 optional leading spaces
	// (the length check also keeps the data[i] accesses below in bounds)
	if len(data) < 4 {
		return 0
	}
	i := 0
	for i < 3 && data[i] == ' ' {
		i++
	}

	noteId := 0

	// id part: anything but a newline between brackets
	if data[i] != '[' {
		return 0
	}
	i++
	if p.flags&Footnotes != 0 {
		if i < len(data) && data[i] == '^' {
			// we can set it to anything here because the proper noteIds will
			// be assigned later during the second pass. It just has to be != 0
			noteId = 1
			i++
		}
	}
	// the id excludes the '^' footnote marker
	idOffset := i
	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
		i++
	}
	if i >= len(data) || data[i] != ']' {
		return 0
	}
	idEnd := i

	// spacer: colon (space | tab)* newline? (space | tab)*
	i++
	if i >= len(data) || data[i] != ':' {
		return 0
	}
	i++
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	// at most one line break (LF, CR or CRLF) may follow the colon
	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
		i++
		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
			i++
		}
	}
	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
		i++
	}
	// nothing left after the spacer: not a reference
	if i >= len(data) {
		return 0
	}

	var (
		linkOffset, linkEnd   int
		titleOffset, titleEnd int
		lineEnd               int
		raw                   []byte
		hasBlock              bool
	)

	if p.flags&Footnotes != 0 && noteId != 0 {
		linkOffset, linkEnd, raw, hasBlock = scanFootnote(p, data, i, tabSize)
		// for a footnote the end of the gathered block is the skip length
		lineEnd = linkEnd
	} else {
		linkOffset, linkEnd, titleOffset, titleEnd, lineEnd = scanLinkRef(p, data, i)
	}
	// both scanners report failure by leaving the end offset at zero
	if lineEnd == 0 {
		return 0
	}

	// a valid ref has been found

	ref := &reference{
		noteId:   noteId,
		hasBlock: hasBlock,
	}

	if noteId > 0 {
		// reusing the link field for the id since footnotes don't have links
		ref.link = data[idOffset:idEnd]
		// if footnote, it's not really a title, it's the contained text
		ref.title = raw
	} else {
		ref.link = data[linkOffset:linkEnd]
		ref.title = data[titleOffset:titleEnd]
	}

	// id matches are case-insensitive
	id := string(bytes.ToLower(data[idOffset:idEnd]))

	// a later definition with the same id replaces an earlier one
	p.refs[id] = ref

	return lineEnd
}
645
646func scanLinkRef(p *parser, data []byte, i int) (linkOffset, linkEnd, titleOffset, titleEnd, lineEnd int) {
647	// link: whitespace-free sequence, optionally between angle brackets
648	if data[i] == '<' {
649		i++
650	}
651	linkOffset = i
652	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
653		i++
654	}
655	if i == len(data) {
656		return
657	}
658	linkEnd = i
659	if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
660		linkOffset++
661		linkEnd--
662	}
663
664	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
665	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
666		i++
667	}
668	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
669		return
670	}
671
672	// compute end-of-line
673	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
674		lineEnd = i
675	}
676	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
677		lineEnd++
678	}
679
680	// optional (space|tab)* spacer after a newline
681	if lineEnd > 0 {
682		i = lineEnd + 1
683		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
684			i++
685		}
686	}
687
688	// optional title: any non-newline sequence enclosed in '"() alone on its line
689	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
690		i++
691		titleOffset = i
692
693		// look for EOL
694		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
695			i++
696		}
697		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
698			titleEnd = i + 1
699		} else {
700			titleEnd = i
701		}
702
703		// step back
704		i--
705		for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
706			i--
707		}
708		if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
709			lineEnd = titleEnd
710			titleEnd = i
711		}
712	}
713
714	return
715}
716
// The first bit of this logic is the same as (*parser).listItem, but the rest
// is much simpler. This function simply finds the entire block and shifts it
// over by one tab if it is indeed a block (just returns the line if it's not).
// blockEnd is the end of the section in the input buffer, and contents is the
// extracted text that was shifted over one tab. It will need to be rendered at
// the end of the document.
//
// i is the offset in data at which the footnote text starts and indentSize is
// the number of spaces that count as one indent level. blockStart is the
// offset of the first non-space byte of the text. hasBlock is set when at
// least one indented continuation line was gathered. All results are zero
// when i is 0 or data is empty.
func scanFootnote(p *parser, data []byte, i, indentSize int) (blockStart, blockEnd int, contents []byte, hasBlock bool) {
	if i == 0 || len(data) == 0 {
		return
	}

	// skip leading whitespace on first line
	for i < len(data) && data[i] == ' ' {
		i++
	}

	blockStart = i

	// find the end of the line
	// (i stops just past the newline, or at the end of the input)
	blockEnd = i
	for i < len(data) && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[blockEnd:i])
	blockEnd = i

	// process the following lines
	containsBlankLine := false

gatherLines:
	for blockEnd < len(data) {
		// step past the first byte of the line so that the data[i-1] test
		// below does not stop on the previous line's newline
		i++

		// find the end of this line
		for i < len(data) && data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[blockEnd:i]) > 0 {
			containsBlankLine = true
			blockEnd = i
			continue
		}

		n := 0
		if n = isIndented(data[blockEnd:i], indentSize); n == 0 {
			// this is the end of the block.
			// we don't want to include this last line in the index.
			break gatherLines
		}

		// if there were blank lines before this one, insert a new one now
		// (a run of blank lines collapses into a single one)
		if containsBlankLine {
			raw.WriteByte('\n')
			containsBlankLine = false
		}

		// get rid of that first tab, write to buffer
		raw.Write(data[blockEnd+n : i])
		hasBlock = true

		blockEnd = i
	}

	// make sure the gathered text ends with a newline when the input did not
	if data[blockEnd-1] != '\n' {
		raw.WriteByte('\n')
	}

	contents = raw.Bytes()

	return
}
796
797//
798//
799// Miscellaneous helper functions
800//
801//
802
// ispunct reports whether c is an ASCII punctuation symbol.
// The set of symbols is taken from a private function in regexp in the
// stdlib.
func ispunct(c byte) bool {
	return strings.IndexByte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", c) >= 0
}
813
// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
818
// isletter reports whether c is an ASCII letter (a-z or A-Z).
func isletter(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
823
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
829
// expandTabs writes line to out with every tab replaced by spaces up to the
// next multiple of tabSize columns. Columns are counted in runes, not bytes.
// Nothing but the expanded line is written: no newline is appended.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// count the run of tabs at the very start of the line
	leading := 0
	for leading < len(line) && line[leading] == '\t' {
		leading++
	}

	// Common case: no tab appears after the leading run, so there is no need
	// to decode runes; each leading tab is exactly tabSize spaces.
	if bytes.IndexByte(line[leading:], '\t') < 0 {
		for pad := leading * tabSize; pad > 0; pad-- {
			out.WriteByte(' ')
		}
		out.Write(line[leading:])
		return
	}

	// Slow case: track the rune column to know how far each tab reaches.
	col := 0
	for pos := 0; pos < len(line); pos++ {
		// copy the stretch of text up to the next tab
		next := pos
		for next < len(line) && line[next] != '\t' {
			_, width := utf8.DecodeRune(line[next:])
			next += width
			col++
		}
		out.Write(line[pos:next])
		pos = next
		if pos >= len(line) {
			break
		}

		// a tab always yields at least one space
		out.WriteByte(' ')
		col++
		for col%tabSize != 0 {
			out.WriteByte(' ')
			col++
		}
	}
}
887
// isIndented reports whether a line counts as indented.
// It returns the number of bytes making up the indent: 1 for a leading tab,
// indentSize for a run of indentSize leading spaces, and 0 when the line is
// not indented.
func isIndented(data []byte, indentSize int) int {
	switch {
	case len(data) == 0:
		return 0
	case data[0] == '\t':
		return 1
	case len(data) < indentSize:
		return 0
	}

	spaces := 0
	for spaces < indentSize && data[spaces] == ' ' {
		spaces++
	}
	if spaces < indentSize {
		return 0
	}
	return indentSize
}
907
908// Create a url-safe slug for fragments
909func slugify(in []byte) []byte {
910	if len(in) == 0 {
911		return in
912	}
913	out := make([]byte, 0, len(in))
914	sym := false
915
916	for _, ch := range in {
917		if isalnum(ch) {
918			sym = false
919			out = append(out, ch)
920		} else if sym {
921			continue
922		} else {
923			out = append(out, '-')
924			sym = true
925		}
926	}
927	var a, b int
928	var ch byte
929	for a, ch = range out {
930		if ch != '-' {
931			break
932		}
933	}
934	for b = len(out) - 1; b > 0; b-- {
935		if out[b] != '-' {
936			break
937		}
938	}
939	return out[a : b+1]
940}