all repos — grayfriday @ 4bd8627b2ca34a46892d1f5bdf88daadf0609f06

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Licensed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11//
 12// Markdown parsing and processing
 13//
 14//
 15
 16package blackfriday
 17
 18import (
 19	"bytes"
 20	"utf8"
 21)
 22
 23// These are the supported markdown parsing extensions.
 24// OR these values together to select multiple extensions.
 25const (
 26	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
 27	EXTENSION_TABLES
 28	EXTENSION_FENCED_CODE
 29	EXTENSION_AUTOLINK
 30	EXTENSION_STRIKETHROUGH
 31	EXTENSION_LAX_HTML_BLOCKS
 32	EXTENSION_SPACE_HEADERS
 33	EXTENSION_HARD_LINE_BREAK
 34)
 35
 36// These are the possible flag values for the link renderer.
 37// Only a single one of these values will be used; they are not ORed together.
 38// These are mostly of interest if you are writing a new output format.
 39const (
 40	LINK_TYPE_NOT_AUTOLINK = iota
 41	LINK_TYPE_NORMAL
 42	LINK_TYPE_EMAIL
 43)
 44
 45// These are the possible flag values for the listitem renderer.
 46// Multiple flag values may be ORed together.
 47// These are mostly of interest if you are writing a new output format.
 48const (
 49	LIST_TYPE_ORDERED = 1 << iota
 50	LIST_ITEM_CONTAINS_BLOCK
 51	LIST_ITEM_END_OF_LIST
 52)
 53
 54// These are the possible flag values for the table cell renderer.
 55// Only a single one of these values will be used; they are not ORed together.
 56// These are mostly of interest if you are writing a new output format.
 57const (
 58	TABLE_ALIGNMENT_LEFT = 1 << iota
 59	TABLE_ALIGNMENT_RIGHT
 60	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
 61)
 62
 63// The size of a tab stop.
 64const TAB_SIZE = 4
 65
 66// These are the tags that are recognized as HTML block tags.
 67// Any of these can be included in markdown text without special escaping.
 68var block_tags = map[string]bool{
 69	"p":          true,
 70	"dl":         true,
 71	"h1":         true,
 72	"h2":         true,
 73	"h3":         true,
 74	"h4":         true,
 75	"h5":         true,
 76	"h6":         true,
 77	"ol":         true,
 78	"ul":         true,
 79	"del":        true,
 80	"div":        true,
 81	"ins":        true,
 82	"pre":        true,
 83	"form":       true,
 84	"math":       true,
 85	"table":      true,
 86	"iframe":     true,
 87	"script":     true,
 88	"fieldset":   true,
 89	"noscript":   true,
 90	"blockquote": true,
 91}
 92
 93// This struct defines the rendering interface.
 94// A series of callback functions are registered to form a complete renderer.
 95// A single interface{} value field is provided, and that value is handed to
 96// each callback. Leaving a field blank suppresses rendering that type of output
 97// except where noted.
 98//
 99// This is mostly of interest if you are implementing a new rendering format.
100// Most users will use the convenience functions to fill in this structure.
101type Renderer struct {
102	// block-level callbacks---nil skips the block
103	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
104	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
105	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
106	Header     func(out *bytes.Buffer, text func() bool, level int, opaque interface{})
107	HRule      func(out *bytes.Buffer, opaque interface{})
108	List       func(out *bytes.Buffer, text func() bool, flags int, opaque interface{})
109	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
110	Paragraph  func(out *bytes.Buffer, text func() bool, opaque interface{})
111	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
112	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
113	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
114
115	// Span-level callbacks---nil or return 0 prints the span verbatim
116	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
117	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
118	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
119	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
120	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
121	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
122	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
123	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
124	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
125	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int
126
127	// Low-level callbacks---nil copies input directly into the output
128	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
129	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})
130
131	// Header and footer
132	DocumentHeader func(out *bytes.Buffer, opaque interface{})
133	DocumentFooter func(out *bytes.Buffer, opaque interface{})
134
135	// User data---passed back to every callback
136	Opaque interface{}
137}
138
139type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
140
141type render struct {
142	mk         *Renderer
143	refs       map[string]*reference
144	inline     [256]inlineParser
145	flags      uint32
146	nesting    int
147	maxNesting int
148	insideLink bool
149}
150
151
152//
153//
154// Public interface
155//
156//
157
158// Parse and render a block of markdown-encoded text.
159// The renderer is used to format the output, and extensions dictates which
160// non-standard extensions are enabled.
161func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
162	// no point in parsing if we can't render
163	if renderer == nil {
164		return nil
165	}
166
167	// fill in the render structure
168	rndr := new(render)
169	rndr.mk = renderer
170	rndr.flags = extensions
171	rndr.refs = make(map[string]*reference)
172	rndr.maxNesting = 16
173	rndr.insideLink = false
174
175	// register inline parsers
176	if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
177		rndr.inline['*'] = inlineEmphasis
178		rndr.inline['_'] = inlineEmphasis
179		if extensions&EXTENSION_STRIKETHROUGH != 0 {
180			rndr.inline['~'] = inlineEmphasis
181		}
182	}
183	if rndr.mk.CodeSpan != nil {
184		rndr.inline['`'] = inlineCodeSpan
185	}
186	if rndr.mk.LineBreak != nil {
187		rndr.inline['\n'] = inlineLineBreak
188	}
189	if rndr.mk.Image != nil || rndr.mk.Link != nil {
190		rndr.inline['['] = inlineLink
191	}
192	rndr.inline['<'] = inlineLAngle
193	rndr.inline['\\'] = inlineEscape
194	rndr.inline['&'] = inlineEntity
195
196	if extensions&EXTENSION_AUTOLINK != 0 {
197		rndr.inline[':'] = inlineAutoLink
198	}
199
200	first := FirstPass(rndr, input)
201	second := SecondPass(rndr, first)
202
203	return second
204}
205
206// first pass:
207// - extract references
208// - expand tabs
209// - normalize newlines
210// - copy everything else
211func FirstPass(rndr *render, input []byte) []byte {
212	var out bytes.Buffer
213	beg, end := 0, 0
214	for beg < len(input) { // iterate over lines
215		if end = isReference(rndr, input[beg:]); end > 0 {
216			beg += end
217		} else { // skip to the next line
218			end = beg
219			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
220				end++
221			}
222
223			// add the line body if present
224			if end > beg {
225				expandTabs(&out, input[beg:end])
226			} else {
227				out.WriteByte('\n')
228			}
229
230			if end < len(input) && input[end] == '\r' {
231				end++
232			}
233			if end < len(input) && input[end] == '\n' {
234				end++
235			}
236
237			beg = end
238		}
239	}
240	return out.Bytes()
241}
242
243// second pass: actual rendering
244func SecondPass(rndr *render, input []byte) []byte {
245	var output bytes.Buffer
246	if rndr.mk.DocumentHeader != nil {
247		rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
248	}
249
250	parseBlock(&output, rndr, input)
251
252	if rndr.mk.DocumentFooter != nil {
253		rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
254	}
255
256	if rndr.nesting != 0 {
257		panic("Nesting level did not end at zero")
258	}
259
260	return output.Bytes()
261}
262
263
264//
265// Link references
266//
267// This section implements support for references that (usually) appear
268// as footnotes in a document, and can be referenced anywhere in the document.
269// The basic format is:
270//
271//    [1]: http://www.google.com/ "Google"
272//    [2]: http://www.github.com/ "Github"
273//
274// Anywhere in the document, the reference can be linked by referring to its
275// label, i.e., 1 and 2 in this example, as in:
276//
277//    This library is hosted on [Github][2], a git hosting site.
278
// reference is one parsed link reference definition.
// Both fields are sub-slices of the buffer handed to isReference,
// not copies of it.
type reference struct {
	link, title []byte
}
284
285// Check whether or not data starts with a reference link.
286// If so, it is parsed and stored in the list of references
287// (in the render struct).
288// Returns the number of bytes to skip to move past it,
289// or zero if the first line is not a reference.
290func isReference(rndr *render, data []byte) int {
291	// up to 3 optional leading spaces
292	if len(data) < 4 {
293		return 0
294	}
295	i := 0
296	for i < 3 && data[i] == ' ' {
297		i++
298	}
299
300	// id part: anything but a newline between brackets
301	if data[i] != '[' {
302		return 0
303	}
304	i++
305	id_offset := i
306	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
307		i++
308	}
309	if i >= len(data) || data[i] != ']' {
310		return 0
311	}
312	id_end := i
313
314	// spacer: colon (space | tab)* newline? (space | tab)*
315	i++
316	if i >= len(data) || data[i] != ':' {
317		return 0
318	}
319	i++
320	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
321		i++
322	}
323	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
324		i++
325		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
326			i++
327		}
328	}
329	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
330		i++
331	}
332	if i >= len(data) {
333		return 0
334	}
335
336	// link: whitespace-free sequence, optionally between angle brackets
337	if data[i] == '<' {
338		i++
339	}
340	link_offset := i
341	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
342		i++
343	}
344	link_end := i
345	if data[link_offset] == '<' && data[link_end-1] == '>' {
346		link_offset++
347		link_end--
348	}
349
350	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
351	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
352		i++
353	}
354	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
355		return 0
356	}
357
358	// compute end-of-line
359	line_end := 0
360	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
361		line_end = i
362	}
363	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
364		line_end++
365	}
366
367	// optional (space|tab)* spacer after a newline
368	if line_end > 0 {
369		i = line_end + 1
370		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
371			i++
372		}
373	}
374
375	// optional title: any non-newline sequence enclosed in '"() alone on its line
376	title_offset, title_end := 0, 0
377	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
378		i++
379		title_offset = i
380
381		// look for EOL
382		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
383			i++
384		}
385		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
386			title_end = i + 1
387		} else {
388			title_end = i
389		}
390
391		// step back
392		i--
393		for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
394			i--
395		}
396		if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
397			line_end = title_end
398			title_end = i
399		}
400	}
401	if line_end == 0 { // garbage after the link
402		return 0
403	}
404
405	// a valid ref has been found
406	if rndr == nil {
407		return line_end
408	}
409
410	// id matches are case-insensitive
411	id := string(bytes.ToLower(data[id_offset:id_end]))
412	rndr.refs[id] = &reference{
413		link:  data[link_offset:link_end],
414		title: data[title_offset:title_end],
415	}
416
417	return line_end
418}
419
420
421//
422//
423// Miscellaneous helper functions
424//
425//
426
427
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// Those 32 characters form four contiguous ranges in ASCII.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/':
		return true
	case c >= ':' && c <= '@':
		return true
	case c >= '[' && c <= '`':
		return true
	case c >= '{' && c <= '~':
		return true
	}
	return false
}
438
// isspace reports whether c is a whitespace character: space, tab, newline,
// carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
443
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
449
450// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
451// always ends output with a newline
452func expandTabs(out *bytes.Buffer, line []byte) {
453	// first, check for common cases: no tabs, or only tabs at beginning of line
454	i, prefix := 0, 0
455	slowcase := false
456	for i = 0; i < len(line); i++ {
457		if line[i] == '\t' {
458			if prefix == i {
459				prefix++
460			} else {
461				slowcase = true
462				break
463			}
464		}
465	}
466
467	// no need to decode runes if all tabs are at the beginning of the line
468	if !slowcase {
469		for i = 0; i < prefix*TAB_SIZE; i++ {
470			out.WriteByte(' ')
471		}
472		out.Write(line[prefix:])
473		out.WriteByte('\n')
474		return
475	}
476
477	// the slow case: we need to count runes to figure out how
478	// many spaces to insert for each tab
479	column := 0
480	i = 0
481	for i < len(line) {
482		start := i
483		for i < len(line) && line[i] != '\t' {
484			_, size := utf8.DecodeRune(line[i:])
485			i += size
486			column++
487		}
488
489		if i > start {
490			out.Write(line[start:i])
491		}
492
493		if i >= len(line) {
494			break
495		}
496
497		for {
498			out.WriteByte(' ')
499			column++
500			if column%TAB_SIZE == 0 {
501				break
502			}
503		}
504
505		i++
506	}
507	out.WriteByte('\n')
508}