all repos — grayfriday @ 1e40ebaf472ae0587bc0c5efb72a807279df4e12

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Black Friday Markdown Processor
  3// Originally based on http://github.com/tanoku/upskirt
  4// by Russ Ross <russ@russross.com>
  5//
  6
  7//
  8//
  9// Markdown parsing and processing
 10//
 11//
 12
 13package blackfriday
 14
 15import (
 16	"bytes"
 17	"utf8"
 18)
 19
 20// These are the supported markdown parsing extensions.
 21// OR these values together to select multiple extensions.
 22const (
 23	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
 24	EXTENSION_TABLES
 25	EXTENSION_FENCED_CODE
 26	EXTENSION_AUTOLINK
 27	EXTENSION_STRIKETHROUGH
 28	EXTENSION_LAX_HTML_BLOCKS
 29	EXTENSION_SPACE_HEADERS
 30)
 31
 32// These are the possible flag values for the link renderer.
 33// Only a single one of these values will be used; they are not ORed together.
 34// These are mostly of interest if you are writing a new output format.
 35const (
 36	LINK_TYPE_NOT_AUTOLINK = iota
 37	LINK_TYPE_NORMAL
 38	LINK_TYPE_EMAIL
 39)
 40
 41// These are the possible flag values for the listitem renderer.
 42// Multiple flag values may be ORed together.
 43// These are mostly of interest if you are writing a new output format.
 44const (
 45	LIST_TYPE_ORDERED = 1 << iota
 46	LIST_ITEM_CONTAINS_BLOCK
 47	LIST_ITEM_END_OF_LIST
 48)
 49
 50// These are the possible flag values for the table cell renderer.
 51// Only a single one of these values will be used; they are not ORed together.
 52// These are mostly of interest if you are writing a new output format.
 53const (
 54	TABLE_ALIGNMENT_LEFT = 1 << iota
 55	TABLE_ALIGNMENT_RIGHT
 56	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
 57)
 58
 59// The size of a tab stop.
 60const TAB_SIZE = 4
 61
 62// These are the tags that are recognized as HTML block tags.
 63// Any of these can be included in markdown text without special escaping.
 64var block_tags = map[string]bool{
 65	"p":          true,
 66	"dl":         true,
 67	"h1":         true,
 68	"h2":         true,
 69	"h3":         true,
 70	"h4":         true,
 71	"h5":         true,
 72	"h6":         true,
 73	"ol":         true,
 74	"ul":         true,
 75	"del":        true,
 76	"div":        true,
 77	"ins":        true,
 78	"pre":        true,
 79	"form":       true,
 80	"math":       true,
 81	"table":      true,
 82	"iframe":     true,
 83	"script":     true,
 84	"fieldset":   true,
 85	"noscript":   true,
 86	"blockquote": true,
 87}
 88
 89// This struct defines the rendering interface.
 90// A series of callback functions are registered to form a complete renderer.
 91// A single interface{} value field is provided, and that value is handed to
 92// each callback. Leaving a field blank suppresses rendering that type of output
 93// except where noted.
 94//
 95// This is mostly of interest if you are implementing a new rendering format.
 96// Most users will use the convenience functions to fill in this structure.
 97type Renderer struct {
 98	// block-level callbacks---nil skips the block
 99	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
100	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
101	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
102	Header     func(out *bytes.Buffer, text []byte, level int, opaque interface{})
103	HRule      func(out *bytes.Buffer, opaque interface{})
104	List       func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
105	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
106	Paragraph  func(out *bytes.Buffer, text []byte, opaque interface{})
107	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
108	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
109	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
110
111	// Span-level callbacks---nil or return 0 prints the span verbatim
112	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
113	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
114	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
115	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
116	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
117	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
118	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
119	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
120	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
121	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int
122
123	// Low-level callbacks---nil copies input directly into the output
124	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
125	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})
126
127	// Header and footer
128	DocumentHeader func(out *bytes.Buffer, opaque interface{})
129	DocumentFooter func(out *bytes.Buffer, opaque interface{})
130
131	// User data---passed back to every callback
132	Opaque interface{}
133}
134
135type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
136
137type render struct {
138	mk         *Renderer
139	refs       map[string]*reference
140	inline     [256]inlineParser
141	flags      uint32
142	nesting    int
143	maxNesting int
144}
145
146
147//
148//
149// Public interface
150//
151//
152
153// Parse and render a block of markdown-encoded text.
154// The renderer is used to format the output, and extensions dictates which
155// non-standard extensions are enabled.
156func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
157	// no point in parsing if we can't render
158	if renderer == nil {
159		return nil
160	}
161
162	// fill in the render structure
163	rndr := new(render)
164	rndr.mk = renderer
165	rndr.flags = extensions
166	rndr.refs = make(map[string]*reference)
167	rndr.maxNesting = 16
168
169	// register inline parsers
170	if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
171		rndr.inline['*'] = inlineEmphasis
172		rndr.inline['_'] = inlineEmphasis
173		if extensions&EXTENSION_STRIKETHROUGH != 0 {
174			rndr.inline['~'] = inlineEmphasis
175		}
176	}
177	if rndr.mk.CodeSpan != nil {
178		rndr.inline['`'] = inlineCodeSpan
179	}
180	if rndr.mk.LineBreak != nil {
181		rndr.inline['\n'] = inlineLineBreak
182	}
183	if rndr.mk.Image != nil || rndr.mk.Link != nil {
184		rndr.inline['['] = inlineLink
185	}
186	rndr.inline['<'] = inlineLAngle
187	rndr.inline['\\'] = inlineEscape
188	rndr.inline['&'] = inlineEntity
189
190	if extensions&EXTENSION_AUTOLINK != 0 {
191		rndr.inline[':'] = inlineAutoLink
192	}
193
194	// first pass: look for references, copy everything else
195	var text bytes.Buffer
196	beg, end := 0, 0
197	for beg < len(input) { // iterate over lines
198		if end = isReference(rndr, input[beg:]); end > 0 {
199			beg += end
200		} else { // skip to the next line
201			end = beg
202			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
203				end++
204			}
205
206			// add the line body if present
207			if end > beg {
208				expandTabs(&text, input[beg:end])
209			}
210
211			for end < len(input) && (input[end] == '\n' || input[end] == '\r') {
212				// add one \n per newline
213				if input[end] == '\n' || (end+1 < len(input) && input[end+1] != '\n') {
214					text.WriteByte('\n')
215				}
216				end++
217			}
218
219			beg = end
220		}
221	}
222
223	// second pass: actual rendering
224	var output bytes.Buffer
225	if rndr.mk.DocumentHeader != nil {
226		rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
227	}
228
229	if text.Len() > 0 {
230		// add a final newline if not already present
231		finalchar := text.Bytes()[text.Len()-1]
232		if finalchar != '\n' && finalchar != '\r' {
233			text.WriteByte('\n')
234		}
235		parseBlock(&output, rndr, text.Bytes())
236	}
237
238	if rndr.mk.DocumentFooter != nil {
239		rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
240	}
241
242	if rndr.nesting != 0 {
243		panic("Nesting level did not end at zero")
244	}
245
246	return output.Bytes()
247}
248
249
250//
251// Link references
252//
253// This section implements support for references that (usually) appear
254// as footnotes in a document, and can be referenced anywhere in the document.
255// The basic format is:
256//
257//    [1]: http://www.google.com/ "Google"
258//    [2]: http://www.github.com/ "Github"
259//
260// Anywhere in the document, the reference can be linked by referring to its
261// label, i.e., 1 and 2 in this example, as in:
262//
263//    This library is hosted on [Github][2], a git hosting site.
264
// References are parsed and stored in this struct.
// Both fields are byte slices; isReference fills them with sub-slices of the
// data it was given rather than copies.
type reference struct {
	link  []byte // the link target
	title []byte // the optional title; empty when none was given
}
270
271// Check whether or not data starts with a reference link.
272// If so, it is parsed and stored in the list of references
273// (in the render struct).
274// Returns the number of bytes to skip to move past it, or zero
275// if there is the first line is not a reference.
276func isReference(rndr *render, data []byte) int {
277	// up to 3 optional leading spaces
278	if len(data) < 4 {
279		return 0
280	}
281	i := 0
282	for i < 3 && data[i] == ' ' {
283		i++
284	}
285	if data[i] == ' ' {
286		return 0
287	}
288
289	// id part: anything but a newline between brackets
290	if data[i] != '[' {
291		return 0
292	}
293	i++
294	id_offset := i
295	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
296		i++
297	}
298	if i >= len(data) || data[i] != ']' {
299		return 0
300	}
301	id_end := i
302
303	// spacer: colon (space | tab)* newline? (space | tab)*
304	i++
305	if i >= len(data) || data[i] != ':' {
306		return 0
307	}
308	i++
309	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
310		i++
311	}
312	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
313		i++
314		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
315			i++
316		}
317	}
318	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
319		i++
320	}
321	if i >= len(data) {
322		return 0
323	}
324
325	// link: whitespace-free sequence, optionally between angle brackets
326	if data[i] == '<' {
327		i++
328	}
329	link_offset := i
330	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
331		i++
332	}
333	link_end := i
334	if data[link_offset] == '<' && data[link_end-1] == '>' {
335		link_offset++
336		link_end--
337	}
338
339	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
340	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
341		i++
342	}
343	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
344		return 0
345	}
346
347	// compute end-of-line
348	line_end := 0
349	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
350		line_end = i
351	}
352	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
353		line_end++
354	}
355
356	// optional (space|tab)* spacer after a newline
357	if line_end > 0 {
358		i = line_end + 1
359		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
360			i++
361		}
362	}
363
364	// optional title: any non-newline sequence enclosed in '"() alone on its line
365	title_offset, title_end := 0, 0
366	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
367		i++
368		title_offset = i
369
370		// look for EOL
371		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
372			i++
373		}
374		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
375			title_end = i + 1
376		} else {
377			title_end = i
378		}
379
380		// step back
381		i--
382		for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
383			i--
384		}
385		if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
386			line_end = title_end
387			title_end = i
388		}
389	}
390	if line_end == 0 { // garbage after the link
391		return 0
392	}
393
394	// a valid ref has been found
395	if rndr == nil {
396		return line_end
397	}
398
399	// id matches are case-insensitive
400	id := string(bytes.ToLower(data[id_offset:id_end]))
401	rndr.refs[id] = &reference{
402		link:  data[link_offset:link_end],
403		title: data[title_offset:title_end],
404	}
405
406	return line_end
407}
408
409
410//
411//
412// Miscellaneous helper functions
413//
414//
415
416
// Test if a character is an ASCII punctuation symbol, i.e. one of
//
//    !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
//
// Those characters form four contiguous ASCII ranges, so range checks
// replace a scan over the list.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/':
		return true
	case c >= ':' && c <= '@':
		return true
	case c >= '[' && c <= '`':
		return true
	case c >= '{' && c <= '~':
		return true
	}
	return false
}
427
// Test if a character is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
432
// Test if a character is an ASCII letter or digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case c >= '0' && c <= '9':
		return true
	case c >= 'a' && c <= 'z':
		return true
	case c >= 'A' && c <= 'Z':
		return true
	}
	return false
}
438
439// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
440func expandTabs(out *bytes.Buffer, line []byte) {
441	// first, check for common cases: no tabs, or only tabs at beginning of line
442	i, prefix := 0, 0
443	slowcase := false
444	for i = 0; i < len(line); i++ {
445		if line[i] == '\t' {
446			if prefix == i {
447				prefix++
448			} else {
449				slowcase = true
450				break
451			}
452		}
453	}
454
455	// no need to decode runes if all tabs are at the beginning of the line
456	if !slowcase {
457		for i = 0; i < prefix*TAB_SIZE; i++ {
458			out.WriteByte(' ')
459		}
460		out.Write(line[prefix:])
461		return
462	}
463
464	// the slow case: we need to count runes to figure out how
465	// many spaces to insert for each tab
466	column := 0
467    i = 0
468	for i < len(line) {
469		start := i
470		for i < len(line) && line[i] != '\t' {
471			_, size := utf8.DecodeRune(line[i:])
472			i += size
473			column++
474		}
475
476		if i > start {
477			out.Write(line[start:i])
478		}
479
480		if i >= len(line) {
481			break
482		}
483
484		for {
485			out.WriteByte(' ')
486			column++
487			if column%TAB_SIZE == 0 {
488				break
489			}
490		}
491
492		i++
493	}
494}