all repos — grayfriday @ eff64c563f75c50f8af0ec650e719846efff659b

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Black Friday Markdown Processor
  3// Originally based on http://github.com/tanoku/upskirt
  4// by Russ Ross <russ@russross.com>
  5//
  6
  7//
  8//
  9// Markdown parsing and processing
 10//
 11//
 12
 13package blackfriday
 14
 15import (
 16	"bytes"
 17	"utf8"
 18)
 19
 20// These are the supported markdown parsing extensions.
 21// OR these values together to select multiple extensions.
 22const (
 23	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
 24	EXTENSION_TABLES
 25	EXTENSION_FENCED_CODE
 26	EXTENSION_AUTOLINK
 27	EXTENSION_STRIKETHROUGH
 28	EXTENSION_LAX_HTML_BLOCKS
 29	EXTENSION_SPACE_HEADERS
 30)
 31
 32// These are the possible flag values for the link renderer.
 33// Only a single one of these values will be used; they are not ORed together.
 34// These are mostly of interest if you are writing a new output format.
 35const (
 36	LINK_TYPE_NOT_AUTOLINK = iota
 37	LINK_TYPE_NORMAL
 38	LINK_TYPE_EMAIL
 39)
 40
 41// These are the possible flag values for the listitem renderer.
 42// Multiple flag values may be ORed together.
 43// These are mostly of interest if you are writing a new output format.
 44const (
 45	LIST_TYPE_ORDERED = 1 << iota
 46	LIST_ITEM_CONTAINS_BLOCK
 47	LIST_ITEM_END_OF_LIST
 48)
 49
 50// These are the possible flag values for the table cell renderer.
 51// Only a single one of these values will be used; they are not ORed together.
 52// These are mostly of interest if you are writing a new output format.
 53const (
 54	TABLE_ALIGNMENT_LEFT = 1 << iota
 55	TABLE_ALIGNMENT_RIGHT
 56	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
 57)
 58
 59// The size of a tab stop.
 60const TAB_SIZE = 4
 61
 62// These are the tags that are recognized as HTML block tags.
 63// Any of these can be included in markdown text without special escaping.
 64var block_tags = map[string]bool{
 65	"p":          true,
 66	"dl":         true,
 67	"h1":         true,
 68	"h2":         true,
 69	"h3":         true,
 70	"h4":         true,
 71	"h5":         true,
 72	"h6":         true,
 73	"ol":         true,
 74	"ul":         true,
 75	"del":        true,
 76	"div":        true,
 77	"ins":        true,
 78	"pre":        true,
 79	"form":       true,
 80	"math":       true,
 81	"table":      true,
 82	"iframe":     true,
 83	"script":     true,
 84	"fieldset":   true,
 85	"noscript":   true,
 86	"blockquote": true,
 87}
 88
 89// This struct defines the rendering interface.
 90// A series of callback functions are registered to form a complete renderer.
 91// A single interface{} value field is provided, and that value is handed to
 92// each callback. Leaving a field blank suppresses rendering that type of output
 93// except where noted.
 94//
 95// This is mostly of interest if you are implementing a new rendering format.
 96// Most users will use the convenience functions to fill in this structure.
 97type Renderer struct {
 98	// block-level callbacks---nil skips the block
 99	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
100	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
101	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
102	Header     func(out *bytes.Buffer, text func() bool, level int, opaque interface{})
103	HRule      func(out *bytes.Buffer, opaque interface{})
104	List       func(out *bytes.Buffer, text func() bool, flags int, opaque interface{})
105	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
106	Paragraph  func(out *bytes.Buffer, text []byte, opaque interface{})
107	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
108	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
109	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
110
111	// Span-level callbacks---nil or return 0 prints the span verbatim
112	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
113	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
114	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
115	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
116	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
117	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
118	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
119	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
120	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
121	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int
122
123	// Low-level callbacks---nil copies input directly into the output
124	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
125	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})
126
127	// Header and footer
128	DocumentHeader func(out *bytes.Buffer, opaque interface{})
129	DocumentFooter func(out *bytes.Buffer, opaque interface{})
130
131	// User data---passed back to every callback
132	Opaque interface{}
133}
134
135type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
136
137type render struct {
138	mk         *Renderer
139	refs       map[string]*reference
140	inline     [256]inlineParser
141	flags      uint32
142	nesting    int
143	maxNesting int
144	insideLink bool
145}
146
147
148//
149//
150// Public interface
151//
152//
153
154// Parse and render a block of markdown-encoded text.
155// The renderer is used to format the output, and extensions dictates which
156// non-standard extensions are enabled.
157func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
158	// no point in parsing if we can't render
159	if renderer == nil {
160		return nil
161	}
162
163	// fill in the render structure
164	rndr := new(render)
165	rndr.mk = renderer
166	rndr.flags = extensions
167	rndr.refs = make(map[string]*reference)
168	rndr.maxNesting = 16
169	rndr.insideLink = false
170
171	// register inline parsers
172	if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
173		rndr.inline['*'] = inlineEmphasis
174		rndr.inline['_'] = inlineEmphasis
175		if extensions&EXTENSION_STRIKETHROUGH != 0 {
176			rndr.inline['~'] = inlineEmphasis
177		}
178	}
179	if rndr.mk.CodeSpan != nil {
180		rndr.inline['`'] = inlineCodeSpan
181	}
182	if rndr.mk.LineBreak != nil {
183		rndr.inline['\n'] = inlineLineBreak
184	}
185	if rndr.mk.Image != nil || rndr.mk.Link != nil {
186		rndr.inline['['] = inlineLink
187	}
188	rndr.inline['<'] = inlineLAngle
189	rndr.inline['\\'] = inlineEscape
190	rndr.inline['&'] = inlineEntity
191
192	if extensions&EXTENSION_AUTOLINK != 0 {
193		rndr.inline[':'] = inlineAutoLink
194	}
195
196	// first pass: look for references, copy everything else
197	var text bytes.Buffer
198	beg, end := 0, 0
199	for beg < len(input) { // iterate over lines
200		if end = isReference(rndr, input[beg:]); end > 0 {
201			beg += end
202		} else { // skip to the next line
203			end = beg
204			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
205				end++
206			}
207
208			// add the line body if present
209			if end > beg {
210				expandTabs(&text, input[beg:end])
211			}
212
213			for end < len(input) && (input[end] == '\n' || input[end] == '\r') {
214				// add one \n per newline
215				if input[end] == '\n' || (end+1 < len(input) && input[end+1] != '\n') {
216					text.WriteByte('\n')
217				}
218				end++
219			}
220
221			beg = end
222		}
223	}
224
225	// second pass: actual rendering
226	var output bytes.Buffer
227	if rndr.mk.DocumentHeader != nil {
228		rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
229	}
230
231	if text.Len() > 0 {
232		// add a final newline if not already present
233		finalchar := text.Bytes()[text.Len()-1]
234		if finalchar != '\n' && finalchar != '\r' {
235			text.WriteByte('\n')
236		}
237		parseBlock(&output, rndr, text.Bytes())
238	}
239
240	if rndr.mk.DocumentFooter != nil {
241		rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
242	}
243
244	if rndr.nesting != 0 {
245		panic("Nesting level did not end at zero")
246	}
247
248	return output.Bytes()
249}
250
251
252//
253// Link references
254//
255// This section implements support for references that (usually) appear
256// as footnotes in a document, and can be referenced anywhere in the document.
257// The basic format is:
258//
259//    [1]: http://www.google.com/ "Google"
260//    [2]: http://www.github.com/ "Github"
261//
262// Anywhere in the document, the reference can be linked by referring to its
263// label, i.e., 1 and 2 in this example, as in:
264//
265//    This library is hosted on [Github][2], a git hosting site.
266
// reference holds one parsed link reference: the link target and its
// optional title. isReference stores these in render.refs.
type reference struct {
	link, title []byte
}
272
273// Check whether or not data starts with a reference link.
274// If so, it is parsed and stored in the list of references
275// (in the render struct).
276// Returns the number of bytes to skip to move past it, or zero
277// if there is the first line is not a reference.
278func isReference(rndr *render, data []byte) int {
279	// up to 3 optional leading spaces
280	if len(data) < 4 {
281		return 0
282	}
283	i := 0
284	for i < 3 && data[i] == ' ' {
285		i++
286	}
287	if data[i] == ' ' {
288		return 0
289	}
290
291	// id part: anything but a newline between brackets
292	if data[i] != '[' {
293		return 0
294	}
295	i++
296	id_offset := i
297	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
298		i++
299	}
300	if i >= len(data) || data[i] != ']' {
301		return 0
302	}
303	id_end := i
304
305	// spacer: colon (space | tab)* newline? (space | tab)*
306	i++
307	if i >= len(data) || data[i] != ':' {
308		return 0
309	}
310	i++
311	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
312		i++
313	}
314	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
315		i++
316		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
317			i++
318		}
319	}
320	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
321		i++
322	}
323	if i >= len(data) {
324		return 0
325	}
326
327	// link: whitespace-free sequence, optionally between angle brackets
328	if data[i] == '<' {
329		i++
330	}
331	link_offset := i
332	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
333		i++
334	}
335	link_end := i
336	if data[link_offset] == '<' && data[link_end-1] == '>' {
337		link_offset++
338		link_end--
339	}
340
341	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
342	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
343		i++
344	}
345	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
346		return 0
347	}
348
349	// compute end-of-line
350	line_end := 0
351	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
352		line_end = i
353	}
354	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
355		line_end++
356	}
357
358	// optional (space|tab)* spacer after a newline
359	if line_end > 0 {
360		i = line_end + 1
361		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
362			i++
363		}
364	}
365
366	// optional title: any non-newline sequence enclosed in '"() alone on its line
367	title_offset, title_end := 0, 0
368	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
369		i++
370		title_offset = i
371
372		// look for EOL
373		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
374			i++
375		}
376		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
377			title_end = i + 1
378		} else {
379			title_end = i
380		}
381
382		// step back
383		i--
384		for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
385			i--
386		}
387		if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
388			line_end = title_end
389			title_end = i
390		}
391	}
392	if line_end == 0 { // garbage after the link
393		return 0
394	}
395
396	// a valid ref has been found
397	if rndr == nil {
398		return line_end
399	}
400
401	// id matches are case-insensitive
402	id := string(bytes.ToLower(data[id_offset:id_end]))
403	rndr.refs[id] = &reference{
404		link:  data[link_offset:link_end],
405		title: data[title_offset:title_end],
406	}
407
408	return line_end
409}
410
411
412//
413//
414// Miscellaneous helper functions
415//
416//
417
418
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// Those characters form four contiguous ASCII ranges, tested below.
func ispunct(c byte) bool {
	switch {
	case '!' <= c && c <= '/', // !"#$%&'()*+,-./
		':' <= c && c <= '@', // :;<=>?@
		'[' <= c && c <= '`', // [\]^_`
		'{' <= c && c <= '~': // {|}~
		return true
	}
	return false
}
429
// isspace reports whether c is an ASCII whitespace character: space, tab,
// line feed, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
434
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
440
441// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
442func expandTabs(out *bytes.Buffer, line []byte) {
443	// first, check for common cases: no tabs, or only tabs at beginning of line
444	i, prefix := 0, 0
445	slowcase := false
446	for i = 0; i < len(line); i++ {
447		if line[i] == '\t' {
448			if prefix == i {
449				prefix++
450			} else {
451				slowcase = true
452				break
453			}
454		}
455	}
456
457	// no need to decode runes if all tabs are at the beginning of the line
458	if !slowcase {
459		for i = 0; i < prefix*TAB_SIZE; i++ {
460			out.WriteByte(' ')
461		}
462		out.Write(line[prefix:])
463		return
464	}
465
466	// the slow case: we need to count runes to figure out how
467	// many spaces to insert for each tab
468	column := 0
469	i = 0
470	for i < len(line) {
471		start := i
472		for i < len(line) && line[i] != '\t' {
473			_, size := utf8.DecodeRune(line[i:])
474			i += size
475			column++
476		}
477
478		if i > start {
479			out.Write(line[start:i])
480		}
481
482		if i >= len(line) {
483			break
484		}
485
486		for {
487			out.WriteByte(' ')
488			column++
489			if column%TAB_SIZE == 0 {
490				break
491			}
492		}
493
494		i++
495	}
496}