all repos — grayfriday @ fffbd3ed1a3469e71a3a8c9b9d7d77808cdeaeb8

blackfriday fork with a few changes

markdown.go (view raw)

  1//
  2// Black Friday Markdown Processor
  3// Originally based on http://github.com/tanoku/upskirt
  4// by Russ Ross <russ@russross.com>
  5//
  6
  7//
  8//
  9// Markdown parsing and processing
 10//
 11//
 12
 13package blackfriday
 14
 15import (
 16	"bytes"
 17	"utf8"
 18)
 19
 20// These are the supported markdown parsing extensions.
 21// OR these values together to select multiple extensions.
 22const (
 23	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
 24	EXTENSION_TABLES
 25	EXTENSION_FENCED_CODE
 26	EXTENSION_AUTOLINK
 27	EXTENSION_STRIKETHROUGH
 28	EXTENSION_LAX_HTML_BLOCKS
 29	EXTENSION_SPACE_HEADERS
 30	EXTENSION_HARD_LINE_BREAK
 31)
 32
 33// These are the possible flag values for the link renderer.
 34// Only a single one of these values will be used; they are not ORed together.
 35// These are mostly of interest if you are writing a new output format.
 36const (
 37	LINK_TYPE_NOT_AUTOLINK = iota
 38	LINK_TYPE_NORMAL
 39	LINK_TYPE_EMAIL
 40)
 41
 42// These are the possible flag values for the listitem renderer.
 43// Multiple flag values may be ORed together.
 44// These are mostly of interest if you are writing a new output format.
 45const (
 46	LIST_TYPE_ORDERED = 1 << iota
 47	LIST_ITEM_CONTAINS_BLOCK
 48	LIST_ITEM_END_OF_LIST
 49)
 50
 51// These are the possible flag values for the table cell renderer.
 52// Only a single one of these values will be used; they are not ORed together.
 53// These are mostly of interest if you are writing a new output format.
 54const (
 55	TABLE_ALIGNMENT_LEFT = 1 << iota
 56	TABLE_ALIGNMENT_RIGHT
 57	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
 58)
 59
// TAB_SIZE is the distance, in columns, between tab stops.
// expandTabs replaces every tab with enough spaces to reach the next column
// that is a multiple of this value.
const TAB_SIZE = 4
 62
 63// These are the tags that are recognized as HTML block tags.
 64// Any of these can be included in markdown text without special escaping.
 65var block_tags = map[string]bool{
 66	"p":          true,
 67	"dl":         true,
 68	"h1":         true,
 69	"h2":         true,
 70	"h3":         true,
 71	"h4":         true,
 72	"h5":         true,
 73	"h6":         true,
 74	"ol":         true,
 75	"ul":         true,
 76	"del":        true,
 77	"div":        true,
 78	"ins":        true,
 79	"pre":        true,
 80	"form":       true,
 81	"math":       true,
 82	"table":      true,
 83	"iframe":     true,
 84	"script":     true,
 85	"fieldset":   true,
 86	"noscript":   true,
 87	"blockquote": true,
 88}
 89
// Renderer defines the rendering interface.
// A series of callback functions are registered to form a complete renderer.
// Every callback writes into the bytes.Buffer it is given and receives the
// value of the Opaque field as its final argument. Leaving a callback field
// nil suppresses rendering that type of output except where noted.
//
// This is mostly of interest if you are implementing a new rendering format.
// Most users will use the convenience functions to fill in this structure.
type Renderer struct {
	// block-level callbacks---nil skips the block
	//
	// Header, List and Paragraph receive their contents as a func() bool
	// rather than as bytes.
	// NOTE(review): presumably calling it renders the contents into out and
	// reports whether anything was produced --- confirm against the block parser.
	// The flags arguments of ListItem and TableCell carry the LIST_* and
	// TABLE_ALIGNMENT_* values respectively.
	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
	Header     func(out *bytes.Buffer, text func() bool, level int, opaque interface{})
	HRule      func(out *bytes.Buffer, opaque interface{})
	List       func(out *bytes.Buffer, text func() bool, flags int, opaque interface{})
	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
	Paragraph  func(out *bytes.Buffer, text func() bool, opaque interface{})
	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})

	// Span-level callbacks---nil or return 0 prints the span verbatim
	//
	// Markdown only installs an inline parser when a callback that could use
	// its result is set: '*' and '_' need one of Emphasis, DoubleEmphasis or
	// TripleEmphasis, '`' needs CodeSpan, '\n' needs LineBreak and '[' needs
	// Image or Link.
	// NOTE(review): the kind argument of AutoLink presumably carries a
	// LINK_TYPE_* value --- confirm against the autolink parser.
	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int

	// Low-level callbacks---nil copies input directly into the output
	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})

	// Header and footer
	//
	// SecondPass calls DocumentHeader before the blocks are parsed and
	// DocumentFooter afterwards; each is skipped when nil.
	DocumentHeader func(out *bytes.Buffer, opaque interface{})
	DocumentFooter func(out *bytes.Buffer, opaque interface{})

	// User data---passed back to every callback
	Opaque interface{}
}
135
// inlineParser is the signature shared by the span-level parsers that
// Markdown installs in render.inline, indexed by the byte that triggers them.
// NOTE(review): presumably data is the text being parsed, offset is the
// position of the trigger byte within it, and the result is the number of
// bytes consumed --- confirm against the inline parsing loop.
type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
137
// render holds the state of a single Markdown call.
type render struct {
	mk         *Renderer             // output callbacks supplied by the caller
	refs       map[string]*reference // link references found by isReference, keyed by lower-cased id
	inline     [256]inlineParser     // inline parsers indexed by trigger byte; nil means no parser
	flags      uint32                // the extensions value passed to Markdown
	nesting    int                   // must be back at zero when SecondPass finishes, or it panics
	maxNesting int                   // set to 16 by Markdown; presumably the nesting limit --- confirm in the parsers
	insideLink bool                  // starts false; presumably set while link content is parsed --- confirm in inlineLink
}
147
148
149//
150//
151// Public interface
152//
153//
154
155// Parse and render a block of markdown-encoded text.
156// The renderer is used to format the output, and extensions dictates which
157// non-standard extensions are enabled.
158func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
159	// no point in parsing if we can't render
160	if renderer == nil {
161		return nil
162	}
163
164	// fill in the render structure
165	rndr := new(render)
166	rndr.mk = renderer
167	rndr.flags = extensions
168	rndr.refs = make(map[string]*reference)
169	rndr.maxNesting = 16
170	rndr.insideLink = false
171
172	// register inline parsers
173	if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
174		rndr.inline['*'] = inlineEmphasis
175		rndr.inline['_'] = inlineEmphasis
176		if extensions&EXTENSION_STRIKETHROUGH != 0 {
177			rndr.inline['~'] = inlineEmphasis
178		}
179	}
180	if rndr.mk.CodeSpan != nil {
181		rndr.inline['`'] = inlineCodeSpan
182	}
183	if rndr.mk.LineBreak != nil {
184		rndr.inline['\n'] = inlineLineBreak
185	}
186	if rndr.mk.Image != nil || rndr.mk.Link != nil {
187		rndr.inline['['] = inlineLink
188	}
189	rndr.inline['<'] = inlineLAngle
190	rndr.inline['\\'] = inlineEscape
191	rndr.inline['&'] = inlineEntity
192
193	if extensions&EXTENSION_AUTOLINK != 0 {
194		rndr.inline[':'] = inlineAutoLink
195	}
196
197	first := FirstPass(rndr, input)
198	second := SecondPass(rndr, first)
199
200	return second
201}
202
203// first pass:
204// - extract references
205// - expand tabs
206// - normalize newlines
207// - copy everything else
208func FirstPass(rndr *render, input []byte) []byte {
209	var out bytes.Buffer
210	beg, end := 0, 0
211	for beg < len(input) { // iterate over lines
212		if end = isReference(rndr, input[beg:]); end > 0 {
213			beg += end
214		} else { // skip to the next line
215			end = beg
216			for end < len(input) && input[end] != '\n' && input[end] != '\r' {
217				end++
218			}
219
220			// add the line body if present
221			if end > beg {
222				expandTabs(&out, input[beg:end])
223			} else {
224				out.WriteByte('\n')
225			}
226
227			if end < len(input) && input[end] == '\r' {
228				end++
229			}
230			if end < len(input) && input[end] == '\n' {
231				end++
232			}
233
234			beg = end
235		}
236	}
237	return out.Bytes()
238}
239
240// second pass: actual rendering
241func SecondPass(rndr *render, input []byte) []byte {
242	var output bytes.Buffer
243	if rndr.mk.DocumentHeader != nil {
244		rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
245	}
246
247	parseBlock(&output, rndr, input)
248
249	if rndr.mk.DocumentFooter != nil {
250		rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
251	}
252
253	if rndr.nesting != 0 {
254		panic("Nesting level did not end at zero")
255	}
256
257	return output.Bytes()
258}
259
260
261//
262// Link references
263//
264// This section implements support for references that (usually) appear
265// as footnotes in a document, and can be referenced anywhere in the document.
266// The basic format is:
267//
268//    [1]: http://www.google.com/ "Google"
269//    [2]: http://www.github.com/ "Github"
270//
271// Anywhere in the document, the reference can be linked by referring to its
272// label, i.e., 1 and 2 in this example, as in:
273//
274//    This library is hosted on [Github][2], a git hosting site.
275
// reference is one parsed link reference: the link target and its optional
// title. isReference stores these in render.refs.
type reference struct {
	link, title []byte
}
281
282// Check whether or not data starts with a reference link.
283// If so, it is parsed and stored in the list of references
284// (in the render struct).
285// Returns the number of bytes to skip to move past it,
286// or zero if the first line is not a reference.
287func isReference(rndr *render, data []byte) int {
288	// up to 3 optional leading spaces
289	if len(data) < 4 {
290		return 0
291	}
292	i := 0
293	for i < 3 && data[i] == ' ' {
294		i++
295	}
296
297	// id part: anything but a newline between brackets
298	if data[i] != '[' {
299		return 0
300	}
301	i++
302	id_offset := i
303	for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
304		i++
305	}
306	if i >= len(data) || data[i] != ']' {
307		return 0
308	}
309	id_end := i
310
311	// spacer: colon (space | tab)* newline? (space | tab)*
312	i++
313	if i >= len(data) || data[i] != ':' {
314		return 0
315	}
316	i++
317	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
318		i++
319	}
320	if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
321		i++
322		if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
323			i++
324		}
325	}
326	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
327		i++
328	}
329	if i >= len(data) {
330		return 0
331	}
332
333	// link: whitespace-free sequence, optionally between angle brackets
334	if data[i] == '<' {
335		i++
336	}
337	link_offset := i
338	for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
339		i++
340	}
341	link_end := i
342	if data[link_offset] == '<' && data[link_end-1] == '>' {
343		link_offset++
344		link_end--
345	}
346
347	// optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
348	for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
349		i++
350	}
351	if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
352		return 0
353	}
354
355	// compute end-of-line
356	line_end := 0
357	if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
358		line_end = i
359	}
360	if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
361		line_end++
362	}
363
364	// optional (space|tab)* spacer after a newline
365	if line_end > 0 {
366		i = line_end + 1
367		for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
368			i++
369		}
370	}
371
372	// optional title: any non-newline sequence enclosed in '"() alone on its line
373	title_offset, title_end := 0, 0
374	if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
375		i++
376		title_offset = i
377
378		// look for EOL
379		for i < len(data) && data[i] != '\n' && data[i] != '\r' {
380			i++
381		}
382		if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
383			title_end = i + 1
384		} else {
385			title_end = i
386		}
387
388		// step back
389		i--
390		for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
391			i--
392		}
393		if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
394			line_end = title_end
395			title_end = i
396		}
397	}
398	if line_end == 0 { // garbage after the link
399		return 0
400	}
401
402	// a valid ref has been found
403	if rndr == nil {
404		return line_end
405	}
406
407	// id matches are case-insensitive
408	id := string(bytes.ToLower(data[id_offset:id_end]))
409	rndr.refs[id] = &reference{
410		link:  data[link_offset:link_end],
411		title: data[title_offset:title_end],
412	}
413
414	return line_end
415}
416
417
418//
419//
420// Miscellaneous helper functions
421//
422//
423
424
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
// These characters form four contiguous ranges in ASCII.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/':
		return true
	case c >= ':' && c <= '@':
		return true
	case c >= '[' && c <= '`':
		return true
	case c >= '{' && c <= '~':
		return true
	}
	return false
}
435
// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
440
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
446
447// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
448// always ends output with a newline
449func expandTabs(out *bytes.Buffer, line []byte) {
450	// first, check for common cases: no tabs, or only tabs at beginning of line
451	i, prefix := 0, 0
452	slowcase := false
453	for i = 0; i < len(line); i++ {
454		if line[i] == '\t' {
455			if prefix == i {
456				prefix++
457			} else {
458				slowcase = true
459				break
460			}
461		}
462	}
463
464	// no need to decode runes if all tabs are at the beginning of the line
465	if !slowcase {
466		for i = 0; i < prefix*TAB_SIZE; i++ {
467			out.WriteByte(' ')
468		}
469		out.Write(line[prefix:])
470		out.WriteByte('\n')
471		return
472	}
473
474	// the slow case: we need to count runes to figure out how
475	// many spaces to insert for each tab
476	column := 0
477	i = 0
478	for i < len(line) {
479		start := i
480		for i < len(line) && line[i] != '\t' {
481			_, size := utf8.DecodeRune(line[i:])
482			i += size
483			column++
484		}
485
486		if i > start {
487			out.Write(line[start:i])
488		}
489
490		if i >= len(line) {
491			break
492		}
493
494		for {
495			out.WriteByte(' ')
496			column++
497			if column%TAB_SIZE == 0 {
498				break
499			}
500		}
501
502		i++
503	}
504	out.WriteByte('\n')
505}