markdown.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// Markdown parsing and processing
13//
14//
15
// Package blackfriday is a markdown processor.
//
// It translates plain text with simple formatting rules into HTML or LaTeX.
19package blackfriday
20
21import (
22 "bytes"
23 "unicode/utf8"
24)
25
26const VERSION = "1.1"
27
// Markdown parsing extensions.
//
// Every constant occupies its own bit, so several extensions can be enabled
// at once by ORing the values together.
const (
	// EXTENSION_NO_INTRA_EMPHASIS ignores emphasis markers inside words.
	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
	// EXTENSION_TABLES renders tables.
	EXTENSION_TABLES
	// EXTENSION_FENCED_CODE renders fenced code blocks.
	EXTENSION_FENCED_CODE
	// EXTENSION_AUTOLINK detects embedded URLs that are not explicitly marked.
	EXTENSION_AUTOLINK
	// EXTENSION_STRIKETHROUGH enables strikethrough text written as ~~test~~.
	EXTENSION_STRIKETHROUGH
	// EXTENSION_LAX_HTML_BLOCKS loosens the HTML block parsing rules.
	EXTENSION_LAX_HTML_BLOCKS
	// EXTENSION_SPACE_HEADERS is strict about prefix header rules.
	EXTENSION_SPACE_HEADERS
	// EXTENSION_HARD_LINE_BREAK translates newlines into line breaks.
	EXTENSION_HARD_LINE_BREAK
	// EXTENSION_TAB_SIZE_EIGHT expands tabs to eight spaces instead of four.
	EXTENSION_TAB_SIZE_EIGHT
)
41
// Link kinds handed to the link renderer.
//
// These are plain enumeration values (0, 1, 2): exactly one of them is used
// at a time and they are never ORed together. They mostly matter when
// writing a new output format.
const (
	LINK_TYPE_NOT_AUTOLINK = iota // zero value of the enumeration
	LINK_TYPE_NORMAL
	LINK_TYPE_EMAIL
)
50
// Flags handed to the ListItem renderer.
//
// Each constant is a separate bit, so several of them may be ORed together.
// They mostly matter when writing a new output format.
const (
	LIST_TYPE_ORDERED = 1 << iota // bit 0
	LIST_ITEM_CONTAINS_BLOCK      // bit 1
	LIST_ITEM_BEGINNING_OF_LIST   // bit 2
	LIST_ITEM_END_OF_LIST         // bit 3
)
60
// Alignment values handed to the table cell renderer.
//
// A cell carries exactly one of these values; callers do not OR them
// together themselves. TABLE_ALIGNMENT_CENTER is defined as the union of the
// left and right bits. They mostly matter when writing a new output format.
const (
	TABLE_ALIGNMENT_LEFT = 1 << iota
	TABLE_ALIGNMENT_RIGHT
	TABLE_ALIGNMENT_CENTER = TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT
)
69
// Supported tab stop widths, in columns.
const (
	// TAB_SIZE_DEFAULT is the width used unless another one is selected.
	TAB_SIZE_DEFAULT = 4
	// TAB_SIZE_EIGHT is the alternative, wider tab stop.
	TAB_SIZE_EIGHT = 8
)
75
// blockTags is the set of tag names recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
// Only membership matters: every listed name maps to true.
var blockTags = map[string]bool{
	// headings
	"h1": true,
	"h2": true,
	"h3": true,
	"h4": true,
	"h5": true,
	"h6": true,

	// text blocks and lists
	"p":          true,
	"dl":         true,
	"ol":         true,
	"ul":         true,
	"div":        true,
	"pre":        true,
	"blockquote": true,

	// edits
	"del": true,
	"ins": true,

	// forms, tables and embedded content
	"form":     true,
	"fieldset": true,
	"table":    true,
	"math":     true,
	"iframe":   true,
	"script":   true,
	"noscript": true,

	// HTML5
	"article":    true,
	"aside":      true,
	"canvas":     true,
	"figcaption": true,
	"figure":     true,
	"footer":     true,
	"header":     true,
	"hgroup":     true,
	"output":     true,
	"progress":   true,
	"section":    true,
	"video":      true,
}
116
// Renderer is the interface every output format implements.
// It is mostly of interest when implementing a new rendering format;
// Html and Latex implementations are currently provided.
//
// Content reaches a method in one of two ways:
//
// A byte slice holds the already rendered contents of the element.
//
// A callback writes the contents of the element directly to the output
// buffer and returns true on success. When it returns false, the rendering
// method should reset the output buffer as though it had never been called.
type Renderer interface {
	// Block-level elements.
	BlockCode(out *bytes.Buffer, text []byte, lang string)
	BlockQuote(out *bytes.Buffer, text []byte)
	BlockHtml(out *bytes.Buffer, text []byte)
	Header(out *bytes.Buffer, text func() bool, level int)
	HRule(out *bytes.Buffer)
	List(out *bytes.Buffer, text func() bool, flags int)
	ListItem(out *bytes.Buffer, text []byte, flags int)
	Paragraph(out *bytes.Buffer, text func() bool)
	Table(out *bytes.Buffer, header, body []byte, columnData []int)
	TableRow(out *bytes.Buffer, text []byte)
	TableCell(out *bytes.Buffer, text []byte, flags int)

	// Span-level elements.
	AutoLink(out *bytes.Buffer, link []byte, kind int)
	CodeSpan(out *bytes.Buffer, text []byte)
	DoubleEmphasis(out *bytes.Buffer, text []byte)
	Emphasis(out *bytes.Buffer, text []byte)
	Image(out *bytes.Buffer, link, title, alt []byte)
	LineBreak(out *bytes.Buffer)
	Link(out *bytes.Buffer, link, title, content []byte)
	RawHtmlTag(out *bytes.Buffer, tag []byte)
	TripleEmphasis(out *bytes.Buffer, text []byte)
	StrikeThrough(out *bytes.Buffer, text []byte)

	// Low-level elements.
	Entity(out *bytes.Buffer, entity []byte)
	NormalText(out *bytes.Buffer, text []byte)

	// Document header and footer.
	DocumentHeader(out *bytes.Buffer)
	DocumentFooter(out *bytes.Buffer)
}
163
164// Callback functions for inline parsing. One such function is defined
165// for each character that triggers a response when parsing inline data.
166type inlineParser func(p *parser, out *bytes.Buffer, data []byte, offset int) int
167
168// Parser holds runtime state used by the parser.
169// This is constructed by the Markdown function.
170type parser struct {
171 r Renderer
172 refs map[string]*reference
173 inlineCallback [256]inlineParser
174 flags int
175 nesting int
176 maxNesting int
177 insideLink bool
178}
179
180//
181//
182// Public interface
183//
184//
185
186// MarkdownBasic is a convenience function for simple rendering.
187// It processes markdown input with no extensions enabled.
188func MarkdownBasic(input []byte) []byte {
189 // set up the HTML renderer
190 htmlFlags := HTML_USE_XHTML
191 renderer := HtmlRenderer(htmlFlags, "", "")
192
193 // set up the parser
194 extensions := 0
195
196 return Markdown(input, renderer, extensions)
197}
198
199// Call Markdown with most useful extensions enabled
200// MarkdownCommon is a convenience function for simple rendering.
201// It processes markdown input with common extensions enabled, including:
202//
203// * Smartypants processing with smart fractions and LaTeX dashes
204//
205// * Intra-word emphasis suppression
206//
207// * Tables
208//
209// * Fenced code blocks
210//
211// * Autolinking
212//
213// * Strikethrough support
214//
215// * Strict header parsing
216func MarkdownCommon(input []byte) []byte {
217 // set up the HTML renderer
218 htmlFlags := 0
219 htmlFlags |= HTML_USE_XHTML
220 htmlFlags |= HTML_USE_SMARTYPANTS
221 htmlFlags |= HTML_SMARTYPANTS_FRACTIONS
222 htmlFlags |= HTML_SMARTYPANTS_LATEX_DASHES
223 htmlFlags |= HTML_SKIP_SCRIPT
224 renderer := HtmlRenderer(htmlFlags, "", "")
225
226 // set up the parser
227 extensions := 0
228 extensions |= EXTENSION_NO_INTRA_EMPHASIS
229 extensions |= EXTENSION_TABLES
230 extensions |= EXTENSION_FENCED_CODE
231 extensions |= EXTENSION_AUTOLINK
232 extensions |= EXTENSION_STRIKETHROUGH
233 extensions |= EXTENSION_SPACE_HEADERS
234
235 return Markdown(input, renderer, extensions)
236}
237
238// Markdown is the main rendering function.
239// It parses and renders a block of markdown-encoded text.
240// The supplied Renderer is used to format the output, and extensions dictates
241// which non-standard extensions are enabled.
242//
243// To use the supplied Html or LaTeX renderers, see HtmlRenderer and
244// LatexRenderer, respectively.
245func Markdown(input []byte, renderer Renderer, extensions int) []byte {
246 // no point in parsing if we can't render
247 if renderer == nil {
248 return nil
249 }
250
251 // fill in the render structure
252 p := new(parser)
253 p.r = renderer
254 p.flags = extensions
255 p.refs = make(map[string]*reference)
256 p.maxNesting = 16
257 p.insideLink = false
258
259 // register inline parsers
260 p.inlineCallback['*'] = emphasis
261 p.inlineCallback['_'] = emphasis
262 if extensions&EXTENSION_STRIKETHROUGH != 0 {
263 p.inlineCallback['~'] = emphasis
264 }
265 p.inlineCallback['`'] = codeSpan
266 p.inlineCallback['\n'] = lineBreak
267 p.inlineCallback['['] = link
268 p.inlineCallback['<'] = leftAngle
269 p.inlineCallback['\\'] = escape
270 p.inlineCallback['&'] = entity
271
272 if extensions&EXTENSION_AUTOLINK != 0 {
273 p.inlineCallback[':'] = autoLink
274 }
275
276 first := firstPass(p, input)
277 second := secondPass(p, first)
278
279 return second
280}
281
282// first pass:
283// - extract references
284// - expand tabs
285// - normalize newlines
286// - copy everything else
287func firstPass(p *parser, input []byte) []byte {
288 var out bytes.Buffer
289 tabSize := TAB_SIZE_DEFAULT
290 if p.flags&EXTENSION_TAB_SIZE_EIGHT != 0 {
291 tabSize = TAB_SIZE_EIGHT
292 }
293 beg, end := 0, 0
294 for beg < len(input) { // iterate over lines
295 if end = isReference(p, input[beg:]); end > 0 {
296 beg += end
297 } else { // skip to the next line
298 end = beg
299 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
300 end++
301 }
302
303 // add the line body if present
304 if end > beg {
305 expandTabs(&out, input[beg:end], tabSize)
306 }
307 out.WriteByte('\n')
308
309 if end < len(input) && input[end] == '\r' {
310 end++
311 }
312 if end < len(input) && input[end] == '\n' {
313 end++
314 }
315
316 beg = end
317 }
318 }
319
320 // empty input?
321 if out.Len() == 0 {
322 out.WriteByte('\n')
323 }
324
325 return out.Bytes()
326}
327
328// second pass: actual rendering
329func secondPass(p *parser, input []byte) []byte {
330 var output bytes.Buffer
331
332 p.r.DocumentHeader(&output)
333 p.block(&output, input)
334 p.r.DocumentFooter(&output)
335
336 if p.nesting != 0 {
337 panic("Nesting level did not end at zero")
338 }
339
340 return output.Bytes()
341}
342
343//
344// Link references
345//
346// This section implements support for references that (usually) appear
347// as footnotes in a document, and can be referenced anywhere in the document.
348// The basic format is:
349//
350// [1]: http://www.google.com/ "Google"
351// [2]: http://www.github.com/ "Github"
352//
353// Anywhere in the document, the reference can be linked by referring to its
354// label, i.e., 1 and 2 in this example, as in:
355//
356// This library is hosted on [Github][2], a git hosting site.
357
// reference is a parsed link reference definition: the link target and the
// optional title that belong to one reference id.
type reference struct {
	link, title []byte
}
363
364// Check whether or not data starts with a reference link.
365// If so, it is parsed and stored in the list of references
366// (in the render struct).
367// Returns the number of bytes to skip to move past it,
368// or zero if the first line is not a reference.
369func isReference(p *parser, data []byte) int {
370 // up to 3 optional leading spaces
371 if len(data) < 4 {
372 return 0
373 }
374 i := 0
375 for i < 3 && data[i] == ' ' {
376 i++
377 }
378
379 // id part: anything but a newline between brackets
380 if data[i] != '[' {
381 return 0
382 }
383 i++
384 idOffset := i
385 for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
386 i++
387 }
388 if i >= len(data) || data[i] != ']' {
389 return 0
390 }
391 idEnd := i
392
393 // spacer: colon (space | tab)* newline? (space | tab)*
394 i++
395 if i >= len(data) || data[i] != ':' {
396 return 0
397 }
398 i++
399 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
400 i++
401 }
402 if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
403 i++
404 if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
405 i++
406 }
407 }
408 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
409 i++
410 }
411 if i >= len(data) {
412 return 0
413 }
414
415 // link: whitespace-free sequence, optionally between angle brackets
416 if data[i] == '<' {
417 i++
418 }
419 linkOffset := i
420 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
421 i++
422 }
423 linkEnd := i
424 if data[linkOffset] == '<' && data[linkEnd-1] == '>' {
425 linkOffset++
426 linkEnd--
427 }
428
429 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
430 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
431 i++
432 }
433 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
434 return 0
435 }
436
437 // compute end-of-line
438 lineEnd := 0
439 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
440 lineEnd = i
441 }
442 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
443 lineEnd++
444 }
445
446 // optional (space|tab)* spacer after a newline
447 if lineEnd > 0 {
448 i = lineEnd + 1
449 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
450 i++
451 }
452 }
453
454 // optional title: any non-newline sequence enclosed in '"() alone on its line
455 titleOffset, titleEnd := 0, 0
456 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
457 i++
458 titleOffset = i
459
460 // look for EOL
461 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
462 i++
463 }
464 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
465 titleEnd = i + 1
466 } else {
467 titleEnd = i
468 }
469
470 // step back
471 i--
472 for i > titleOffset && (data[i] == ' ' || data[i] == '\t') {
473 i--
474 }
475 if i > titleOffset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
476 lineEnd = titleEnd
477 titleEnd = i
478 }
479 }
480 if lineEnd == 0 { // garbage after the link
481 return 0
482 }
483
484 // a valid ref has been found
485
486 // id matches are case-insensitive
487 id := string(bytes.ToLower(data[idOffset:idEnd]))
488 p.refs[id] = &reference{
489 link: data[linkOffset:linkEnd],
490 title: data[titleOffset:titleEnd],
491 }
492
493 return lineEnd
494}
495
496//
497//
498// Miscellaneous helper functions
499//
500//
501
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
// !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
//
// The symbols form four contiguous runs in ASCII, so the check is written
// as four range comparisons.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/',
		c >= ':' && c <= '@',
		c >= '[' && c <= '`',
		c >= '{' && c <= '~':
		return true
	}
	return false
}
512
// isspace reports whether c is one of the six ASCII whitespace characters:
// space, tab, newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
517
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
523
// expandTabs writes line to out with every tab character replaced by
// spaces, padding up to the next multiple of tabSize columns. A column is
// one decoded rune, not one byte.
//
// Nothing is appended after the line: a trailing newline, if wanted, is the
// caller's job.
func expandTabs(out *bytes.Buffer, line []byte, tabSize int) {
	// Fast path: the only tabs are an unbroken run at the very start of the
	// line (or there are none at all). Each of them is then exactly tabSize
	// columns wide and no runes need to be decoded.
	leading := 0
	for leading < len(line) && line[leading] == '\t' {
		leading++
	}
	if bytes.IndexByte(line[leading:], '\t') < 0 {
		for n := leading * tabSize; n > 0; n-- {
			out.WriteByte(' ')
		}
		out.Write(line[leading:])
		return
	}

	// Slow path: track the current column rune by rune so that each tab can
	// be padded to the next tab stop.
	col := 0
	for pos := 0; pos < len(line); {
		if line[pos] != '\t' {
			_, width := utf8.DecodeRune(line[pos:])
			out.Write(line[pos : pos+width])
			pos += width
			col++
			continue
		}

		// a tab always yields at least one space
		out.WriteByte(' ')
		col++
		for col%tabSize != 0 {
			out.WriteByte(' ')
			col++
		}
		pos++
	}
}
580}