markdown.go (view raw)
1//
2// Black Friday Markdown Processor
3// Originally based on http://github.com/tanoku/upskirt
4// by Russ Ross <russ@russross.com>
5//
6
7//
8//
9// Markdown parsing and processing
10//
11//
12
13package blackfriday
14
15import (
16 "bytes"
17 "utf8"
18)
19
// These are the supported markdown parsing extensions.
// Each constant is a distinct bit (1 << iota); OR these values together to
// select multiple extensions and pass the result as the extensions argument
// of Markdown.
const (
	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
	EXTENSION_TABLES
	EXTENSION_FENCED_CODE
	EXTENSION_AUTOLINK      // Markdown registers the ':' inline parser when set
	EXTENSION_STRIKETHROUGH // Markdown registers the '~' inline parser when set (and an emphasis callback exists)
	EXTENSION_LAX_HTML_BLOCKS
	EXTENSION_SPACE_HEADERS
	EXTENSION_HARD_LINE_BREAK
)
32
// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together
// (they are consecutive integers starting at zero, not bit flags).
// These are mostly of interest if you are writing a new output format.
const (
	LINK_TYPE_NOT_AUTOLINK = iota
	LINK_TYPE_NORMAL
	LINK_TYPE_EMAIL
)
41
// These are the possible flag values for the listitem renderer.
// Each constant is a distinct bit (1 << iota), so multiple flag values may be
// ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	LIST_TYPE_ORDERED = 1 << iota
	LIST_ITEM_CONTAINS_BLOCK
	LIST_ITEM_END_OF_LIST
)
50
// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// Note that TABLE_ALIGNMENT_CENTER is itself defined as the combination of
// the LEFT and RIGHT bits, so a bit test for LEFT or RIGHT also matches CENTER.
// These are mostly of interest if you are writing a new output format.
const (
	TABLE_ALIGNMENT_LEFT = 1 << iota
	TABLE_ALIGNMENT_RIGHT
	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
)
59
// The size of a tab stop, in columns.
// expandTabs pads each tab with spaces up to the next multiple of this value.
const TAB_SIZE = 4
62
// These are the tags that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
// The map is used as a set: keys are lower-case tag names and every value is
// true, so a lookup of an unknown tag yields false.
var block_tags = map[string]bool{
	"p":          true,
	"dl":         true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"ol":         true,
	"ul":         true,
	"del":        true,
	"div":        true,
	"ins":        true,
	"pre":        true,
	"form":       true,
	"math":       true,
	"table":      true,
	"iframe":     true,
	"script":     true,
	"fieldset":   true,
	"noscript":   true,
	"blockquote": true,
}
89
// Renderer defines the rendering interface.
// A series of callback functions are registered to form a complete renderer.
// A single interface{} value field (Opaque) is provided, and that value is
// handed to each callback. Leaving a field nil suppresses rendering that type
// of output except where noted.
//
// Every callback writes its output into the supplied out buffer.
//
// This is mostly of interest if you are implementing a new rendering format.
// Most users will use the convenience functions to fill in this structure.
type Renderer struct {
	// block-level callbacks---nil skips the block
	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
	Header     func(out *bytes.Buffer, text func() bool, level int, opaque interface{})
	HRule      func(out *bytes.Buffer, opaque interface{})
	List       func(out *bytes.Buffer, text func() bool, flags int, opaque interface{})
	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
	Paragraph  func(out *bytes.Buffer, text func() bool, opaque interface{})
	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})

	// Span-level callbacks---nil or return 0 prints the span verbatim.
	// Markdown only registers the matching inline parser when the relevant
	// callback is non-nil (emphasis family, CodeSpan, LineBreak, Image/Link).
	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int

	// Low-level callbacks---nil copies input directly into the output
	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})

	// Header and footer: when non-nil, SecondPass calls DocumentHeader before
	// the document body is rendered and DocumentFooter after it.
	DocumentHeader func(out *bytes.Buffer, opaque interface{})
	DocumentFooter func(out *bytes.Buffer, opaque interface{})

	// User data---passed back to every callback
	Opaque interface{}
}
135
// inlineParser is the signature of the handlers stored in render.inline.
// A handler is registered under the byte that triggers it (for example '*'
// or '['); data and offset locate that byte in the text being parsed.
// NOTE(review): the int result is presumably the number of bytes consumed,
// with zero meaning "not handled" --- confirm against the inline parsing code.
type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
137
// render carries the state of a single call to Markdown.
type render struct {
	mk         *Renderer             // user-supplied rendering callbacks
	refs       map[string]*reference // link references keyed by lower-cased id; filled by isReference
	inline     [256]inlineParser     // inline handlers indexed by trigger byte; nil means no handler
	flags      uint32                // extension bits passed to Markdown
	nesting    int                   // must be zero again when SecondPass finishes, otherwise it panics
	maxNesting int                   // set to 16 by Markdown; NOTE(review): presumably the nesting limit --- enforcement is not in this part of the file
	insideLink bool                  // initialised to false by Markdown; NOTE(review): presumably true while link content is parsed --- confirm
}
147
148
149//
150//
151// Public interface
152//
153//
154
155// Parse and render a block of markdown-encoded text.
156// The renderer is used to format the output, and extensions dictates which
157// non-standard extensions are enabled.
158func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
159 // no point in parsing if we can't render
160 if renderer == nil {
161 return nil
162 }
163
164 // fill in the render structure
165 rndr := new(render)
166 rndr.mk = renderer
167 rndr.flags = extensions
168 rndr.refs = make(map[string]*reference)
169 rndr.maxNesting = 16
170 rndr.insideLink = false
171
172 // register inline parsers
173 if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
174 rndr.inline['*'] = inlineEmphasis
175 rndr.inline['_'] = inlineEmphasis
176 if extensions&EXTENSION_STRIKETHROUGH != 0 {
177 rndr.inline['~'] = inlineEmphasis
178 }
179 }
180 if rndr.mk.CodeSpan != nil {
181 rndr.inline['`'] = inlineCodeSpan
182 }
183 if rndr.mk.LineBreak != nil {
184 rndr.inline['\n'] = inlineLineBreak
185 }
186 if rndr.mk.Image != nil || rndr.mk.Link != nil {
187 rndr.inline['['] = inlineLink
188 }
189 rndr.inline['<'] = inlineLAngle
190 rndr.inline['\\'] = inlineEscape
191 rndr.inline['&'] = inlineEntity
192
193 if extensions&EXTENSION_AUTOLINK != 0 {
194 rndr.inline[':'] = inlineAutoLink
195 }
196
197 first := FirstPass(rndr, input)
198 second := SecondPass(rndr, first)
199
200 return second
201}
202
203// first pass:
204// - extract references
205// - expand tabs
206// - normalize newlines
207// - copy everything else
208func FirstPass(rndr *render, input []byte) []byte {
209 var out bytes.Buffer
210 beg, end := 0, 0
211 for beg < len(input) { // iterate over lines
212 if end = isReference(rndr, input[beg:]); end > 0 {
213 beg += end
214 } else { // skip to the next line
215 end = beg
216 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
217 end++
218 }
219
220 // add the line body if present
221 if end > beg {
222 expandTabs(&out, input[beg:end])
223 } else {
224 out.WriteByte('\n')
225 }
226
227 if end < len(input) && input[end] == '\r' {
228 end++
229 }
230 if end < len(input) && input[end] == '\n' {
231 end++
232 }
233
234 beg = end
235 }
236 }
237 return out.Bytes()
238}
239
240// second pass: actual rendering
241func SecondPass(rndr *render, input []byte) []byte {
242 var output bytes.Buffer
243 if rndr.mk.DocumentHeader != nil {
244 rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
245 }
246
247 parseBlock(&output, rndr, input)
248
249 if rndr.mk.DocumentFooter != nil {
250 rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
251 }
252
253 if rndr.nesting != 0 {
254 panic("Nesting level did not end at zero")
255 }
256
257 return output.Bytes()
258}
259
260
261//
262// Link references
263//
264// This section implements support for references that (usually) appear
265// as footnotes in a document, and can be referenced anywhere in the document.
266// The basic format is:
267//
268// [1]: http://www.google.com/ "Google"
269// [2]: http://www.github.com/ "Github"
270//
271// Anywhere in the document, the reference can be linked by referring to its
272// label, i.e., 1 and 2 in this example, as in:
273//
274// This library is hosted on [Github][2], a git hosting site.
275
// References are parsed and stored in this struct.
// Both fields are sub-slices of the buffer that was handed to isReference;
// nothing is copied.
type reference struct {
	link  []byte // link target
	title []byte // title text; zero length when the definition has no title part
}
281
282// Check whether or not data starts with a reference link.
283// If so, it is parsed and stored in the list of references
284// (in the render struct).
285// Returns the number of bytes to skip to move past it,
286// or zero if the first line is not a reference.
287func isReference(rndr *render, data []byte) int {
288 // up to 3 optional leading spaces
289 if len(data) < 4 {
290 return 0
291 }
292 i := 0
293 for i < 3 && data[i] == ' ' {
294 i++
295 }
296
297 // id part: anything but a newline between brackets
298 if data[i] != '[' {
299 return 0
300 }
301 i++
302 id_offset := i
303 for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
304 i++
305 }
306 if i >= len(data) || data[i] != ']' {
307 return 0
308 }
309 id_end := i
310
311 // spacer: colon (space | tab)* newline? (space | tab)*
312 i++
313 if i >= len(data) || data[i] != ':' {
314 return 0
315 }
316 i++
317 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
318 i++
319 }
320 if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
321 i++
322 if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
323 i++
324 }
325 }
326 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
327 i++
328 }
329 if i >= len(data) {
330 return 0
331 }
332
333 // link: whitespace-free sequence, optionally between angle brackets
334 if data[i] == '<' {
335 i++
336 }
337 link_offset := i
338 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
339 i++
340 }
341 link_end := i
342 if data[link_offset] == '<' && data[link_end-1] == '>' {
343 link_offset++
344 link_end--
345 }
346
347 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
348 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
349 i++
350 }
351 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
352 return 0
353 }
354
355 // compute end-of-line
356 line_end := 0
357 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
358 line_end = i
359 }
360 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
361 line_end++
362 }
363
364 // optional (space|tab)* spacer after a newline
365 if line_end > 0 {
366 i = line_end + 1
367 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
368 i++
369 }
370 }
371
372 // optional title: any non-newline sequence enclosed in '"() alone on its line
373 title_offset, title_end := 0, 0
374 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
375 i++
376 title_offset = i
377
378 // look for EOL
379 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
380 i++
381 }
382 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
383 title_end = i + 1
384 } else {
385 title_end = i
386 }
387
388 // step back
389 i--
390 for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
391 i--
392 }
393 if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
394 line_end = title_end
395 title_end = i
396 }
397 }
398 if line_end == 0 { // garbage after the link
399 return 0
400 }
401
402 // a valid ref has been found
403 if rndr == nil {
404 return line_end
405 }
406
407 // id matches are case-insensitive
408 id := string(bytes.ToLower(data[id_offset:id_end]))
409 rndr.refs[id] = &reference{
410 link: data[link_offset:link_end],
411 title: data[title_offset:title_end],
412 }
413
414 return line_end
415}
416
417
418//
419//
420// Miscellaneous helper functions
421//
422//
423
424
// ispunct reports whether c is an ASCII punctuation symbol, i.e. one of
//
//	!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
//
// Taken from a private function in regexp in the stdlib.
func ispunct(c byte) bool {
	// the punctuation symbols form four contiguous runs in ASCII
	switch {
	case '!' <= c && c <= '/',
		':' <= c && c <= '@',
		'[' <= c && c <= '`',
		'{' <= c && c <= '~':
		return true
	}
	return false
}
435
// isspace reports whether c is an ASCII whitespace character: space, tab,
// newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
440
// isalnum reports whether c is an ASCII letter or an ASCII digit.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'a' <= c && c <= 'z':
		return true
	case 'A' <= c && c <= 'Z':
		return true
	}
	return false
}
446
447// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
448// always ends output with a newline
449func expandTabs(out *bytes.Buffer, line []byte) {
450 // first, check for common cases: no tabs, or only tabs at beginning of line
451 i, prefix := 0, 0
452 slowcase := false
453 for i = 0; i < len(line); i++ {
454 if line[i] == '\t' {
455 if prefix == i {
456 prefix++
457 } else {
458 slowcase = true
459 break
460 }
461 }
462 }
463
464 // no need to decode runes if all tabs are at the beginning of the line
465 if !slowcase {
466 for i = 0; i < prefix*TAB_SIZE; i++ {
467 out.WriteByte(' ')
468 }
469 out.Write(line[prefix:])
470 out.WriteByte('\n')
471 return
472 }
473
474 // the slow case: we need to count runes to figure out how
475 // many spaces to insert for each tab
476 column := 0
477 i = 0
478 for i < len(line) {
479 start := i
480 for i < len(line) && line[i] != '\t' {
481 _, size := utf8.DecodeRune(line[i:])
482 i += size
483 column++
484 }
485
486 if i > start {
487 out.Write(line[start:i])
488 }
489
490 if i >= len(line) {
491 break
492 }
493
494 for {
495 out.WriteByte(' ')
496 column++
497 if column%TAB_SIZE == 0 {
498 break
499 }
500 }
501
502 i++
503 }
504 out.WriteByte('\n')
505}