//
// Black Friday Markdown Processor
// Originally based on http://github.com/tanoku/upskirt
// by Russ Ross <russ@russross.com>
//
6
//
//
// Markdown parsing and processing
//
//
12
13package blackfriday
14
15import (
16 "bytes"
17 "utf8"
18)
19
// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// Each constant occupies its own bit: the first one is 1 and every
// following one is double the previous, so any combination can be passed
// as the extensions argument of Markdown.
const (
	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
	EXTENSION_TABLES
	EXTENSION_FENCED_CODE
	EXTENSION_AUTOLINK
	EXTENSION_STRIKETHROUGH
	EXTENSION_LAX_HTML_BLOCKS
	EXTENSION_SPACE_HEADERS
)
31
// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// The values are consecutive integers starting at zero.
const (
	LINK_TYPE_NOT_AUTOLINK = iota
	LINK_TYPE_NORMAL
	LINK_TYPE_EMAIL
)
40
// These are the possible flag values for the listitem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Each constant is a separate bit (1, 2, 4).
const (
	LIST_TYPE_ORDERED = 1 << iota
	LIST_ITEM_CONTAINS_BLOCK
	LIST_ITEM_END_OF_LIST
)
49
// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Note that TABLE_ALIGNMENT_CENTER is numerically the combination of the
// left and right bits (3), so compare against it with ==, not with a bit test.
const (
	TABLE_ALIGNMENT_LEFT = 1 << iota
	TABLE_ALIGNMENT_RIGHT
	TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
)
58
// The size of a tab stop, in columns.
// expandTabs pads each tab character with spaces up to the next multiple
// of this value.
const TAB_SIZE = 4
61
// These are the tags that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
//
// The map is used as a set: a tag is a block tag exactly when its
// (lower-case) name is present as a key.
var block_tags = map[string]bool{
	"p":          true,
	"dl":         true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"ol":         true,
	"ul":         true,
	"del":        true,
	"div":        true,
	"ins":        true,
	"pre":        true,
	"form":       true,
	"math":       true,
	"table":      true,
	"iframe":     true,
	"script":     true,
	"fieldset":   true,
	"noscript":   true,
	"blockquote": true,
}
88
// This struct defines the rendering interface.
// A series of callback functions are registered to form a complete renderer.
// A single interface{} value field is provided, and that value is handed to
// each callback. Leaving a field blank suppresses rendering that type of output
// except where noted.
//
// Every callback receives the output buffer to append to as its first
// argument and the Opaque value as its last argument.
//
// This is mostly of interest if you are implementing a new rendering format.
// Most users will use the convenience functions to fill in this structure.
type Renderer struct {
	// block-level callbacks---nil skips the block
	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
	Header     func(out *bytes.Buffer, text func() bool, level int, opaque interface{})
	HRule      func(out *bytes.Buffer, opaque interface{})
	List       func(out *bytes.Buffer, text func() bool, flags int, opaque interface{})
	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
	Paragraph  func(out *bytes.Buffer, text []byte, opaque interface{})
	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})

	// Span-level callbacks---nil or return 0 prints the span verbatim
	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int

	// Low-level callbacks---nil copies input directly into the output
	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})

	// Header and footer
	DocumentHeader func(out *bytes.Buffer, opaque interface{})
	DocumentFooter func(out *bytes.Buffer, opaque interface{})

	// User data---passed back to every callback
	Opaque interface{}
}
134
135type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
136
137type render struct {
138 mk *Renderer
139 refs map[string]*reference
140 inline [256]inlineParser
141 flags uint32
142 nesting int
143 maxNesting int
144 insideLink bool
145}
146
147
//
//
// Public interface
//
//
153
154// Parse and render a block of markdown-encoded text.
155// The renderer is used to format the output, and extensions dictates which
156// non-standard extensions are enabled.
157func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
158 // no point in parsing if we can't render
159 if renderer == nil {
160 return nil
161 }
162
163 // fill in the render structure
164 rndr := new(render)
165 rndr.mk = renderer
166 rndr.flags = extensions
167 rndr.refs = make(map[string]*reference)
168 rndr.maxNesting = 16
169 rndr.insideLink = false
170
171 // register inline parsers
172 if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
173 rndr.inline['*'] = inlineEmphasis
174 rndr.inline['_'] = inlineEmphasis
175 if extensions&EXTENSION_STRIKETHROUGH != 0 {
176 rndr.inline['~'] = inlineEmphasis
177 }
178 }
179 if rndr.mk.CodeSpan != nil {
180 rndr.inline['`'] = inlineCodeSpan
181 }
182 if rndr.mk.LineBreak != nil {
183 rndr.inline['\n'] = inlineLineBreak
184 }
185 if rndr.mk.Image != nil || rndr.mk.Link != nil {
186 rndr.inline['['] = inlineLink
187 }
188 rndr.inline['<'] = inlineLAngle
189 rndr.inline['\\'] = inlineEscape
190 rndr.inline['&'] = inlineEntity
191
192 if extensions&EXTENSION_AUTOLINK != 0 {
193 rndr.inline[':'] = inlineAutoLink
194 }
195
196 // first pass: look for references, copy everything else
197 var text bytes.Buffer
198 beg, end := 0, 0
199 for beg < len(input) { // iterate over lines
200 if end = isReference(rndr, input[beg:]); end > 0 {
201 beg += end
202 } else { // skip to the next line
203 end = beg
204 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
205 end++
206 }
207
208 // add the line body if present
209 if end > beg {
210 expandTabs(&text, input[beg:end])
211 }
212
213 for end < len(input) && (input[end] == '\n' || input[end] == '\r') {
214 // add one \n per newline
215 if input[end] == '\n' || (end+1 < len(input) && input[end+1] != '\n') {
216 text.WriteByte('\n')
217 }
218 end++
219 }
220
221 beg = end
222 }
223 }
224
225 // second pass: actual rendering
226 var output bytes.Buffer
227 if rndr.mk.DocumentHeader != nil {
228 rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
229 }
230
231 if text.Len() > 0 {
232 // add a final newline if not already present
233 finalchar := text.Bytes()[text.Len()-1]
234 if finalchar != '\n' && finalchar != '\r' {
235 text.WriteByte('\n')
236 }
237 parseBlock(&output, rndr, text.Bytes())
238 }
239
240 if rndr.mk.DocumentFooter != nil {
241 rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
242 }
243
244 if rndr.nesting != 0 {
245 panic("Nesting level did not end at zero")
246 }
247
248 return output.Bytes()
249}
250
251
//
// Link references
//
// This section implements support for references that (usually) appear
// as footnotes in a document, and can be referenced anywhere in the document.
// The basic format is:
//
//    [1]: http://www.google.com/ "Google"
//    [2]: http://www.github.com/ "Github"
//
// Anywhere in the document, the reference can be linked by referring to its
// label, i.e., 1 and 2 in this example, as in:
//
//    This library is hosted on [Github][2], a git hosting site.
266
// References are parsed and stored in this struct.
// link is the target of the reference and title its optional title; title
// is empty when the definition has none.
type reference struct {
	link  []byte
	title []byte
}
272
273// Check whether or not data starts with a reference link.
274// If so, it is parsed and stored in the list of references
275// (in the render struct).
276// Returns the number of bytes to skip to move past it, or zero
277// if there is the first line is not a reference.
278func isReference(rndr *render, data []byte) int {
279 // up to 3 optional leading spaces
280 if len(data) < 4 {
281 return 0
282 }
283 i := 0
284 for i < 3 && data[i] == ' ' {
285 i++
286 }
287 if data[i] == ' ' {
288 return 0
289 }
290
291 // id part: anything but a newline between brackets
292 if data[i] != '[' {
293 return 0
294 }
295 i++
296 id_offset := i
297 for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
298 i++
299 }
300 if i >= len(data) || data[i] != ']' {
301 return 0
302 }
303 id_end := i
304
305 // spacer: colon (space | tab)* newline? (space | tab)*
306 i++
307 if i >= len(data) || data[i] != ':' {
308 return 0
309 }
310 i++
311 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
312 i++
313 }
314 if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
315 i++
316 if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
317 i++
318 }
319 }
320 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
321 i++
322 }
323 if i >= len(data) {
324 return 0
325 }
326
327 // link: whitespace-free sequence, optionally between angle brackets
328 if data[i] == '<' {
329 i++
330 }
331 link_offset := i
332 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
333 i++
334 }
335 link_end := i
336 if data[link_offset] == '<' && data[link_end-1] == '>' {
337 link_offset++
338 link_end--
339 }
340
341 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
342 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
343 i++
344 }
345 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
346 return 0
347 }
348
349 // compute end-of-line
350 line_end := 0
351 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
352 line_end = i
353 }
354 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
355 line_end++
356 }
357
358 // optional (space|tab)* spacer after a newline
359 if line_end > 0 {
360 i = line_end + 1
361 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
362 i++
363 }
364 }
365
366 // optional title: any non-newline sequence enclosed in '"() alone on its line
367 title_offset, title_end := 0, 0
368 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
369 i++
370 title_offset = i
371
372 // look for EOL
373 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
374 i++
375 }
376 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
377 title_end = i + 1
378 } else {
379 title_end = i
380 }
381
382 // step back
383 i--
384 for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
385 i--
386 }
387 if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
388 line_end = title_end
389 title_end = i
390 }
391 }
392 if line_end == 0 { // garbage after the link
393 return 0
394 }
395
396 // a valid ref has been found
397 if rndr == nil {
398 return line_end
399 }
400
401 // id matches are case-insensitive
402 id := string(bytes.ToLower(data[id_offset:id_end]))
403 rndr.refs[id] = &reference{
404 link: data[link_offset:link_end],
405 title: data[title_offset:title_end],
406 }
407
408 return line_end
409}
410
411
//
//
// Miscellaneous helper functions
//
//
417
418
// Test if a character is a punctuation symbol.
// Taken from a private function in regexp in the stdlib.
//
// The accepted set is !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ which is exactly the
// four ASCII ranges tested below, so no per-call table scan is needed.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/',
		c >= ':' && c <= '@',
		c >= '[' && c <= '`',
		c >= '{' && c <= '~':
		return true
	}
	return false
}
429
// Test if a character is a whitespace character:
// space, tab, newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
434
// Test if a character is a letter or a digit.
// Only the ASCII ranges 0-9, a-z and A-Z are recognized.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case c >= '0' && c <= '9',
		c >= 'a' && c <= 'z',
		c >= 'A' && c <= 'Z':
		return true
	}
	return false
}
440
441// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
442func expandTabs(out *bytes.Buffer, line []byte) {
443 // first, check for common cases: no tabs, or only tabs at beginning of line
444 i, prefix := 0, 0
445 slowcase := false
446 for i = 0; i < len(line); i++ {
447 if line[i] == '\t' {
448 if prefix == i {
449 prefix++
450 } else {
451 slowcase = true
452 break
453 }
454 }
455 }
456
457 // no need to decode runes if all tabs are at the beginning of the line
458 if !slowcase {
459 for i = 0; i < prefix*TAB_SIZE; i++ {
460 out.WriteByte(' ')
461 }
462 out.Write(line[prefix:])
463 return
464 }
465
466 // the slow case: we need to count runes to figure out how
467 // many spaces to insert for each tab
468 column := 0
469 i = 0
470 for i < len(line) {
471 start := i
472 for i < len(line) && line[i] != '\t' {
473 _, size := utf8.DecodeRune(line[i:])
474 i += size
475 column++
476 }
477
478 if i > start {
479 out.Write(line[start:i])
480 }
481
482 if i >= len(line) {
483 break
484 }
485
486 for {
487 out.WriteByte(' ')
488 column++
489 if column%TAB_SIZE == 0 {
490 break
491 }
492 }
493
494 i++
495 }
496}