markdown.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Licensed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// Markdown parsing and processing
13//
14//
15
16package blackfriday
17
18import (
19 "bytes"
20 "utf8"
21)
22
// These are the supported markdown parsing extensions.
// OR these values together to select multiple extensions.
//
// Each constant occupies its own bit; the numeric value is given next to it.
const (
	EXTENSION_NO_INTRA_EMPHASIS = 1 << iota // 1
	EXTENSION_TABLES                        // 2
	EXTENSION_FENCED_CODE                   // 4
	EXTENSION_AUTOLINK                      // 8; Markdown registers the ':' inline parser when set
	EXTENSION_STRIKETHROUGH                 // 16; Markdown registers the '~' inline parser when set
	EXTENSION_LAX_HTML_BLOCKS               // 32
	EXTENSION_SPACE_HEADERS                 // 64
	EXTENSION_HARD_LINE_BREAK               // 128
)
35
// These are the possible flag values for the link renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	LINK_TYPE_NOT_AUTOLINK = iota // 0
	LINK_TYPE_NORMAL              // 1
	LINK_TYPE_EMAIL               // 2
)
44
// These are the possible flag values for the listitem renderer.
// Multiple flag values may be ORed together.
// These are mostly of interest if you are writing a new output format.
const (
	LIST_TYPE_ORDERED = 1 << iota // 1
	LIST_ITEM_CONTAINS_BLOCK      // 2
	LIST_ITEM_END_OF_LIST         // 4
)
53
// These are the possible flag values for the table cell renderer.
// Only a single one of these values will be used; they are not ORed together.
// These are mostly of interest if you are writing a new output format.
//
// Left is 1 and right is 2; center is defined as both bits set (3).
const (
	TABLE_ALIGNMENT_LEFT = 1 << iota
	TABLE_ALIGNMENT_RIGHT
	TABLE_ALIGNMENT_CENTER = TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT
)
62
// The size of a tab stop, in columns.
// expandTabs pads each tab with spaces up to the next multiple of this value.
const TAB_SIZE = 4
65
// These are the tags that are recognized as HTML block tags.
// Any of these can be included in markdown text without special escaping.
//
// The map is used as a set: every listed tag maps to true, and a lookup of
// any other name yields false. Keys are lowercase.
var block_tags = map[string]bool{
	"p":          true,
	"dl":         true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"ol":         true,
	"ul":         true,
	"del":        true,
	"div":        true,
	"ins":        true,
	"pre":        true,
	"form":       true,
	"math":       true,
	"table":      true,
	"iframe":     true,
	"script":     true,
	"fieldset":   true,
	"noscript":   true,
	"blockquote": true,
}
92
// Renderer defines the rendering interface.
// A series of callback functions are registered to form a complete renderer.
// A single interface{} value field is provided, and that value is handed to
// each callback. Leaving a field blank suppresses rendering that type of output
// except where noted.
//
// Every callback writes its output to the supplied buffer and receives the
// Opaque value as its last argument.
//
// This is mostly of interest if you are implementing a new rendering format.
// Most users will use the convenience functions to fill in this structure.
type Renderer struct {
	// block-level callbacks---nil skips the block
	BlockCode  func(out *bytes.Buffer, text []byte, lang string, opaque interface{})
	BlockQuote func(out *bytes.Buffer, text []byte, opaque interface{})
	BlockHtml  func(out *bytes.Buffer, text []byte, opaque interface{})
	Header     func(out *bytes.Buffer, text func() bool, level int, opaque interface{})
	HRule      func(out *bytes.Buffer, opaque interface{})
	List       func(out *bytes.Buffer, text func() bool, flags int, opaque interface{})
	ListItem   func(out *bytes.Buffer, text []byte, flags int, opaque interface{})
	Paragraph  func(out *bytes.Buffer, text func() bool, opaque interface{})
	Table      func(out *bytes.Buffer, header []byte, body []byte, columnData []int, opaque interface{})
	TableRow   func(out *bytes.Buffer, text []byte, opaque interface{})
	TableCell  func(out *bytes.Buffer, text []byte, flags int, opaque interface{})

	// Span-level callbacks---nil or return 0 prints the span verbatim
	AutoLink       func(out *bytes.Buffer, link []byte, kind int, opaque interface{}) int
	CodeSpan       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	DoubleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Emphasis       func(out *bytes.Buffer, text []byte, opaque interface{}) int
	Image          func(out *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
	LineBreak      func(out *bytes.Buffer, opaque interface{}) int
	Link           func(out *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
	RawHtmlTag     func(out *bytes.Buffer, tag []byte, opaque interface{}) int
	TripleEmphasis func(out *bytes.Buffer, text []byte, opaque interface{}) int
	StrikeThrough  func(out *bytes.Buffer, text []byte, opaque interface{}) int

	// Low-level callbacks---nil copies input directly into the output
	Entity     func(out *bytes.Buffer, entity []byte, opaque interface{})
	NormalText func(out *bytes.Buffer, text []byte, opaque interface{})

	// Header and footer---called once each, before and after the document
	// body is rendered; nil skips the call
	DocumentHeader func(out *bytes.Buffer, opaque interface{})
	DocumentFooter func(out *bytes.Buffer, opaque interface{})

	// User data---passed back to every callback
	Opaque interface{}
}
138
139type inlineParser func(out *bytes.Buffer, rndr *render, data []byte, offset int) int
140
141type render struct {
142 mk *Renderer
143 refs map[string]*reference
144 inline [256]inlineParser
145 flags uint32
146 nesting int
147 maxNesting int
148 insideLink bool
149}
150
151
152//
153//
154// Public interface
155//
156//
157
158// Parse and render a block of markdown-encoded text.
159// The renderer is used to format the output, and extensions dictates which
160// non-standard extensions are enabled.
161func Markdown(input []byte, renderer *Renderer, extensions uint32) []byte {
162 // no point in parsing if we can't render
163 if renderer == nil {
164 return nil
165 }
166
167 // fill in the render structure
168 rndr := new(render)
169 rndr.mk = renderer
170 rndr.flags = extensions
171 rndr.refs = make(map[string]*reference)
172 rndr.maxNesting = 16
173 rndr.insideLink = false
174
175 // register inline parsers
176 if rndr.mk.Emphasis != nil || rndr.mk.DoubleEmphasis != nil || rndr.mk.TripleEmphasis != nil {
177 rndr.inline['*'] = inlineEmphasis
178 rndr.inline['_'] = inlineEmphasis
179 if extensions&EXTENSION_STRIKETHROUGH != 0 {
180 rndr.inline['~'] = inlineEmphasis
181 }
182 }
183 if rndr.mk.CodeSpan != nil {
184 rndr.inline['`'] = inlineCodeSpan
185 }
186 if rndr.mk.LineBreak != nil {
187 rndr.inline['\n'] = inlineLineBreak
188 }
189 if rndr.mk.Image != nil || rndr.mk.Link != nil {
190 rndr.inline['['] = inlineLink
191 }
192 rndr.inline['<'] = inlineLAngle
193 rndr.inline['\\'] = inlineEscape
194 rndr.inline['&'] = inlineEntity
195
196 if extensions&EXTENSION_AUTOLINK != 0 {
197 rndr.inline[':'] = inlineAutoLink
198 }
199
200 first := FirstPass(rndr, input)
201 second := SecondPass(rndr, first)
202
203 return second
204}
205
206// first pass:
207// - extract references
208// - expand tabs
209// - normalize newlines
210// - copy everything else
211func FirstPass(rndr *render, input []byte) []byte {
212 var out bytes.Buffer
213 beg, end := 0, 0
214 for beg < len(input) { // iterate over lines
215 if end = isReference(rndr, input[beg:]); end > 0 {
216 beg += end
217 } else { // skip to the next line
218 end = beg
219 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
220 end++
221 }
222
223 // add the line body if present
224 if end > beg {
225 expandTabs(&out, input[beg:end])
226 } else {
227 out.WriteByte('\n')
228 }
229
230 if end < len(input) && input[end] == '\r' {
231 end++
232 }
233 if end < len(input) && input[end] == '\n' {
234 end++
235 }
236
237 beg = end
238 }
239 }
240 return out.Bytes()
241}
242
243// second pass: actual rendering
244func SecondPass(rndr *render, input []byte) []byte {
245 var output bytes.Buffer
246 if rndr.mk.DocumentHeader != nil {
247 rndr.mk.DocumentHeader(&output, rndr.mk.Opaque)
248 }
249
250 parseBlock(&output, rndr, input)
251
252 if rndr.mk.DocumentFooter != nil {
253 rndr.mk.DocumentFooter(&output, rndr.mk.Opaque)
254 }
255
256 if rndr.nesting != 0 {
257 panic("Nesting level did not end at zero")
258 }
259
260 return output.Bytes()
261}
262
263
264//
265// Link references
266//
267// This section implements support for references that (usually) appear
268// as footnotes in a document, and can be referenced anywhere in the document.
269// The basic format is:
270//
271// [1]: http://www.google.com/ "Google"
272// [2]: http://www.github.com/ "Github"
273//
274// Anywhere in the document, the reference can be linked by referring to its
275// label, i.e., 1 and 2 in this example, as in:
276//
277// This library is hosted on [Github][2], a git hosting site.
278
// References are parsed and stored in this struct.
// Both fields are filled in by isReference with sub-slices of the text it
// was given; the title is empty when the reference has none.
type reference struct {
	link  []byte // link target
	title []byte // optional title, without its enclosing delimiters
}
284
285// Check whether or not data starts with a reference link.
286// If so, it is parsed and stored in the list of references
287// (in the render struct).
288// Returns the number of bytes to skip to move past it,
289// or zero if the first line is not a reference.
290func isReference(rndr *render, data []byte) int {
291 // up to 3 optional leading spaces
292 if len(data) < 4 {
293 return 0
294 }
295 i := 0
296 for i < 3 && data[i] == ' ' {
297 i++
298 }
299
300 // id part: anything but a newline between brackets
301 if data[i] != '[' {
302 return 0
303 }
304 i++
305 id_offset := i
306 for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
307 i++
308 }
309 if i >= len(data) || data[i] != ']' {
310 return 0
311 }
312 id_end := i
313
314 // spacer: colon (space | tab)* newline? (space | tab)*
315 i++
316 if i >= len(data) || data[i] != ':' {
317 return 0
318 }
319 i++
320 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
321 i++
322 }
323 if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
324 i++
325 if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
326 i++
327 }
328 }
329 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
330 i++
331 }
332 if i >= len(data) {
333 return 0
334 }
335
336 // link: whitespace-free sequence, optionally between angle brackets
337 if data[i] == '<' {
338 i++
339 }
340 link_offset := i
341 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
342 i++
343 }
344 link_end := i
345 if data[link_offset] == '<' && data[link_end-1] == '>' {
346 link_offset++
347 link_end--
348 }
349
350 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
351 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
352 i++
353 }
354 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
355 return 0
356 }
357
358 // compute end-of-line
359 line_end := 0
360 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
361 line_end = i
362 }
363 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
364 line_end++
365 }
366
367 // optional (space|tab)* spacer after a newline
368 if line_end > 0 {
369 i = line_end + 1
370 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
371 i++
372 }
373 }
374
375 // optional title: any non-newline sequence enclosed in '"() alone on its line
376 title_offset, title_end := 0, 0
377 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
378 i++
379 title_offset = i
380
381 // look for EOL
382 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
383 i++
384 }
385 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
386 title_end = i + 1
387 } else {
388 title_end = i
389 }
390
391 // step back
392 i--
393 for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
394 i--
395 }
396 if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
397 line_end = title_end
398 title_end = i
399 }
400 }
401 if line_end == 0 { // garbage after the link
402 return 0
403 }
404
405 // a valid ref has been found
406 if rndr == nil {
407 return line_end
408 }
409
410 // id matches are case-insensitive
411 id := string(bytes.ToLower(data[id_offset:id_end]))
412 rndr.refs[id] = &reference{
413 link: data[link_offset:link_end],
414 title: data[title_offset:title_end],
415 }
416
417 return line_end
418}
419
420
421//
422//
423// Miscellaneous helper functions
424//
425//
426
427
// Test if a character is a punctuation symbol.
// Taken from a private function in regexp in the stdlib.
//
// The accepted set is !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ which is exactly the
// four runs of printable ASCII that lie between the space, the digits, the
// uppercase letters, the lowercase letters and DEL.
func ispunct(c byte) bool {
	switch {
	case c >= '!' && c <= '/',
		c >= ':' && c <= '@',
		c >= '[' && c <= '`',
		c >= '{' && c <= '~':
		return true
	}
	return false
}
438
// Test if a character is a whitespace character:
// space, tab, newline, carriage return, form feed or vertical tab.
func isspace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f', '\v':
		return true
	}
	return false
}
443
// Test if a character is an ASCII letter or digit.
// Bytes outside the ASCII range are never reported as alphanumeric.
// TODO: check when this is looking for ASCII alnum and when it should use unicode
func isalnum(c byte) bool {
	switch {
	case c >= '0' && c <= '9',
		c >= 'a' && c <= 'z',
		c >= 'A' && c <= 'Z':
		return true
	}
	return false
}
449
450// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
451// always ends output with a newline
452func expandTabs(out *bytes.Buffer, line []byte) {
453 // first, check for common cases: no tabs, or only tabs at beginning of line
454 i, prefix := 0, 0
455 slowcase := false
456 for i = 0; i < len(line); i++ {
457 if line[i] == '\t' {
458 if prefix == i {
459 prefix++
460 } else {
461 slowcase = true
462 break
463 }
464 }
465 }
466
467 // no need to decode runes if all tabs are at the beginning of the line
468 if !slowcase {
469 for i = 0; i < prefix*TAB_SIZE; i++ {
470 out.WriteByte(' ')
471 }
472 out.Write(line[prefix:])
473 out.WriteByte('\n')
474 return
475 }
476
477 // the slow case: we need to count runes to figure out how
478 // many spaces to insert for each tab
479 column := 0
480 i = 0
481 for i < len(line) {
482 start := i
483 for i < len(line) && line[i] != '\t' {
484 _, size := utf8.DecodeRune(line[i:])
485 i += size
486 column++
487 }
488
489 if i > start {
490 out.Write(line[start:i])
491 }
492
493 if i >= len(line) {
494 break
495 }
496
497 for {
498 out.WriteByte(' ')
499 column++
500 if column%TAB_SIZE == 0 {
501 break
502 }
503 }
504
505 i++
506 }
507 out.WriteByte('\n')
508}