markdown.go (view raw)
1//
2// Black Friday Markdown Processor
3// Originally based on http://github.com/tanoku/upskirt
4// by Russ Ross <russ@russross.com>
5//
6
7//
8//
9// Markdown parsing and processing
10//
11//
12
13package blackfriday
14
15import (
16 "bytes"
17 "sort"
18 "unicode"
19)
20
21// These are the supported markdown parsing extensions.
22// OR these values together to select multiple extensions.
23const (
24 EXTENSION_NO_INTRA_EMPHASIS = 1 << iota
25 EXTENSION_TABLES
26 EXTENSION_FENCED_CODE
27 EXTENSION_AUTOLINK
28 EXTENSION_STRIKETHROUGH
29 EXTENSION_LAX_HTML_BLOCKS
30 EXTENSION_SPACE_HEADERS
31)
32
33// These are the possible flag values for the link renderer.
34// Only a single one of these values will be used; they are not ORed together.
35// These are mostly of interest if you are writing a new output format.
36const (
37 LINK_TYPE_NOT_AUTOLINK = iota
38 LINK_TYPE_NORMAL
39 LINK_TYPE_EMAIL
40)
41
42// These are the possible flag values for the listitem renderer.
43// Multiple flag values may be ORed together.
44// These are mostly of interest if you are writing a new output format.
45const (
46 LIST_TYPE_ORDERED = 1 << iota
47 LIST_ITEM_CONTAINS_BLOCK
48 LIST_ITEM_END_OF_LIST
49)
50
51// These are the possible flag values for the table cell renderer.
52// Only a single one of these values will be used; they are not ORed together.
53// These are mostly of interest if you are writing a new output format.
54const (
55 TABLE_ALIGNMENT_LEFT = 1 << iota
56 TABLE_ALIGNMENT_RIGHT
57 TABLE_ALIGNMENT_CENTER = (TABLE_ALIGNMENT_LEFT | TABLE_ALIGNMENT_RIGHT)
58)
59
60// The size of a tab stop.
61const TAB_SIZE = 4
62
63// These are the tags that are recognized as HTML block tags.
64// Any of these can be included in markdown text without special escaping.
65var block_tags = map[string]bool{
66 "p": true,
67 "dl": true,
68 "h1": true,
69 "h2": true,
70 "h3": true,
71 "h4": true,
72 "h5": true,
73 "h6": true,
74 "ol": true,
75 "ul": true,
76 "del": true,
77 "div": true,
78 "ins": true,
79 "pre": true,
80 "form": true,
81 "math": true,
82 "table": true,
83 "iframe": true,
84 "script": true,
85 "fieldset": true,
86 "noscript": true,
87 "blockquote": true,
88}
89
90// This struct defines the rendering interface.
91// A series of callback functions are registered to form a complete renderer.
92// A single interface{} value field is provided, and that value is handed to
93// each callback. Leaving a field blank suppresses rendering that type of output
94// except where noted.
95//
96// This is mostly of interest if you are implementing a new rendering format.
97// Most users will use the convenience functions to fill in this structure.
98type Renderer struct {
99 // block-level callbacks---nil skips the block
100 blockcode func(ob *bytes.Buffer, text []byte, lang string, opaque interface{})
101 blockquote func(ob *bytes.Buffer, text []byte, opaque interface{})
102 blockhtml func(ob *bytes.Buffer, text []byte, opaque interface{})
103 header func(ob *bytes.Buffer, text []byte, level int, opaque interface{})
104 hrule func(ob *bytes.Buffer, opaque interface{})
105 list func(ob *bytes.Buffer, text []byte, flags int, opaque interface{})
106 listitem func(ob *bytes.Buffer, text []byte, flags int, opaque interface{})
107 paragraph func(ob *bytes.Buffer, text []byte, opaque interface{})
108 table func(ob *bytes.Buffer, header []byte, body []byte, opaque interface{})
109 table_row func(ob *bytes.Buffer, text []byte, opaque interface{})
110 table_cell func(ob *bytes.Buffer, text []byte, flags int, opaque interface{})
111
112 // span-level callbacks---nil or return 0 prints the span verbatim
113 autolink func(ob *bytes.Buffer, link []byte, kind int, opaque interface{}) int
114 codespan func(ob *bytes.Buffer, text []byte, opaque interface{}) int
115 double_emphasis func(ob *bytes.Buffer, text []byte, opaque interface{}) int
116 emphasis func(ob *bytes.Buffer, text []byte, opaque interface{}) int
117 image func(ob *bytes.Buffer, link []byte, title []byte, alt []byte, opaque interface{}) int
118 linebreak func(ob *bytes.Buffer, opaque interface{}) int
119 link func(ob *bytes.Buffer, link []byte, title []byte, content []byte, opaque interface{}) int
120 raw_html_tag func(ob *bytes.Buffer, tag []byte, opaque interface{}) int
121 triple_emphasis func(ob *bytes.Buffer, text []byte, opaque interface{}) int
122 strikethrough func(ob *bytes.Buffer, text []byte, opaque interface{}) int
123
124 // low-level callbacks---nil copies input directly into the output
125 entity func(ob *bytes.Buffer, entity []byte, opaque interface{})
126 normal_text func(ob *bytes.Buffer, text []byte, opaque interface{})
127
128 // header and footer
129 doc_header func(ob *bytes.Buffer, opaque interface{})
130 doc_footer func(ob *bytes.Buffer, opaque interface{})
131
132 // user data---passed back to every callback
133 opaque interface{}
134}
135
136//
137
138type render struct {
139 mk *Renderer
140 refs link_ref_array
141 active_char [256]int
142 ext_flags uint32
143 nesting int
144 max_nesting int
145}
146
147const (
148 MD_CHAR_NONE = iota
149 MD_CHAR_EMPHASIS
150 MD_CHAR_CODESPAN
151 MD_CHAR_LINEBREAK
152 MD_CHAR_LINK
153 MD_CHAR_LANGLE
154 MD_CHAR_ESCAPE
155 MD_CHAR_ENTITITY
156 MD_CHAR_AUTOLINK
157)
158
159
160//
161//
162// Public interface
163//
164//
165
166// Parse and render a block of markdown-encoded text.
167// The renderer is used to format the output, and extensions dictates which
168// non-standard extensions are enabled.
169func Markdown(output *bytes.Buffer, input []byte, renderer *Renderer, extensions uint32) {
170 // no point in parsing if we can't render
171 if renderer == nil {
172 return
173 }
174
175 // fill in the character-level parsers
176 markdown_char_ptrs[MD_CHAR_NONE] = nil
177 markdown_char_ptrs[MD_CHAR_EMPHASIS] = char_emphasis
178 markdown_char_ptrs[MD_CHAR_CODESPAN] = char_codespan
179 markdown_char_ptrs[MD_CHAR_LINEBREAK] = char_linebreak
180 markdown_char_ptrs[MD_CHAR_LINK] = char_link
181 markdown_char_ptrs[MD_CHAR_LANGLE] = char_langle_tag
182 markdown_char_ptrs[MD_CHAR_ESCAPE] = char_escape
183 markdown_char_ptrs[MD_CHAR_ENTITITY] = char_entity
184 markdown_char_ptrs[MD_CHAR_AUTOLINK] = char_autolink
185
186 // fill in the render structure
187 rndr := new(render)
188 rndr.mk = renderer
189 rndr.ext_flags = extensions
190 rndr.max_nesting = 16
191
192 if rndr.mk.emphasis != nil || rndr.mk.double_emphasis != nil || rndr.mk.triple_emphasis != nil {
193 rndr.active_char['*'] = MD_CHAR_EMPHASIS
194 rndr.active_char['_'] = MD_CHAR_EMPHASIS
195 if extensions&EXTENSION_STRIKETHROUGH != 0 {
196 rndr.active_char['~'] = MD_CHAR_EMPHASIS
197 }
198 }
199 if rndr.mk.codespan != nil {
200 rndr.active_char['`'] = MD_CHAR_CODESPAN
201 }
202 if rndr.mk.linebreak != nil {
203 rndr.active_char['\n'] = MD_CHAR_LINEBREAK
204 }
205 if rndr.mk.image != nil || rndr.mk.link != nil {
206 rndr.active_char['['] = MD_CHAR_LINK
207 }
208 rndr.active_char['<'] = MD_CHAR_LANGLE
209 rndr.active_char['\\'] = MD_CHAR_ESCAPE
210 rndr.active_char['&'] = MD_CHAR_ENTITITY
211
212 if extensions&EXTENSION_AUTOLINK != 0 {
213 rndr.active_char['h'] = MD_CHAR_AUTOLINK // http, https
214 rndr.active_char['H'] = MD_CHAR_AUTOLINK
215
216 rndr.active_char['f'] = MD_CHAR_AUTOLINK // ftp
217 rndr.active_char['F'] = MD_CHAR_AUTOLINK
218
219 rndr.active_char['m'] = MD_CHAR_AUTOLINK // mailto
220 rndr.active_char['M'] = MD_CHAR_AUTOLINK
221 }
222
223 // first pass: look for references, copy everything else
224 text := bytes.NewBuffer(nil)
225 beg, end := 0, 0
226 for beg < len(input) { // iterate over lines
227 if end = is_ref(rndr, input[beg:]); end > 0 {
228 beg += end
229 } else { // skip to the next line
230 end = beg
231 for end < len(input) && input[end] != '\n' && input[end] != '\r' {
232 end++
233 }
234
235 // add the line body if present
236 if end > beg {
237 expand_tabs(text, input[beg:end])
238 }
239
240 for end < len(input) && (input[end] == '\n' || input[end] == '\r') {
241 // add one \n per newline
242 if input[end] == '\n' || (end+1 < len(input) && input[end+1] != '\n') {
243 text.WriteByte('\n')
244 }
245 end++
246 }
247
248 beg = end
249 }
250 }
251
252 // sort the reference array
253 if len(rndr.refs) > 1 {
254 sort.Sort(rndr.refs)
255 }
256
257 // second pass: actual rendering
258 if rndr.mk.doc_header != nil {
259 rndr.mk.doc_header(output, rndr.mk.opaque)
260 }
261
262 if text.Len() > 0 {
263 // add a final newline if not already present
264 finalchar := text.Bytes()[text.Len()-1]
265 if finalchar != '\n' && finalchar != '\r' {
266 text.WriteByte('\n')
267 }
268 parse_block(output, rndr, text.Bytes())
269 }
270
271 if rndr.mk.doc_footer != nil {
272 rndr.mk.doc_footer(output, rndr.mk.opaque)
273 }
274
275 if rndr.nesting != 0 {
276 panic("Nesting level did not end at zero")
277 }
278}
279
280
281//
282// Inline parsing
283// Functions to parse text within a block.
284//
285
286// closures to render active chars, each:
287// returns the number of chars taken care of
288// data is the complete block being rendered
289// offset is the number of valid chars before the data
290//
291// Note: this is filled in in Markdown to prevent an initilization loop
292var markdown_char_ptrs [9]func(ob *bytes.Buffer, rndr *render, data []byte, offset int) int
293
294func parse_inline(ob *bytes.Buffer, rndr *render, data []byte) {
295 if rndr.nesting >= rndr.max_nesting {
296 return
297 }
298 rndr.nesting++
299
300 i, end := 0, 0
301 for i < len(data) {
302 // copy inactive chars into the output
303 for end < len(data) && rndr.active_char[data[end]] == 0 {
304 end++
305 }
306
307 if rndr.mk.normal_text != nil {
308 rndr.mk.normal_text(ob, data[i:end], rndr.mk.opaque)
309 } else {
310 ob.Write(data[i:end])
311 }
312
313 if end >= len(data) {
314 break
315 }
316 i = end
317
318 // call the trigger
319 action := rndr.active_char[data[end]]
320 end = markdown_char_ptrs[action](ob, rndr, data, i)
321
322 if end == 0 { // no action from the callback
323 end = i + 1
324 } else {
325 i += end
326 end = i
327 }
328 }
329
330 rndr.nesting--
331}
332
333// single and double emphasis parsing
334func char_emphasis(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
335 data = data[offset:]
336 c := data[0]
337 ret := 0
338
339 if len(data) > 2 && data[1] != c {
340 // whitespace cannot follow an opening emphasis;
341 // strikethrough only takes two characters '~~'
342 if c == '~' || isspace(data[1]) {
343 return 0
344 }
345 if ret = parse_emph1(ob, rndr, data[1:], c); ret == 0 {
346 return 0
347 }
348
349 return ret + 1
350 }
351
352 if len(data) > 3 && data[1] == c && data[2] != c {
353 if isspace(data[2]) {
354 return 0
355 }
356 if ret = parse_emph2(ob, rndr, data[2:], c); ret == 0 {
357 return 0
358 }
359
360 return ret + 2
361 }
362
363 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
364 if c == '~' || isspace(data[3]) {
365 return 0
366 }
367 if ret = parse_emph3(ob, rndr, data, 3, c); ret == 0 {
368 return 0
369 }
370
371 return ret + 3
372 }
373
374 return 0
375}
376
377func char_codespan(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
378 data = data[offset:]
379
380 nb := 0
381
382 // count the number of backticks in the delimiter
383 for nb < len(data) && data[nb] == '`' {
384 nb++
385 }
386
387 // find the next delimiter
388 i, end := 0, 0
389 for end = nb; end < len(data) && i < nb; end++ {
390 if data[end] == '`' {
391 i++
392 } else {
393 i = 0
394 }
395 }
396
397 if i < nb && end >= len(data) {
398 return 0 // no matching delimiter
399 }
400
401 // trim outside whitespace
402 f_begin := nb
403 for f_begin < end && (data[f_begin] == ' ' || data[f_begin] == '\t') {
404 f_begin++
405 }
406
407 f_end := end - nb
408 for f_end > nb && (data[f_end-1] == ' ' || data[f_end-1] == '\t') {
409 f_end--
410 }
411
412 // real code span
413 if rndr.mk.codespan == nil {
414 return 0
415 }
416 if f_begin < f_end {
417 if rndr.mk.codespan(ob, data[f_begin:f_end], rndr.mk.opaque) == 0 {
418 end = 0
419 }
420 } else {
421 if rndr.mk.codespan(ob, nil, rndr.mk.opaque) == 0 {
422 end = 0
423 }
424 }
425
426 return end
427
428}
429
430// '\n' preceded by two spaces
431func char_linebreak(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
432 if offset < 2 || data[offset-1] != ' ' || data[offset-2] != ' ' {
433 return 0
434 }
435
436 // remove trailing spaces from ob and render
437 ob_bytes := ob.Bytes()
438 end := len(ob_bytes)
439 for end > 0 && ob_bytes[end-1] == ' ' {
440 end--
441 }
442 ob.Truncate(end)
443
444 if rndr.mk.linebreak == nil {
445 return 0
446 }
447 if rndr.mk.linebreak(ob, rndr.mk.opaque) > 0 {
448 return 1
449 } else {
450 return 0
451 }
452
453 return 0
454}
455
456// '[': parse a link or an image
457func char_link(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
458 is_img := offset > 0 && data[offset-1] == '!'
459
460 data = data[offset:]
461
462 i := 1
463 var title, link []byte
464 text_has_nl := false
465
466 // check whether the correct renderer exists
467 if (is_img && rndr.mk.image == nil) || (!is_img && rndr.mk.link == nil) {
468 return 0
469 }
470
471 // look for the matching closing bracket
472 for level := 1; level > 0 && i < len(data); i++ {
473 switch {
474 case data[i] == '\n':
475 text_has_nl = true
476
477 case data[i-1] == '\\':
478 continue
479
480 case data[i] == '[':
481 level++
482
483 case data[i] == ']':
484 level--
485 if level <= 0 {
486 i-- // compensate for extra i++ in for loop
487 }
488 }
489 }
490
491 if i >= len(data) {
492 return 0
493 }
494
495 txt_e := i
496 i++
497
498 // skip any amount of whitespace or newline
499 // (this is much more lax than original markdown syntax)
500 for i < len(data) && isspace(data[i]) {
501 i++
502 }
503
504 // inline style link
505 switch {
506 case i < len(data) && data[i] == '(':
507 // skip initial whitespace
508 i++
509
510 for i < len(data) && isspace(data[i]) {
511 i++
512 }
513
514 link_b := i
515
516 // look for link end: ' " )
517 for i < len(data) {
518 if data[i] == '\\' {
519 i += 2
520 } else {
521 if data[i] == ')' || data[i] == '\'' || data[i] == '"' {
522 break
523 }
524 i++
525 }
526 }
527
528 if i >= len(data) {
529 return 0
530 }
531 link_e := i
532
533 // look for title end if present
534 title_b, title_e := 0, 0
535 if data[i] == '\'' || data[i] == '"' {
536 i++
537 title_b = i
538
539 for i < len(data) {
540 if data[i] == '\\' {
541 i += 2
542 } else {
543 if data[i] == ')' {
544 break
545 }
546 i++
547 }
548 }
549
550 if i >= len(data) {
551 return 0
552 }
553
554 // skip whitespace after title
555 title_e = i - 1
556 for title_e > title_b && isspace(data[title_e]) {
557 title_e--
558 }
559
560 // check for closing quote presence
561 if data[title_e] != '\'' && data[title_e] != '"' {
562 title_b, title_e = 0, 0
563 link_e = i
564 }
565 }
566
567 // remove whitespace at the end of the link
568 for link_e > link_b && isspace(data[link_e-1]) {
569 link_e--
570 }
571
572 // remove optional angle brackets around the link
573 if data[link_b] == '<' {
574 link_b++
575 }
576 if data[link_e-1] == '>' {
577 link_e--
578 }
579
580 // build escaped link and title
581 if link_e > link_b {
582 link = data[link_b:link_e]
583 }
584
585 if title_e > title_b {
586 title = data[title_b:title_e]
587 }
588
589 i++
590
591 // reference style link
592 case i < len(data) && data[i] == '[':
593 var id []byte
594
595 // look for the id
596 i++
597 link_b := i
598 for i < len(data) && data[i] != ']' {
599 i++
600 }
601 if i >= len(data) {
602 return 0
603 }
604 link_e := i
605
606 // find the link_ref
607 if link_b == link_e {
608 if text_has_nl {
609 b := bytes.NewBuffer(nil)
610
611 for j := 1; j < txt_e; j++ {
612 switch {
613 case data[j] != '\n':
614 b.WriteByte(data[j])
615 case data[j-1] != ' ':
616 b.WriteByte(' ')
617 }
618 }
619
620 id = b.Bytes()
621 } else {
622 id = data[1:txt_e]
623 }
624 } else {
625 id = data[link_b:link_e]
626 }
627
628 // find the link_ref with matching id
629 index := sortDotSearch(len(rndr.refs), func(i int) bool {
630 return !byteslice_less(rndr.refs[i].id, id)
631 })
632 if index >= len(rndr.refs) || !bytes.Equal(rndr.refs[index].id, id) {
633 return 0
634 }
635 lr := rndr.refs[index]
636
637 // keep link and title from link_ref
638 link = lr.link
639 title = lr.title
640 i++
641
642 // shortcut reference style link
643 default:
644 var id []byte
645
646 // craft the id
647 if text_has_nl {
648 b := bytes.NewBuffer(nil)
649
650 for j := 1; j < txt_e; j++ {
651 switch {
652 case data[j] != '\n':
653 b.WriteByte(data[j])
654 case data[j-1] != ' ':
655 b.WriteByte(' ')
656 }
657 }
658
659 id = b.Bytes()
660 } else {
661 id = data[1:txt_e]
662 }
663
664 // find the link_ref with matching id
665 index := sortDotSearch(len(rndr.refs), func(i int) bool {
666 return !byteslice_less(rndr.refs[i].id, id)
667 })
668 if index >= len(rndr.refs) || !bytes.Equal(rndr.refs[index].id, id) {
669 return 0
670 }
671 lr := rndr.refs[index]
672
673 // keep link and title from link_ref
674 link = lr.link
675 title = lr.title
676
677 // rewind the whitespace
678 i = txt_e + 1
679 }
680
681 // build content: img alt is escaped, link content is parsed
682 content := bytes.NewBuffer(nil)
683 if txt_e > 1 {
684 if is_img {
685 content.Write(data[1:txt_e])
686 } else {
687 parse_inline(content, rndr, data[1:txt_e])
688 }
689 }
690
691 var u_link []byte
692 if len(link) > 0 {
693 u_link_buf := bytes.NewBuffer(nil)
694 unescape_text(u_link_buf, link)
695 u_link = u_link_buf.Bytes()
696 }
697
698 // call the relevant rendering function
699 ret := 0
700 if is_img {
701 ob_size := ob.Len()
702 ob_bytes := ob.Bytes()
703 if ob_size > 0 && ob_bytes[ob_size-1] == '!' {
704 ob.Truncate(ob_size - 1)
705 }
706
707 ret = rndr.mk.image(ob, u_link, title, content.Bytes(), rndr.mk.opaque)
708 } else {
709 ret = rndr.mk.link(ob, u_link, title, content.Bytes(), rndr.mk.opaque)
710 }
711
712 if ret > 0 {
713 return i
714 }
715 return 0
716}
717
718// '<' when tags or autolinks are allowed
719func char_langle_tag(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
720 data = data[offset:]
721 altype := LINK_TYPE_NOT_AUTOLINK
722 end := tag_length(data, &altype)
723 ret := 0
724
725 if end > 2 {
726 switch {
727 case rndr.mk.autolink != nil && altype != LINK_TYPE_NOT_AUTOLINK:
728 u_link := bytes.NewBuffer(nil)
729 unescape_text(u_link, data[1:end+1-2])
730 ret = rndr.mk.autolink(ob, u_link.Bytes(), altype, rndr.mk.opaque)
731 case rndr.mk.raw_html_tag != nil:
732 ret = rndr.mk.raw_html_tag(ob, data[:end], rndr.mk.opaque)
733 }
734 }
735
736 if ret == 0 {
737 return 0
738 }
739 return end
740}
741
742// '\\' backslash escape
743var escape_chars = []byte("\\`*_{}[]()#+-.!:|&<>")
744
745func char_escape(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
746 data = data[offset:]
747
748 if len(data) > 1 {
749 if bytes.IndexByte(escape_chars, data[1]) < 0 {
750 return 0
751 }
752
753 if rndr.mk.normal_text != nil {
754 rndr.mk.normal_text(ob, data[1:2], rndr.mk.opaque)
755 } else {
756 ob.WriteByte(data[1])
757 }
758 }
759
760 return 2
761}
762
763// '&' escaped when it doesn't belong to an entity
764// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
765func char_entity(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
766 data = data[offset:]
767
768 end := 1
769
770 if end < len(data) && data[end] == '#' {
771 end++
772 }
773
774 for end < len(data) && isalnum(data[end]) {
775 end++
776 }
777
778 if end < len(data) && data[end] == ';' {
779 end++ // real entity
780 } else {
781 return 0 // lone '&'
782 }
783
784 if rndr.mk.entity != nil {
785 rndr.mk.entity(ob, data[:end], rndr.mk.opaque)
786 } else {
787 ob.Write(data[:end])
788 }
789
790 return end
791}
792
793func char_autolink(ob *bytes.Buffer, rndr *render, data []byte, offset int) int {
794 orig_data := data
795 data = data[offset:]
796
797 if offset > 0 {
798 if !isspace(orig_data[offset-1]) && !ispunct(orig_data[offset-1]) {
799 return 0
800 }
801 }
802
803 if !is_safe_link(data) {
804 return 0
805 }
806
807 link_end := 0
808 for link_end < len(data) && !isspace(data[link_end]) {
809 link_end++
810 }
811
812 // Skip punctuation at the end of the link
813 if (data[link_end-1] == '.' || data[link_end-1] == ',' || data[link_end-1] == ';') && data[link_end-2] != '\\' {
814 link_end--
815 }
816
817 // See if the link finishes with a punctuation sign that can be closed.
818 var copen byte
819 switch data[link_end-1] {
820 case '"':
821 copen = '"'
822 case '\'':
823 copen = '\''
824 case ')':
825 copen = '('
826 case ']':
827 copen = '['
828 case '}':
829 copen = '{'
830 default:
831 copen = 0
832 }
833
834 if copen != 0 {
835 buf_end := offset + link_end - 2
836
837 open_delim := 1
838
839 /* Try to close the final punctuation sign in this same line;
840 * if we managed to close it outside of the URL, that means that it's
841 * not part of the URL. If it closes inside the URL, that means it
842 * is part of the URL.
843 *
844 * Examples:
845 *
846 * foo http://www.pokemon.com/Pikachu_(Electric) bar
847 * => http://www.pokemon.com/Pikachu_(Electric)
848 *
849 * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
850 * => http://www.pokemon.com/Pikachu_(Electric)
851 *
852 * foo http://www.pokemon.com/Pikachu_(Electric)) bar
853 * => http://www.pokemon.com/Pikachu_(Electric))
854 *
855 * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
856 * => foo http://www.pokemon.com/Pikachu_(Electric)
857 */
858
859 for buf_end >= 0 && orig_data[buf_end] != '\n' && open_delim != 0 {
860 if orig_data[buf_end] == data[link_end-1] {
861 open_delim++
862 }
863
864 if orig_data[buf_end] == copen {
865 open_delim--
866 }
867
868 buf_end--
869 }
870
871 if open_delim == 0 {
872 link_end--
873 }
874 }
875
876 if rndr.mk.autolink != nil {
877 u_link := bytes.NewBuffer(nil)
878 unescape_text(u_link, data[:link_end])
879
880 rndr.mk.autolink(ob, u_link.Bytes(), LINK_TYPE_NORMAL, rndr.mk.opaque)
881 }
882
883 return link_end
884}
885
886var valid_uris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
887
888func is_safe_link(link []byte) bool {
889 for _, prefix := range valid_uris {
890 if len(link) > len(prefix) && !byteslice_less(link[:len(prefix)], prefix) && !byteslice_less(prefix, link[:len(prefix)]) && isalnum(link[len(prefix)]) {
891 return true
892 }
893 }
894
895 return false
896}
897
898// return the length of the given tag, or 0 is it's not valid
899func tag_length(data []byte, autolink *int) int {
900 var i, j int
901
902 // a valid tag can't be shorter than 3 chars
903 if len(data) < 3 {
904 return 0
905 }
906
907 // begins with a '<' optionally followed by '/', followed by letter or number
908 if data[0] != '<' {
909 return 0
910 }
911 if data[1] == '/' {
912 i = 2
913 } else {
914 i = 1
915 }
916
917 if !isalnum(data[i]) {
918 return 0
919 }
920
921 // scheme test
922 *autolink = LINK_TYPE_NOT_AUTOLINK
923
924 // try to find the beggining of an URI
925 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
926 i++
927 }
928
929 if i > 1 && data[i] == '@' {
930 if j = is_mail_autolink(data[i:]); j != 0 {
931 *autolink = LINK_TYPE_EMAIL
932 return i + j
933 }
934 }
935
936 if i > 2 && data[i] == ':' {
937 *autolink = LINK_TYPE_NORMAL
938 i++
939 }
940
941 // complete autolink test: no whitespace or ' or "
942 switch {
943 case i >= len(data):
944 *autolink = LINK_TYPE_NOT_AUTOLINK
945 case *autolink != 0:
946 j = i
947
948 for i < len(data) {
949 if data[i] == '\\' {
950 i += 2
951 } else {
952 if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
953 break
954 } else {
955 i++
956 }
957 }
958
959 }
960
961 if i >= len(data) {
962 return 0
963 }
964 if i > j && data[i] == '>' {
965 return i + 1
966 }
967
968 // one of the forbidden chars has been found
969 *autolink = LINK_TYPE_NOT_AUTOLINK
970 }
971
972 // look for something looking like a tag end
973 for i < len(data) && data[i] != '>' {
974 i++
975 }
976 if i >= len(data) {
977 return 0
978 }
979 return i + 1
980}
981
982// look for the address part of a mail autolink and '>'
983// this is less strict than the original markdown e-mail address matching
984func is_mail_autolink(data []byte) int {
985 nb := 0
986
987 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
988 for i := 0; i < len(data); i++ {
989 if isalnum(data[i]) {
990 continue
991 }
992
993 switch data[i] {
994 case '@':
995 nb++
996
997 case '-', '.', '_':
998 break
999
1000 case '>':
1001 if nb == 1 {
1002 return i + 1
1003 } else {
1004 return 0
1005 }
1006 default:
1007 return 0
1008 }
1009 }
1010
1011 return 0
1012}
1013
1014// look for the next emph char, skipping other constructs
1015func find_emph_char(data []byte, c byte) int {
1016 i := 1
1017
1018 for i < len(data) {
1019 for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
1020 i++
1021 }
1022 if i >= len(data) {
1023 return 0
1024 }
1025 if data[i] == c {
1026 return i
1027 }
1028
1029 // do not count escaped chars
1030 if i != 0 && data[i-1] == '\\' {
1031 i++
1032 continue
1033 }
1034
1035 if data[i] == '`' {
1036 // skip a code span
1037 tmp_i := 0
1038 i++
1039 for i < len(data) && data[i] != '`' {
1040 if tmp_i == 0 && data[i] == c {
1041 tmp_i = i
1042 }
1043 i++
1044 }
1045 if i >= len(data) {
1046 return tmp_i
1047 }
1048 i++
1049 } else {
1050 if data[i] == '[' {
1051 // skip a link
1052 tmp_i := 0
1053 i++
1054 for i < len(data) && data[i] != ']' {
1055 if tmp_i == 0 && data[i] == c {
1056 tmp_i = i
1057 }
1058 i++
1059 }
1060 i++
1061 for i < len(data) && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n') {
1062 i++
1063 }
1064 if i >= len(data) {
1065 return tmp_i
1066 }
1067 if data[i] != '[' && data[i] != '(' { // not a link
1068 if tmp_i > 0 {
1069 return tmp_i
1070 } else {
1071 continue
1072 }
1073 }
1074 cc := data[i]
1075 i++
1076 for i < len(data) && data[i] != cc {
1077 if tmp_i == 0 && data[i] == c {
1078 tmp_i = i
1079 }
1080 i++
1081 }
1082 if i >= len(data) {
1083 return tmp_i
1084 }
1085 i++
1086 }
1087 }
1088 }
1089 return 0
1090}
1091
1092func parse_emph1(ob *bytes.Buffer, rndr *render, data []byte, c byte) int {
1093 i := 0
1094
1095 if rndr.mk.emphasis == nil {
1096 return 0
1097 }
1098
1099 // skip one symbol if coming from emph3
1100 if len(data) > 1 && data[0] == c && data[1] == c {
1101 i = 1
1102 }
1103
1104 for i < len(data) {
1105 length := find_emph_char(data[i:], c)
1106 if length == 0 {
1107 return 0
1108 }
1109 i += length
1110 if i >= len(data) {
1111 return 0
1112 }
1113
1114 if i+1 < len(data) && data[i+1] == c {
1115 i++
1116 continue
1117 }
1118
1119 if data[i] == c && !isspace(data[i-1]) {
1120
1121 if rndr.ext_flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
1122 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
1123 continue
1124 }
1125 }
1126
1127 work := bytes.NewBuffer(nil)
1128 parse_inline(work, rndr, data[:i])
1129 r := rndr.mk.emphasis(ob, work.Bytes(), rndr.mk.opaque)
1130 if r > 0 {
1131 return i + 1
1132 } else {
1133 return 0
1134 }
1135 }
1136 }
1137
1138 return 0
1139}
1140
1141func parse_emph2(ob *bytes.Buffer, rndr *render, data []byte, c byte) int {
1142 render_method := rndr.mk.double_emphasis
1143 if c == '~' {
1144 render_method = rndr.mk.strikethrough
1145 }
1146
1147 if render_method == nil {
1148 return 0
1149 }
1150
1151 i := 0
1152
1153 for i < len(data) {
1154 length := find_emph_char(data[i:], c)
1155 if length == 0 {
1156 return 0
1157 }
1158 i += length
1159
1160 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
1161 work := bytes.NewBuffer(nil)
1162 parse_inline(work, rndr, data[:i])
1163 r := render_method(ob, work.Bytes(), rndr.mk.opaque)
1164 if r > 0 {
1165 return i + 2
1166 } else {
1167 return 0
1168 }
1169 }
1170 i++
1171 }
1172 return 0
1173}
1174
1175func parse_emph3(ob *bytes.Buffer, rndr *render, data []byte, offset int, c byte) int {
1176 i := 0
1177 orig_data := data
1178 data = data[offset:]
1179
1180 for i < len(data) {
1181 length := find_emph_char(data[i:], c)
1182 if length == 0 {
1183 return 0
1184 }
1185 i += length
1186
1187 // skip whitespace preceded symbols
1188 if data[i] != c || isspace(data[i-1]) {
1189 continue
1190 }
1191
1192 switch {
1193 case (i+2 < len(data) && data[i+1] == c && data[i+2] == c && rndr.mk.triple_emphasis != nil):
1194 // triple symbol found
1195 work := bytes.NewBuffer(nil)
1196
1197 parse_inline(work, rndr, data[:i])
1198 r := rndr.mk.triple_emphasis(ob, work.Bytes(), rndr.mk.opaque)
1199 if r > 0 {
1200 return i + 3
1201 } else {
1202 return 0
1203 }
1204 case (i+1 < len(data) && data[i+1] == c):
1205 // double symbol found, hand over to emph1
1206 length = parse_emph1(ob, rndr, orig_data[offset-2:], c)
1207 if length == 0 {
1208 return 0
1209 } else {
1210 return length - 2
1211 }
1212 default:
1213 // single symbol found, hand over to emph2
1214 length = parse_emph2(ob, rndr, orig_data[offset-1:], c)
1215 if length == 0 {
1216 return 0
1217 } else {
1218 return length - 1
1219 }
1220 }
1221 }
1222 return 0
1223}
1224
1225
1226//
1227// Block parsing
1228// Functions to parse block-level elements.
1229//
1230
1231// parse block-level data
1232func parse_block(ob *bytes.Buffer, rndr *render, data []byte) {
1233 if rndr.nesting >= rndr.max_nesting {
1234 return
1235 }
1236 rndr.nesting++
1237
1238 for len(data) > 0 {
1239 if is_atxheader(rndr, data) {
1240 data = data[parse_atxheader(ob, rndr, data):]
1241 continue
1242 }
1243 if data[0] == '<' && rndr.mk.blockhtml != nil {
1244 if i := parse_htmlblock(ob, rndr, data, true); i > 0 {
1245 data = data[i:]
1246 continue
1247 }
1248 }
1249 if i := is_empty(data); i > 0 {
1250 data = data[i:]
1251 continue
1252 }
1253 if is_hrule(data) {
1254 if rndr.mk.hrule != nil {
1255 rndr.mk.hrule(ob, rndr.mk.opaque)
1256 }
1257 var i int
1258 for i = 0; i < len(data) && data[i] != '\n'; i++ {
1259 }
1260 data = data[i:]
1261 continue
1262 }
1263 if rndr.ext_flags&EXTENSION_FENCED_CODE != 0 {
1264 if i := parse_fencedcode(ob, rndr, data); i > 0 {
1265 data = data[i:]
1266 continue
1267 }
1268 }
1269 if rndr.ext_flags&EXTENSION_TABLES != 0 {
1270 if i := parse_table(ob, rndr, data); i > 0 {
1271 data = data[i:]
1272 continue
1273 }
1274 }
1275 if prefix_quote(data) > 0 {
1276 data = data[parse_blockquote(ob, rndr, data):]
1277 continue
1278 }
1279 if prefix_code(data) > 0 {
1280 data = data[parse_blockcode(ob, rndr, data):]
1281 continue
1282 }
1283 if prefix_uli(data) > 0 {
1284 data = data[parse_list(ob, rndr, data, 0):]
1285 continue
1286 }
1287 if prefix_oli(data) > 0 {
1288 data = data[parse_list(ob, rndr, data, LIST_TYPE_ORDERED):]
1289 continue
1290 }
1291
1292 data = data[parse_paragraph(ob, rndr, data):]
1293 }
1294
1295 rndr.nesting--
1296}
1297
1298func is_atxheader(rndr *render, data []byte) bool {
1299 if data[0] != '#' {
1300 return false
1301 }
1302
1303 if rndr.ext_flags&EXTENSION_SPACE_HEADERS != 0 {
1304 level := 0
1305 for level < len(data) && level < 6 && data[level] == '#' {
1306 level++
1307 }
1308 if level < len(data) && data[level] != ' ' && data[level] != '\t' {
1309 return false
1310 }
1311 }
1312 return true
1313}
1314
1315func parse_atxheader(ob *bytes.Buffer, rndr *render, data []byte) int {
1316 level := 0
1317 for level < len(data) && level < 6 && data[level] == '#' {
1318 level++
1319 }
1320 i, end := 0, 0
1321 for i = level; i < len(data) && (data[i] == ' ' || data[i] == '\t'); i++ {
1322 }
1323 for end = i; end < len(data) && data[end] != '\n'; end++ {
1324 }
1325 skip := end
1326 for end > 0 && data[end-1] == '#' {
1327 end--
1328 }
1329 for end > 0 && (data[end-1] == ' ' || data[end-1] == '\t') {
1330 end--
1331 }
1332 if end > i {
1333 work := bytes.NewBuffer(nil)
1334 parse_inline(work, rndr, data[i:end])
1335 if rndr.mk.header != nil {
1336 rndr.mk.header(ob, work.Bytes(), level, rndr.mk.opaque)
1337 }
1338 }
1339 return skip
1340}
1341
1342func is_headerline(data []byte) int {
1343 i := 0
1344
1345 // test of level 1 header
1346 if data[i] == '=' {
1347 for i = 1; i < len(data) && data[i] == '='; i++ {
1348 }
1349 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
1350 i++
1351 }
1352 if i >= len(data) || data[i] == '\n' {
1353 return 1
1354 } else {
1355 return 0
1356 }
1357 }
1358
1359 // test of level 2 header
1360 if data[i] == '-' {
1361 for i = 1; i < len(data) && data[i] == '-'; i++ {
1362 }
1363 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
1364 i++
1365 }
1366 if i >= len(data) || data[i] == '\n' {
1367 return 2
1368 } else {
1369 return 0
1370 }
1371 }
1372
1373 return 0
1374}
1375
1376func parse_htmlblock(ob *bytes.Buffer, rndr *render, data []byte, do_render bool) int {
1377 var i, j int
1378
1379 // identify the opening tag
1380 if len(data) < 2 || data[0] != '<' {
1381 return 0
1382 }
1383 curtag, tagfound := find_block_tag(data[1:])
1384
1385 // handle special cases
1386 if !tagfound {
1387
1388 // HTML comment, laxist form
1389 if len(data) > 5 && data[1] == '!' && data[2] == '-' && data[3] == '-' {
1390 i = 5
1391
1392 for i < len(data) && !(data[i-2] == '-' && data[i-1] == '-' && data[i] == '>') {
1393 i++
1394 }
1395 i++
1396
1397 if i < len(data) {
1398 j = is_empty(data[i:])
1399 }
1400
1401 if j > 0 {
1402 size := i + j
1403 if do_render && rndr.mk.blockhtml != nil {
1404 rndr.mk.blockhtml(ob, data[:size], rndr.mk.opaque)
1405 }
1406 return size
1407 }
1408 }
1409
1410 // HR, which is the only self-closing block tag considered
1411 if len(data) > 4 && (data[1] == 'h' || data[1] == 'H') && (data[2] == 'r' || data[2] == 'R') {
1412 i = 3
1413 for i < len(data) && data[i] != '>' {
1414 i++
1415 }
1416
1417 if i+1 < len(data) {
1418 i++
1419 j = is_empty(data[i:])
1420 if j > 0 {
1421 size := i + j
1422 if do_render && rndr.mk.blockhtml != nil {
1423 rndr.mk.blockhtml(ob, data[:size], rndr.mk.opaque)
1424 }
1425 return size
1426 }
1427 }
1428 }
1429
1430 // no special case recognized
1431 return 0
1432 }
1433
1434 // look for an unindented matching closing tag
1435 // followed by a blank line
1436 i = 1
1437 found := false
1438
1439 // if not found, try a second pass looking for indented match
1440 // but not if tag is "ins" or "del" (following original Markdown.pl)
1441 if curtag != "ins" && curtag != "del" {
1442 i = 1
1443 for i < len(data) {
1444 i++
1445 for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
1446 i++
1447 }
1448
1449 if i+2+len(curtag) >= len(data) {
1450 break
1451 }
1452
1453 j = htmlblock_end(curtag, rndr, data[i-1:])
1454
1455 if j > 0 {
1456 i += j - 1
1457 found = true
1458 break
1459 }
1460 }
1461 }
1462
1463 if !found {
1464 return 0
1465 }
1466
1467 // the end of the block has been found
1468 if do_render && rndr.mk.blockhtml != nil {
1469 rndr.mk.blockhtml(ob, data[:i], rndr.mk.opaque)
1470 }
1471
1472 return i
1473}
1474
1475func find_block_tag(data []byte) (string, bool) {
1476 i := 0
1477 for i < len(data) && ((data[i] >= '0' && data[i] <= '9') || (data[i] >= 'A' && data[i] <= 'Z') || (data[i] >= 'a' && data[i] <= 'z')) {
1478 i++
1479 }
1480 if i >= len(data) {
1481 return "", false
1482 }
1483 key := string(data[:i])
1484 if block_tags[key] {
1485 return key, true
1486 }
1487 return "", false
1488}
1489
1490func htmlblock_end(tag string, rndr *render, data []byte) int {
1491 // assume data[0] == '<' && data[1] == '/' already tested
1492
1493 // check if tag is a match
1494 if len(tag)+3 >= len(data) || bytes.Compare(data[2:2+len(tag)], []byte(tag)) != 0 || data[len(tag)+2] != '>' {
1495 return 0
1496 }
1497
1498 // check white lines
1499 i := len(tag) + 3
1500 w := 0
1501 if i < len(data) {
1502 if w = is_empty(data[i:]); w == 0 {
1503 return 0 // non-blank after tag
1504 }
1505 }
1506 i += w
1507 w = 0
1508
1509 if rndr.ext_flags&EXTENSION_LAX_HTML_BLOCKS != 0 {
1510 if i < len(data) {
1511 w = is_empty(data[i:])
1512 }
1513 } else {
1514 if i < len(data) {
1515 if w = is_empty(data[i:]); w == 0 {
1516 return 0 // non-blank line after tag line
1517 }
1518 }
1519 }
1520
1521 return i + w
1522}
1523
1524func is_empty(data []byte) int {
1525 var i int
1526 for i = 0; i < len(data) && data[i] != '\n'; i++ {
1527 if data[i] != ' ' && data[i] != '\t' {
1528 return 0
1529 }
1530 }
1531 return i + 1
1532}
1533
1534func is_hrule(data []byte) bool {
1535 // skip initial spaces
1536 if len(data) < 3 {
1537 return false
1538 }
1539 i := 0
1540 if data[0] == ' ' {
1541 i++
1542 if data[1] == ' ' {
1543 i++
1544 if data[2] == ' ' {
1545 i++
1546 }
1547 }
1548 }
1549
1550 // look at the hrule char
1551 if i+2 >= len(data) || (data[i] != '*' && data[i] != '-' && data[i] != '_') {
1552 return false
1553 }
1554 c := data[i]
1555
1556 // the whole line must be the char or whitespace
1557 n := 0
1558 for i < len(data) && data[i] != '\n' {
1559 switch {
1560 case data[i] == c:
1561 n++
1562 case data[i] != ' ' && data[i] != '\t':
1563 return false
1564 }
1565 i++
1566 }
1567
1568 return n >= 3
1569}
1570
1571func is_codefence(data []byte, syntax **string) int {
1572 i, n := 0, 0
1573
1574 // skip initial spaces
1575 if len(data) < 3 {
1576 return 0
1577 }
1578 if data[0] == ' ' {
1579 i++
1580 if data[1] == ' ' {
1581 i++
1582 if data[2] == ' ' {
1583 i++
1584 }
1585 }
1586 }
1587
1588 // look at the hrule char
1589 if i+2 >= len(data) || !(data[i] == '~' || data[i] == '`') {
1590 return 0
1591 }
1592
1593 c := data[i]
1594
1595 // the whole line must be the char or whitespace
1596 for i < len(data) && data[i] == c {
1597 n++
1598 i++
1599 }
1600
1601 if n < 3 {
1602 return 0
1603 }
1604
1605 if syntax != nil {
1606 syn := 0
1607
1608 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
1609 i++
1610 }
1611
1612 syntax_start := i
1613
1614 if i < len(data) && data[i] == '{' {
1615 i++
1616 syntax_start++
1617
1618 for i < len(data) && data[i] != '}' && data[i] != '\n' {
1619 syn++
1620 i++
1621 }
1622
1623 if i == len(data) || data[i] != '}' {
1624 return 0
1625 }
1626
1627 // string all whitespace at the beginning and the end
1628 // of the {} block
1629 for syn > 0 && isspace(data[syntax_start]) {
1630 syntax_start++
1631 syn--
1632 }
1633
1634 for syn > 0 && isspace(data[syntax_start+syn-1]) {
1635 syn--
1636 }
1637
1638 i++
1639 } else {
1640 for i < len(data) && !isspace(data[i]) {
1641 syn++
1642 i++
1643 }
1644 }
1645
1646 language := string(data[syntax_start : syntax_start+syn])
1647 *syntax = &language
1648 }
1649
1650 for i < len(data) && data[i] != '\n' {
1651 if !isspace(data[i]) {
1652 return 0
1653 }
1654 i++
1655 }
1656
1657 return i + 1
1658}
1659
1660func parse_fencedcode(ob *bytes.Buffer, rndr *render, data []byte) int {
1661 var lang *string
1662 beg := is_codefence(data, &lang)
1663 if beg == 0 {
1664 return 0
1665 }
1666
1667 work := bytes.NewBuffer(nil)
1668
1669 for beg < len(data) {
1670 fence_end := is_codefence(data[beg:], nil)
1671 if fence_end != 0 {
1672 beg += fence_end
1673 break
1674 }
1675
1676 var end int
1677 for end = beg + 1; end < len(data) && data[end-1] != '\n'; end++ {
1678 }
1679
1680 if beg < end {
1681 // verbatim copy to the working buffer, escaping entities
1682 if is_empty(data[beg:]) > 0 {
1683 work.WriteByte('\n')
1684 } else {
1685 work.Write(data[beg:end])
1686 }
1687 }
1688 beg = end
1689 }
1690
1691 if work.Len() > 0 && work.Bytes()[work.Len()-1] != '\n' {
1692 work.WriteByte('\n')
1693 }
1694
1695 if rndr.mk.blockcode != nil {
1696 syntax := ""
1697 if lang != nil {
1698 syntax = *lang
1699 }
1700
1701 rndr.mk.blockcode(ob, work.Bytes(), syntax, rndr.mk.opaque)
1702 }
1703
1704 return beg
1705}
1706
1707func parse_table(ob *bytes.Buffer, rndr *render, data []byte) int {
1708 header_work := bytes.NewBuffer(nil)
1709 i, columns, col_data := parse_table_header(header_work, rndr, data)
1710 if i > 0 {
1711 body_work := bytes.NewBuffer(nil)
1712
1713 for i < len(data) {
1714 pipes, row_start := 0, i
1715 for ; i < len(data) && data[i] != '\n'; i++ {
1716 if data[i] == '|' {
1717 pipes++
1718 }
1719 }
1720
1721 if pipes == 0 || i == len(data) {
1722 i = row_start
1723 break
1724 }
1725
1726 parse_table_row(body_work, rndr, data[row_start:i], columns, col_data)
1727 i++
1728 }
1729
1730 if rndr.mk.table != nil {
1731 rndr.mk.table(ob, header_work.Bytes(), body_work.Bytes(), rndr.mk.opaque)
1732 }
1733 }
1734
1735 return i
1736}
1737
1738func parse_table_header(ob *bytes.Buffer, rndr *render, data []byte) (size int, columns int, column_data []int) {
1739 i, pipes := 0, 0
1740 column_data = []int{}
1741 for i = 0; i < len(data) && data[i] != '\n'; i++ {
1742 if data[i] == '|' {
1743 pipes++
1744 }
1745 }
1746
1747 if i == len(data) || pipes == 0 {
1748 return 0, 0, column_data
1749 }
1750
1751 header_end := i
1752
1753 if data[0] == '|' {
1754 pipes--
1755 }
1756
1757 if i > 2 && data[i-1] == '|' {
1758 pipes--
1759 }
1760
1761 columns = pipes + 1
1762 column_data = make([]int, columns)
1763
1764 // parse the header underline
1765 i++
1766 if i < len(data) && data[i] == '|' {
1767 i++
1768 }
1769
1770 under_end := i
1771 for under_end < len(data) && data[under_end] != '\n' {
1772 under_end++
1773 }
1774
1775 col := 0
1776 for ; col < columns && i < under_end; col++ {
1777 dashes := 0
1778
1779 for i < under_end && (data[i] == ' ' || data[i] == '\t') {
1780 i++
1781 }
1782
1783 if data[i] == ':' {
1784 i++
1785 column_data[col] |= TABLE_ALIGNMENT_LEFT
1786 dashes++
1787 }
1788
1789 for i < under_end && data[i] == '-' {
1790 i++
1791 dashes++
1792 }
1793
1794 if i < under_end && data[i] == ':' {
1795 i++
1796 column_data[col] |= TABLE_ALIGNMENT_RIGHT
1797 dashes++
1798 }
1799
1800 for i < under_end && (data[i] == ' ' || data[i] == '\t') {
1801 i++
1802 }
1803
1804 if i < under_end && data[i] != '|' {
1805 break
1806 }
1807
1808 if dashes < 3 {
1809 break
1810 }
1811
1812 i++
1813 }
1814
1815 if col < columns {
1816 return 0, 0, column_data
1817 }
1818
1819 parse_table_row(ob, rndr, data[:header_end], columns, column_data)
1820 size = under_end + 1
1821 return
1822}
1823
1824func parse_table_row(ob *bytes.Buffer, rndr *render, data []byte, columns int, col_data []int) {
1825 i, col := 0, 0
1826 row_work := bytes.NewBuffer(nil)
1827
1828 if i < len(data) && data[i] == '|' {
1829 i++
1830 }
1831
1832 for col = 0; col < columns && i < len(data); col++ {
1833 for i < len(data) && isspace(data[i]) {
1834 i++
1835 }
1836
1837 cell_start := i
1838
1839 for i < len(data) && data[i] != '|' {
1840 i++
1841 }
1842
1843 cell_end := i - 1
1844
1845 for cell_end > cell_start && isspace(data[cell_end]) {
1846 cell_end--
1847 }
1848
1849 cell_work := bytes.NewBuffer(nil)
1850 parse_inline(cell_work, rndr, data[cell_start:cell_end+1])
1851
1852 if rndr.mk.table_cell != nil {
1853 cdata := 0
1854 if col < len(col_data) {
1855 cdata = col_data[col]
1856 }
1857 rndr.mk.table_cell(row_work, cell_work.Bytes(), cdata, rndr.mk.opaque)
1858 }
1859
1860 i++
1861 }
1862
1863 for ; col < columns; col++ {
1864 empty_cell := []byte{}
1865 if rndr.mk.table_cell != nil {
1866 cdata := 0
1867 if col < len(col_data) {
1868 cdata = col_data[col]
1869 }
1870 rndr.mk.table_cell(row_work, empty_cell, cdata, rndr.mk.opaque)
1871 }
1872 }
1873
1874 if rndr.mk.table_row != nil {
1875 rndr.mk.table_row(ob, row_work.Bytes(), rndr.mk.opaque)
1876 }
1877}
1878
1879// returns blockquote prefix length
1880func prefix_quote(data []byte) int {
1881 i := 0
1882 for i < len(data) && i < 3 && data[i] == ' ' {
1883 i++
1884 }
1885 if i < len(data) && data[i] == '>' {
1886 if i+1 < len(data) && (data[i+1] == ' ' || data[i+1] == '\t') {
1887 return i + 2
1888 }
1889 return i + 1
1890 }
1891 return 0
1892}
1893
1894// parse a blockquote fragment
1895func parse_blockquote(ob *bytes.Buffer, rndr *render, data []byte) int {
1896 out := bytes.NewBuffer(nil)
1897 work := bytes.NewBuffer(nil)
1898 beg, end := 0, 0
1899 for beg < len(data) {
1900 for end = beg + 1; end < len(data) && data[end-1] != '\n'; end++ {
1901 }
1902
1903 if pre := prefix_quote(data[beg:]); pre > 0 {
1904 beg += pre // skip prefix
1905 } else {
1906 // empty line followed by non-quote line
1907 if is_empty(data[beg:]) > 0 && (end >= len(data) || (prefix_quote(data[end:]) == 0 && is_empty(data[end:]) == 0)) {
1908 break
1909 }
1910 }
1911
1912 if beg < end { // copy into the in-place working buffer
1913 work.Write(data[beg:end])
1914 }
1915 beg = end
1916 }
1917
1918 parse_block(out, rndr, work.Bytes())
1919 if rndr.mk.blockquote != nil {
1920 rndr.mk.blockquote(ob, out.Bytes(), rndr.mk.opaque)
1921 }
1922 return end
1923}
1924
1925// returns prefix length for block code
1926func prefix_code(data []byte) int {
1927 if len(data) > 0 && data[0] == '\t' {
1928 return 1
1929 }
1930 if len(data) > 3 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1931 return 4
1932 }
1933 return 0
1934}
1935
1936func parse_blockcode(ob *bytes.Buffer, rndr *render, data []byte) int {
1937 work := bytes.NewBuffer(nil)
1938
1939 beg, end := 0, 0
1940 for beg < len(data) {
1941 for end = beg + 1; end < len(data) && data[end-1] != '\n'; end++ {
1942 }
1943
1944 if pre := prefix_code(data[beg:end]); pre > 0 {
1945 beg += pre
1946 } else {
1947 if is_empty(data[beg:end]) == 0 {
1948 // non-empty non-prefixed line breaks the pre
1949 break
1950 }
1951 }
1952
1953 if beg < end {
1954 // verbatim copy to the working buffer, escaping entities
1955 if is_empty(data[beg:end]) > 0 {
1956 work.WriteByte('\n')
1957 } else {
1958 work.Write(data[beg:end])
1959 }
1960 }
1961 beg = end
1962 }
1963
1964 // trim all the \n off the end of work
1965 workbytes := work.Bytes()
1966 n := 0
1967 for len(workbytes) > n && workbytes[len(workbytes)-n-1] == '\n' {
1968 n++
1969 }
1970 if n > 0 {
1971 work = bytes.NewBuffer(workbytes[:len(workbytes)-n])
1972 }
1973
1974 work.WriteByte('\n')
1975
1976 if rndr.mk.blockcode != nil {
1977 rndr.mk.blockcode(ob, work.Bytes(), "", rndr.mk.opaque)
1978 }
1979
1980 return beg
1981}
1982
1983// returns unordered list item prefix
1984func prefix_uli(data []byte) int {
1985 i := 0
1986 for i < len(data) && i < 3 && data[i] == ' ' {
1987 i++
1988 }
1989 if i+1 >= len(data) || (data[i] != '*' && data[i] != '+' && data[i] != '-') || (data[i+1] != ' ' && data[i+1] != '\t') {
1990 return 0
1991 }
1992 return i + 2
1993}
1994
1995// returns ordered list item prefix
1996func prefix_oli(data []byte) int {
1997 i := 0
1998 for i < len(data) && i < 3 && data[i] == ' ' {
1999 i++
2000 }
2001 if i >= len(data) || data[i] < '0' || data[i] > '9' {
2002 return 0
2003 }
2004 for i < len(data) && data[i] >= '0' && data[i] <= '9' {
2005 i++
2006 }
2007 if i+1 >= len(data) || data[i] != '.' || (data[i+1] != ' ' && data[i+1] != '\t') {
2008 return 0
2009 }
2010 return i + 2
2011}
2012
2013// parse ordered or unordered list block
2014func parse_list(ob *bytes.Buffer, rndr *render, data []byte, flags int) int {
2015 work := bytes.NewBuffer(nil)
2016
2017 i, j := 0, 0
2018 for i < len(data) {
2019 j = parse_listitem(work, rndr, data[i:], &flags)
2020 i += j
2021
2022 if j == 0 || flags&LIST_ITEM_END_OF_LIST != 0 {
2023 break
2024 }
2025 }
2026
2027 if rndr.mk.list != nil {
2028 rndr.mk.list(ob, work.Bytes(), flags, rndr.mk.opaque)
2029 }
2030 return i
2031}
2032
2033// parse a single list item
2034// assumes initial prefix is already removed
2035func parse_listitem(ob *bytes.Buffer, rndr *render, data []byte, flags *int) int {
2036 // keep track of the first indentation prefix
2037 beg, end, pre, sublist, orgpre, i := 0, 0, 0, 0, 0, 0
2038
2039 for orgpre < 3 && orgpre < len(data) && data[orgpre] == ' ' {
2040 orgpre++
2041 }
2042
2043 beg = prefix_uli(data)
2044 if beg == 0 {
2045 beg = prefix_oli(data)
2046 }
2047 if beg == 0 {
2048 return 0
2049 }
2050
2051 // skip leading whitespace on first line
2052 for beg < len(data) && data[beg] == ' ' {
2053 beg++
2054 }
2055
2056 // skip to the beginning of the following line
2057 end = beg
2058 for end < len(data) && data[end-1] != '\n' {
2059 end++
2060 }
2061
2062 // get working buffers
2063 work := bytes.NewBuffer(nil)
2064 inter := bytes.NewBuffer(nil)
2065
2066 // put the first line into the working buffer
2067 work.Write(data[beg:end])
2068 beg = end
2069
2070 // process the following lines
2071 in_empty, has_inside_empty := false, false
2072 for beg < len(data) {
2073 end++
2074
2075 for end < len(data) && data[end-1] != '\n' {
2076 end++
2077 }
2078
2079 // process an empty line
2080 if is_empty(data[beg:end]) > 0 {
2081 in_empty = true
2082 beg = end
2083 continue
2084 }
2085
2086 // calculate the indentation
2087 i = 0
2088 for i < 4 && beg+i < end && data[beg+i] == ' ' {
2089 i++
2090 }
2091
2092 pre = i
2093 if data[beg] == '\t' {
2094 i = 1
2095 pre = 8
2096 }
2097
2098 // check for a new item
2099 chunk := data[beg+i : end]
2100 if (prefix_uli(chunk) > 0 && !is_hrule(chunk)) || prefix_oli(chunk) > 0 {
2101 if in_empty {
2102 has_inside_empty = true
2103 }
2104
2105 if pre == orgpre { // the following item must have the same indentation
2106 break
2107 }
2108
2109 if sublist == 0 {
2110 sublist = work.Len()
2111 }
2112 } else {
2113 // only join indented stuff after empty lines
2114 if in_empty && i < 4 && data[beg] != '\t' {
2115 *flags |= LIST_ITEM_END_OF_LIST
2116 break
2117 } else {
2118 if in_empty {
2119 work.WriteByte('\n')
2120 has_inside_empty = true
2121 }
2122 }
2123 }
2124
2125 in_empty = false
2126
2127 // add the line into the working buffer without prefix
2128 work.Write(data[beg+i : end])
2129 beg = end
2130 }
2131
2132 // render li contents
2133 if has_inside_empty {
2134 *flags |= LIST_ITEM_CONTAINS_BLOCK
2135 }
2136
2137 workbytes := work.Bytes()
2138 if *flags&LIST_ITEM_CONTAINS_BLOCK != 0 {
2139 // intermediate render of block li
2140 if sublist > 0 && sublist < len(workbytes) {
2141 parse_block(inter, rndr, workbytes[:sublist])
2142 parse_block(inter, rndr, workbytes[sublist:])
2143 } else {
2144 parse_block(inter, rndr, workbytes)
2145 }
2146 } else {
2147 // intermediate render of inline li
2148 if sublist > 0 && sublist < len(workbytes) {
2149 parse_inline(inter, rndr, workbytes[:sublist])
2150 parse_block(inter, rndr, workbytes[sublist:])
2151 } else {
2152 parse_inline(inter, rndr, workbytes)
2153 }
2154 }
2155
2156 // render li itself
2157 if rndr.mk.listitem != nil {
2158 rndr.mk.listitem(ob, inter.Bytes(), *flags, rndr.mk.opaque)
2159 }
2160
2161 return beg
2162}
2163
2164func parse_paragraph(ob *bytes.Buffer, rndr *render, data []byte) int {
2165 i, end, level := 0, 0, 0
2166
2167 for i < len(data) {
2168 for end = i + 1; end < len(data) && data[end-1] != '\n'; end++ {
2169 }
2170
2171 if is_empty(data[i:]) > 0 {
2172 break
2173 }
2174 if level = is_headerline(data[i:]); level > 0 {
2175 break
2176 }
2177
2178 if rndr.ext_flags&EXTENSION_LAX_HTML_BLOCKS != 0 {
2179 if data[i] == '<' && rndr.mk.blockhtml != nil && parse_htmlblock(ob, rndr, data[i:], false) > 0 {
2180 end = i
2181 break
2182 }
2183 }
2184
2185 if is_atxheader(rndr, data[i:]) || is_hrule(data[i:]) {
2186 end = i
2187 break
2188 }
2189
2190 i = end
2191 }
2192
2193 work := data
2194 size := i
2195 for size > 0 && work[size-1] == '\n' {
2196 size--
2197 }
2198
2199 if level == 0 {
2200 tmp := bytes.NewBuffer(nil)
2201 parse_inline(tmp, rndr, work[:size])
2202 if rndr.mk.paragraph != nil {
2203 rndr.mk.paragraph(ob, tmp.Bytes(), rndr.mk.opaque)
2204 }
2205 } else {
2206 if size > 0 {
2207 beg := 0
2208 i = size
2209 size--
2210
2211 for size > 0 && work[size] != '\n' {
2212 size--
2213 }
2214
2215 beg = size + 1
2216 for size > 0 && work[size-1] == '\n' {
2217 size--
2218 }
2219
2220 if size > 0 {
2221 tmp := bytes.NewBuffer(nil)
2222 parse_inline(tmp, rndr, work[:size])
2223 if rndr.mk.paragraph != nil {
2224 rndr.mk.paragraph(ob, tmp.Bytes(), rndr.mk.opaque)
2225 }
2226
2227 work = work[beg:]
2228 size = i - beg
2229 } else {
2230 size = i
2231 }
2232 }
2233
2234 header_work := bytes.NewBuffer(nil)
2235 parse_inline(header_work, rndr, work[:size])
2236
2237 if rndr.mk.header != nil {
2238 rndr.mk.header(ob, header_work.Bytes(), level, rndr.mk.opaque)
2239 }
2240 }
2241
2242 return end
2243}
2244
2245
2246//
2247// Link references
2248//
2249// This section implements support for references that (usually) appear
2250// as footnotes in a document, and can be referenced anywhere in the document.
2251// The basic format is:
2252//
2253// [1]: http://www.google.com/ "Google"
2254// [2]: http://www.github.com/ "Github"
2255//
2256// Anywhere in the document, the reference can be linked by referring to its
2257// label, i.e., 1 and 2 in this example, as in:
2258//
2259// This library is hosted on [Github][2], a git hosting site.
2260
2261// References are parsed and stored in this struct.
2262type link_ref struct {
2263 id []byte
2264 link []byte
2265 title []byte
2266}
2267
2268// The list of all reference links is stored in this type.
2269type link_ref_array []*link_ref
2270
2271// Find the length of a list of references.
2272// This implements an interface needed for sorting.
2273func (elt link_ref_array) Len() int {
2274 return len(elt)
2275}
2276
2277// Test if one reference is less than another (case-insensitive).
2278// This implements an interface needed for sorting.
2279func (elt link_ref_array) Less(i, j int) bool {
2280 return byteslice_less(elt[i].id, elt[j].id)
2281}
2282
2283// Compare two []byte values (case-insensitive), returning
2284// true if a is less than b.
2285func byteslice_less(a []byte, b []byte) bool {
2286 // adapted from bytes.Compare in stdlib
2287 m := len(a)
2288 if m > len(b) {
2289 m = len(b)
2290 }
2291 for i, ac := range a[0:m] {
2292 // do a case-insensitive comparison
2293 ai, bi := unicode.ToLower(int(ac)), unicode.ToLower(int(b[i]))
2294 switch {
2295 case ai > bi:
2296 return false
2297 case ai < bi:
2298 return true
2299 }
2300 }
2301 switch {
2302 case len(a) < len(b):
2303 return true
2304 case len(a) > len(b):
2305 return false
2306 }
2307 return false
2308}
2309
2310// Swap two references.
2311// This implements an interface needed for sorting.
2312func (elt link_ref_array) Swap(i, j int) {
2313 elt[i], elt[j] = elt[j], elt[i]
2314}
2315
2316// Check whether or not data starts with a reference link.
2317// If so, it is parsed and stored in the list of references
2318// (in the render struct).
2319// Returns the number of bytes to skip to move past it, or zero
2320// if there is the first line is not a reference.
2321func is_ref(rndr *render, data []byte) int {
2322 // up to 3 optional leading spaces
2323 if len(data) < 4 {
2324 return 0
2325 }
2326 i := 0
2327 for i < 3 && data[i] == ' ' {
2328 i++
2329 }
2330 if data[i] == ' ' {
2331 return 0
2332 }
2333
2334 // id part: anything but a newline between brackets
2335 if data[i] != '[' {
2336 return 0
2337 }
2338 i++
2339 id_offset := i
2340 for i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != ']' {
2341 i++
2342 }
2343 if i >= len(data) || data[i] != ']' {
2344 return 0
2345 }
2346 id_end := i
2347
2348 // spacer: colon (space | tab)* newline? (space | tab)*
2349 i++
2350 if i >= len(data) || data[i] != ':' {
2351 return 0
2352 }
2353 i++
2354 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
2355 i++
2356 }
2357 if i < len(data) && (data[i] == '\n' || data[i] == '\r') {
2358 i++
2359 if i < len(data) && data[i] == '\n' && data[i-1] == '\r' {
2360 i++
2361 }
2362 }
2363 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
2364 i++
2365 }
2366 if i >= len(data) {
2367 return 0
2368 }
2369
2370 // link: whitespace-free sequence, optionally between angle brackets
2371 if data[i] == '<' {
2372 i++
2373 }
2374 link_offset := i
2375 for i < len(data) && data[i] != ' ' && data[i] != '\t' && data[i] != '\n' && data[i] != '\r' {
2376 i++
2377 }
2378 link_end := i
2379 if data[link_offset] == '<' && data[link_end-1] == '>' {
2380 link_offset++
2381 link_end--
2382 }
2383
2384 // optional spacer: (space | tab)* (newline | '\'' | '"' | '(' )
2385 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
2386 i++
2387 }
2388 if i < len(data) && data[i] != '\n' && data[i] != '\r' && data[i] != '\'' && data[i] != '"' && data[i] != '(' {
2389 return 0
2390 }
2391
2392 // compute end-of-line
2393 line_end := 0
2394 if i >= len(data) || data[i] == '\r' || data[i] == '\n' {
2395 line_end = i
2396 }
2397 if i+1 < len(data) && data[i] == '\r' && data[i+1] == '\n' {
2398 line_end++
2399 }
2400
2401 // optional (space|tab)* spacer after a newline
2402 if line_end > 0 {
2403 i = line_end + 1
2404 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
2405 i++
2406 }
2407 }
2408
2409 // optional title: any non-newline sequence enclosed in '"() alone on its line
2410 title_offset, title_end := 0, 0
2411 if i+1 < len(data) && (data[i] == '\'' || data[i] == '"' || data[i] == '(') {
2412 i++
2413 title_offset = i
2414
2415 // look for EOL
2416 for i < len(data) && data[i] != '\n' && data[i] != '\r' {
2417 i++
2418 }
2419 if i+1 < len(data) && data[i] == '\n' && data[i+1] == '\r' {
2420 title_end = i + 1
2421 } else {
2422 title_end = i
2423 }
2424
2425 // step back
2426 i--
2427 for i > title_offset && (data[i] == ' ' || data[i] == '\t') {
2428 i--
2429 }
2430 if i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') {
2431 line_end = title_end
2432 title_end = i
2433 }
2434 }
2435 if line_end == 0 { // garbage after the link
2436 return 0
2437 }
2438
2439 // a valid ref has been found
2440 if rndr == nil {
2441 return line_end
2442 }
2443 item := &link_ref{id: data[id_offset:id_end], link: data[link_offset:link_end], title: data[title_offset:title_end]}
2444 rndr.refs = append(rndr.refs, item)
2445
2446 return line_end
2447}
2448
2449
2450//
2451//
2452// Miscellaneous helper functions
2453//
2454//
2455
2456
2457// Test if a character is a punctuation symbol.
2458// Taken from a private function in regexp in the stdlib.
2459func ispunct(c byte) bool {
2460 for _, r := range []byte("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~") {
2461 if c == r {
2462 return true
2463 }
2464 }
2465 return false
2466}
2467
2468// this is sort.Search, reproduced here because an older
2469// version of the library had a bug
2470func sortDotSearch(n int, f func(int) bool) int {
2471 // Define f(-1) == false and f(n) == true.
2472 // Invariant: f(i-1) == false, f(j) == true.
2473 i, j := 0, n
2474 for i < j {
2475 h := i + (j-i)/2 // avoid overflow when computing h
2476 // i ≤ h < j
2477 if !f(h) {
2478 i = h + 1 // preserves f(i-1) == false
2479 } else {
2480 j = h // preserves f(j) == true
2481 }
2482 }
2483 // i == j, f(i-1) == false, and f(j) (= f(i)) == true => answer is i.
2484 return i
2485}
2486
2487// Test if a character is a whitespace character.
2488func isspace(c byte) bool {
2489 return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'
2490}
2491
2492// Test if a character is a letter or a digit.
2493func isalnum(c byte) bool {
2494 return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
2495}
2496
2497// Replace tab characters with spaces, aligning to the next TAB_SIZE column.
2498func expand_tabs(ob *bytes.Buffer, line []byte) {
2499 i, tab := 0, 0
2500
2501 for i < len(line) {
2502 org := i
2503 for i < len(line) && line[i] != '\t' {
2504 i++
2505 tab++
2506 }
2507
2508 if i > org {
2509 ob.Write(line[org:i])
2510 }
2511
2512 if i >= len(line) {
2513 break
2514 }
2515
2516 for {
2517 ob.WriteByte(' ')
2518 tab++
2519 if tab%TAB_SIZE == 0 {
2520 break
2521 }
2522 }
2523
2524 i++
2525 }
2526}