block.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse block-level elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18 "html"
19 "regexp"
20
21 "github.com/shurcooL/sanitized_anchor_name"
22)
23
const (
	// charEntity is a regular expression fragment matching an HTML character
	// reference: hexadecimal (&#x26;), decimal (&#38;) or named (&amp;).
	charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
	// escapable is a regular expression character class listing the ASCII
	// punctuation characters that may follow a backslash escape.
	escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
)

var (
	// reBackslashOrAmp is a cheap pre-check that reports whether a string
	// contains a backslash or an ampersand, i.e. whether it can contain
	// anything that reEntityOrEscapedChar would match.
	//
	// The pattern must be the class [\\&]. The former spelling "[\\&]"
	// compiled to the class [\&], in which the backslash merely escapes the
	// ampersand, so strings containing only backslash escapes were never
	// detected.
	reBackslashOrAmp = regexp.MustCompile(`[\\&]`)
	// reEntityOrEscapedChar matches either a backslash-escaped punctuation
	// character or an HTML character reference (case-insensitively).
	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity)
)
33
// block parses block-level data and adds the blocks it recognizes to the
// parser through the per-construct helpers below.
//
// On every iteration the constructs are tried in a fixed order (prefixed
// header, HTML block, title block, blank line, indented code, fenced code,
// horizontal rule, block quote, table, lists) and the first helper that
// matches consumes its bytes; anything left over is handed to paragraph.
//
// Note: this function and many that it calls assume that
// the input buffer ends with a newline.
func (p *parser) block(data []byte) {
	// this is called recursively: enforce a maximum depth
	if p.nesting >= p.maxNesting {
		return
	}
	p.nesting++

	// parse out one block-level construct at a time
	for len(data) > 0 {
		// prefixed header:
		//
		//     # Header 1
		//     ## Header 2
		//     ...
		//     ###### Header 6
		if p.isPrefixHeader(data) {
			data = data[p.prefixHeader(data):]
			continue
		}

		// block of preformatted HTML:
		//
		//     <div>
		//         ...
		//     </div>
		if data[0] == '<' {
			if i := p.html(data, true); i > 0 {
				data = data[i:]
				continue
			}
		}

		// title block (only when the Titleblock extension is enabled):
		//
		//     % stuff
		//     % more stuff
		//     % even more stuff
		if p.flags&Titleblock != 0 {
			if data[0] == '%' {
				if i := p.titleBlock(data, true); i > 0 {
					data = data[i:]
					continue
				}
			}
		}

		// blank lines. note: returns the # of bytes to skip
		if i := p.isEmpty(data); i > 0 {
			data = data[i:]
			continue
		}

		// indented code block:
		//
		//     func max(a, b int) int {
		//         if a > b {
		//             return a
		//         }
		//         return b
		//     }
		if p.codePrefix(data) > 0 {
			data = data[p.code(data):]
			continue
		}

		// fenced code block (only when the FencedCode extension is enabled):
		//
		//     ``` go
		//     func fact(n int) int {
		//         if n <= 1 {
		//             return n
		//         }
		//         return n * fact(n-1)
		//     }
		//     ```
		if p.flags&FencedCode != 0 {
			if i := p.fencedCodeBlock(data, true); i > 0 {
				data = data[i:]
				continue
			}
		}

		// horizontal rule:
		//
		//     ------
		// or
		//     ******
		// or
		//     ______
		if p.isHRule(data) {
			p.addBlock(HorizontalRule, nil)
			// skip to the end of the rule's line; the newline itself is left
			// in data and is consumed by the blank-line check on the next pass
			var i int
			for i = 0; i < len(data) && data[i] != '\n'; i++ {
			}
			data = data[i:]
			continue
		}

		// block quote:
		//
		//     > A big quote I found somewhere
		//     > on the web
		if p.quotePrefix(data) > 0 {
			data = data[p.quote(data):]
			continue
		}

		// table (only when the Tables extension is enabled):
		//
		//     Name  | Age | Phone
		//     ------|-----|---------
		//     Bob   | 31  | 555-1234
		//     Alice | 27  | 555-4321
		if p.flags&Tables != 0 {
			if i := p.table(data); i > 0 {
				data = data[i:]
				continue
			}
		}

		// an itemized/unordered list:
		//
		//     * Item 1
		//     * Item 2
		//
		// also works with + or -
		if p.uliPrefix(data) > 0 {
			data = data[p.list(data, 0):]
			continue
		}

		// a numbered/ordered list:
		//
		//     1. Item 1
		//     2. Item 2
		if p.oliPrefix(data) > 0 {
			data = data[p.list(data, ListTypeOrdered):]
			continue
		}

		// definition lists (only when the DefinitionLists extension is
		// enabled):
		//
		//     Term 1
		//     :   Definition a
		//     :   Definition b
		//
		//     Term 2
		//     :   Definition c
		if p.flags&DefinitionLists != 0 {
			if p.dliPrefix(data) > 0 {
				data = data[p.list(data, ListTypeDefinition):]
				continue
			}
		}

		// anything else must look like a normal paragraph
		// note: this finds underlined headers, too
		data = data[p.paragraph(data):]
	}

	// leaving this recursion level
	p.nesting--
}
199
200func (p *parser) addBlock(typ NodeType, content []byte) *Node {
201 p.closeUnmatchedBlocks()
202 container := p.addChild(typ, 0)
203 container.content = content
204 return container
205}
206
207func (p *parser) isPrefixHeader(data []byte) bool {
208 if data[0] != '#' {
209 return false
210 }
211
212 if p.flags&SpaceHeaders != 0 {
213 level := 0
214 for level < 6 && level < len(data) && data[level] == '#' {
215 level++
216 }
217 if level == len(data) || data[level] != ' ' {
218 return false
219 }
220 }
221 return true
222}
223
// prefixHeader parses the prefixed ("# ...") header at the start of data,
// adds a Header block for it when the header text is non-empty, and returns
// the number of bytes to skip.
//
// The header level is the number of leading '#' characters (at most six).
// With the HeaderIDs flag, a trailing "{#id}" is extracted as the header ID;
// with AutoHeaderIDs, a missing ID is derived from the header text.
// NOTE(review): the returned offset presumably stops at the line's newline
// (it comes from skipUntilChar, defined elsewhere) — confirm.
func (p *parser) prefixHeader(data []byte) int {
	// count the leading '#' characters to get the level
	level := 0
	for level < 6 && level < len(data) && data[level] == '#' {
		level++
	}
	// i: start of the header text; end: end of the header text
	i := skipChar(data, level, ' ')
	end := skipUntilChar(data, i, '\n')
	skip := end
	id := ""
	if p.flags&HeaderIDs != 0 {
		j, k := 0, 0
		// find start/end of header id
		for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
		}
		for k = j + 1; k < end && data[k] != '}'; k++ {
		}
		// extract header id iff found
		if j < end && k < end {
			id = string(data[j+2 : k])
			// the header text ends where the "{#" starts
			end = j
			skip = k + 1
			for end > 0 && data[end-1] == ' ' {
				end--
			}
		}
	}
	// strip the optional closing run of '#', unless it is backslash-escaped
	for end > 0 && data[end-1] == '#' {
		if isBackslashEscaped(data, end-1) {
			break
		}
		end--
	}
	// strip spaces before the closing '#' run
	for end > 0 && data[end-1] == ' ' {
		end--
	}
	// only emit a block when some header text remains
	if end > i {
		if id == "" && p.flags&AutoHeaderIDs != 0 {
			id = sanitized_anchor_name.Create(string(data[i:end]))
		}
		block := p.addBlock(Header, data[i:end])
		block.HeaderID = id
		block.Level = level
	}
	return skip
}
269
270func (p *parser) isUnderlinedHeader(data []byte) int {
271 // test of level 1 header
272 if data[0] == '=' {
273 i := skipChar(data, 1, '=')
274 i = skipChar(data, i, ' ')
275 if i < len(data) && data[i] == '\n' {
276 return 1
277 }
278 return 0
279 }
280
281 // test of level 2 header
282 if data[0] == '-' {
283 i := skipChar(data, 1, '-')
284 i = skipChar(data, i, ' ')
285 if i < len(data) && data[i] == '\n' {
286 return 2
287 }
288 return 0
289 }
290
291 return 0
292}
293
294func (p *parser) titleBlock(data []byte, doRender bool) int {
295 if data[0] != '%' {
296 return 0
297 }
298 splitData := bytes.Split(data, []byte("\n"))
299 var i int
300 for idx, b := range splitData {
301 if !bytes.HasPrefix(b, []byte("%")) {
302 i = idx // - 1
303 break
304 }
305 }
306
307 data = bytes.Join(splitData[0:i], []byte("\n"))
308 consumed := len(data)
309 data = bytes.TrimPrefix(data, []byte("% "))
310 data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
311 block := p.addBlock(Header, data)
312 block.Level = 1
313 block.IsTitleblock = true
314
315 return consumed
316}
317
// html tries to parse a block of raw HTML at the start of data and returns
// the number of bytes consumed, or 0 if no HTML block is recognized.
//
// data must start with '<'. If the opening tag is not a known block tag,
// only HTML comments and <hr> tags are considered. Otherwise the function
// scans forward for a closing tag accepted by htmlFindEnd. When doRender is
// true, an HTMLBlock is added with trailing newlines trimmed.
func (p *parser) html(data []byte, doRender bool) int {
	var i, j int

	// identify the opening tag
	if data[0] != '<' {
		return 0
	}
	curtag, tagfound := p.htmlFindTag(data[1:])

	// handle special cases
	if !tagfound {
		// check for an HTML comment
		if size := p.htmlComment(data, doRender); size > 0 {
			return size
		}

		// check for an <hr> tag
		if size := p.htmlHr(data, doRender); size > 0 {
			return size
		}

		// no special case recognized
		return 0
	}

	// look for an unindented matching closing tag
	// followed by a blank line
	// NOTE: this first pass is currently disabled (commented out below), so
	// found is always false when the second pass is reached.
	found := false
	/*
		closetag := []byte("\n</" + curtag + ">")
		j = len(curtag) + 1
		for !found {
			// scan for a closing tag at the beginning of a line
			if skip := bytes.Index(data[j:], closetag); skip >= 0 {
				j += skip + len(closetag)
			} else {
				break
			}

			// see if it is the only thing on the line
			if skip := p.isEmpty(data[j:]); skip > 0 {
				// see if it is followed by a blank line/eof
				j += skip
				if j >= len(data) {
					found = true
					i = j
				} else {
					if skip := p.isEmpty(data[j:]); skip > 0 {
						j += skip
						found = true
						i = j
					}
				}
			}
		}
	*/

	// if not found, try a second pass looking for indented match
	// but not if tag is "ins" or "del" (following original Markdown.pl)
	if !found && curtag != "ins" && curtag != "del" {
		i = 1
		for i < len(data) {
			i++
			// advance until data[i-1:i+1] is "</"
			for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
				i++
			}

			// not enough room left for "</" + tag + ">"
			if i+2+len(curtag) >= len(data) {
				break
			}

			// j is the length of the closing tag construct starting at i-1,
			// or 0 if this is not an acceptable closing tag
			j = p.htmlFindEnd(curtag, data[i-1:])

			if j > 0 {
				i += j - 1
				found = true
				break
			}
		}
	}

	if !found {
		return 0
	}

	// the end of the block has been found
	if doRender {
		// trim newlines
		end := i
		for end > 0 && data[end-1] == '\n' {
			end--
		}
		finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
	}

	return i
}
415
416func finalizeHTMLBlock(block *Node) {
417 block.Literal = block.content
418 block.content = nil
419}
420
421// HTML comment, lax form
422func (p *parser) htmlComment(data []byte, doRender bool) int {
423 i := p.inlineHTMLComment(data)
424 // needs to end with a blank line
425 if j := p.isEmpty(data[i:]); j > 0 {
426 size := i + j
427 if doRender {
428 // trim trailing newlines
429 end := size
430 for end > 0 && data[end-1] == '\n' {
431 end--
432 }
433 block := p.addBlock(HTMLBlock, data[:end])
434 finalizeHTMLBlock(block)
435 }
436 return size
437 }
438 return 0
439}
440
441// HR, which is the only self-closing block tag considered
442func (p *parser) htmlHr(data []byte, doRender bool) int {
443 if len(data) < 4 {
444 return 0
445 }
446 if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
447 return 0
448 }
449 if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
450 // not an <hr> tag after all; at least not a valid one
451 return 0
452 }
453 i := 3
454 for i < len(data) && data[i] != '>' && data[i] != '\n' {
455 i++
456 }
457 if i < len(data) && data[i] == '>' {
458 i++
459 if j := p.isEmpty(data[i:]); j > 0 {
460 size := i + j
461 if doRender {
462 // trim newlines
463 end := size
464 for end > 0 && data[end-1] == '\n' {
465 end--
466 }
467 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
468 }
469 return size
470 }
471 }
472 return 0
473}
474
475func (p *parser) htmlFindTag(data []byte) (string, bool) {
476 i := 0
477 for i < len(data) && isalnum(data[i]) {
478 i++
479 }
480 key := string(data[:i])
481 if _, ok := blockTags[key]; ok {
482 return key, true
483 }
484 return "", false
485}
486
487func (p *parser) htmlFindEnd(tag string, data []byte) int {
488 // assume data[0] == '<' && data[1] == '/' already tested
489 if tag == "hr" {
490 return 2
491 }
492 // check if tag is a match
493 closetag := []byte("</" + tag + ">")
494 if !bytes.HasPrefix(data, closetag) {
495 return 0
496 }
497 i := len(closetag)
498
499 // check that the rest of the line is blank
500 skip := 0
501 if skip = p.isEmpty(data[i:]); skip == 0 {
502 return 0
503 }
504 i += skip
505 skip = 0
506
507 if i >= len(data) {
508 return i
509 }
510
511 if p.flags&LaxHTMLBlocks != 0 {
512 return i
513 }
514 if skip = p.isEmpty(data[i:]); skip == 0 {
515 // following line must be blank
516 return 0
517 }
518
519 return i + skip
520}
521
522func (*parser) isEmpty(data []byte) int {
523 // it is okay to call isEmpty on an empty buffer
524 if len(data) == 0 {
525 return 0
526 }
527
528 var i int
529 for i = 0; i < len(data) && data[i] != '\n'; i++ {
530 if data[i] != ' ' && data[i] != '\t' {
531 return 0
532 }
533 }
534 if i < len(data) && data[i] == '\n' {
535 i++
536 }
537 return i
538}
539
540func (*parser) isHRule(data []byte) bool {
541 i := 0
542
543 // skip up to three spaces
544 for i < 3 && data[i] == ' ' {
545 i++
546 }
547
548 // look at the hrule char
549 if data[i] != '*' && data[i] != '-' && data[i] != '_' {
550 return false
551 }
552 c := data[i]
553
554 // the whole line must be the char or whitespace
555 n := 0
556 for i < len(data) && data[i] != '\n' {
557 switch {
558 case data[i] == c:
559 n++
560 case data[i] != ' ':
561 return false
562 }
563 i++
564 }
565
566 return n >= 3
567}
568
// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
// and returns the end index if so, or 0 otherwise. It also returns the marker found.
// If syntax is not nil, it gets set to the syntax specified in the fence line.
//
// A fence line is up to three spaces followed by a run of at least three '~'
// or '`' characters. If oldmarker is non-empty, the run must equal it exactly
// (used to match a closing fence against its opening fence). The returned
// end index includes the line's newline when one is present.
func isFenceLine(data []byte, syntax *string, oldmarker string) (end int, marker string) {
	i, size := 0, 0

	// skip up to three spaces
	for i < len(data) && i < 3 && data[i] == ' ' {
		i++
	}

	// check for the marker characters: ~ or `
	if i >= len(data) {
		return 0, ""
	}
	if data[i] != '~' && data[i] != '`' {
		return 0, ""
	}

	c := data[i]

	// count the run of marker characters
	for i < len(data) && data[i] == c {
		size++
		i++
	}

	// the marker char must occur at least 3 times
	if size < 3 {
		return 0, ""
	}
	marker = string(data[i-size : i])

	// if this is the end marker, it must match the beginning marker
	if oldmarker != "" && marker != oldmarker {
		return 0, ""
	}

	// TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
	// into one, always get the syntax, and discard it if the caller doesn't care.
	if syntax != nil {
		// syn: length of the syntax string starting at syntaxStart
		syn := 0
		i = skipChar(data, i, ' ')

		if i >= len(data) {
			// the fence is the last thing in the buffer
			if i == len(data) {
				return i, marker
			}
			return 0, ""
		}

		syntaxStart := i

		if data[i] == '{' {
			// syntax given in braces: take everything up to the closing '}'
			i++
			syntaxStart++

			for i < len(data) && data[i] != '}' && data[i] != '\n' {
				syn++
				i++
			}

			if i >= len(data) || data[i] != '}' {
				return 0, ""
			}

			// strip all whitespace at the beginning and the end
			// of the {} block
			for syn > 0 && isspace(data[syntaxStart]) {
				syntaxStart++
				syn--
			}

			for syn > 0 && isspace(data[syntaxStart+syn-1]) {
				syn--
			}

			i++
		} else {
			// bare syntax: take everything up to the next whitespace
			for i < len(data) && !isspace(data[i]) {
				syn++
				i++
			}
		}

		*syntax = string(data[syntaxStart : syntaxStart+syn])
	}

	// only spaces may follow, up to the newline or the end of the buffer
	i = skipChar(data, i, ' ')
	if i >= len(data) || data[i] != '\n' {
		if i == len(data) {
			return i, marker
		}
		return 0, ""
	}
	return i + 1, marker // Take newline into account.
}
666
667// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
668// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
669// If doRender is true, a final newline is mandatory to recognize the fenced code block.
670func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
671 var syntax string
672 beg, marker := isFenceLine(data, &syntax, "")
673 if beg == 0 || beg >= len(data) {
674 return 0
675 }
676
677 var work bytes.Buffer
678 work.Write([]byte(syntax))
679 work.WriteByte('\n')
680
681 for {
682 // safe to assume beg < len(data)
683
684 // check for the end of the code block
685 fenceEnd, _ := isFenceLine(data[beg:], nil, marker)
686 if fenceEnd != 0 {
687 beg += fenceEnd
688 break
689 }
690
691 // copy the current line
692 end := skipUntilChar(data, beg, '\n') + 1
693
694 // did we reach the end of the buffer without a closing marker?
695 if end >= len(data) {
696 return 0
697 }
698
699 // verbatim copy to the working buffer
700 if doRender {
701 work.Write(data[beg:end])
702 }
703 beg = end
704 }
705
706 if doRender {
707 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
708 block.IsFenced = true
709 finalizeCodeBlock(block)
710 }
711
712 return beg
713}
714
// unescapeChar converts one match of reEntityOrEscapedChar to its literal
// form: a backslash escape yields the escaped character, anything else is
// treated as an HTML entity and decoded. str must be non-empty, and at
// least two bytes long when it starts with a backslash.
func unescapeChar(str []byte) []byte {
	if str[0] != '\\' {
		return []byte(html.UnescapeString(string(str)))
	}
	return []byte{str[1]}
}
721
722func unescapeString(str []byte) []byte {
723 if reBackslashOrAmp.Match(str) {
724 return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
725 }
726 return str
727}
728
729func finalizeCodeBlock(block *Node) {
730 if block.IsFenced {
731 newlinePos := bytes.IndexByte(block.content, '\n')
732 firstLine := block.content[:newlinePos]
733 rest := block.content[newlinePos+1:]
734 block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
735 block.Literal = rest
736 } else {
737 block.Literal = block.content
738 }
739 block.content = nil
740}
741
742func (p *parser) table(data []byte) int {
743 table := p.addBlock(Table, nil)
744 i, columns := p.tableHeader(data)
745 if i == 0 {
746 p.tip = table.Parent
747 table.Unlink()
748 return 0
749 }
750
751 p.addBlock(TableBody, nil)
752
753 for i < len(data) {
754 pipes, rowStart := 0, i
755 for ; i < len(data) && data[i] != '\n'; i++ {
756 if data[i] == '|' {
757 pipes++
758 }
759 }
760
761 if pipes == 0 {
762 i = rowStart
763 break
764 }
765
766 // include the newline in data sent to tableRow
767 if i < len(data) && data[i] == '\n' {
768 i++
769 }
770 p.tableRow(data[rowStart:i], columns, false)
771 }
772
773 return i
774}
775
// isBackslashEscaped reports whether the byte at index i of data is
// immediately preceded by an odd number of backslashes.
func isBackslashEscaped(data []byte, i int) bool {
	count := 0
	for j := i - 1; j >= 0 && data[j] == '\\'; j-- {
		count++
	}
	return count%2 == 1
}
784
// tableHeader tries to parse a table header: a header row followed by an
// underline row of the form "---|:--:|--:". On success it adds a TableHead
// block with the header row and returns the number of bytes consumed (both
// lines) together with the per-column alignment flags. On failure it returns
// size 0; columns may still be non-nil in that case and must be ignored.
func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
	// count the unescaped pipes on the first line
	i := 0
	colCount := 1
	for i = 0; i < len(data) && data[i] != '\n'; i++ {
		if data[i] == '|' && !isBackslashEscaped(data, i) {
			colCount++
		}
	}

	// doesn't look like a table header
	if colCount == 1 {
		return
	}

	// include the newline in the data sent to tableRow
	j := i
	if j < len(data) && data[j] == '\n' {
		j++
	}
	header := data[:j]

	// column count ignores pipes at beginning or end of line
	if data[0] == '|' {
		colCount--
	}
	if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
		colCount--
	}

	columns = make([]CellAlignFlags, colCount)

	// move on to the header underline
	i++
	if i >= len(data) {
		return
	}

	// skip an optional leading pipe and spaces on the underline row
	if data[i] == '|' && !isBackslashEscaped(data, i) {
		i++
	}
	i = skipChar(data, i, ' ')

	// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
	// and trailing | optional on last column
	col := 0
	for i < len(data) && data[i] != '\n' {
		// dashes counts both the dashes and the alignment colons
		dashes := 0

		if data[i] == ':' {
			i++
			columns[col] |= TableAlignmentLeft
			dashes++
		}
		for i < len(data) && data[i] == '-' {
			i++
			dashes++
		}
		if i < len(data) && data[i] == ':' {
			i++
			columns[col] |= TableAlignmentRight
			dashes++
		}
		for i < len(data) && data[i] == ' ' {
			i++
		}
		// underline row ran to the end of the buffer without a newline
		if i == len(data) {
			return
		}
		// end of column test is messy
		switch {
		case dashes < 3:
			// not a valid column
			return

		case data[i] == '|' && !isBackslashEscaped(data, i):
			// marker found, now skip past trailing whitespace
			col++
			i++
			for i < len(data) && data[i] == ' ' {
				i++
			}

			// trailing junk found after last column
			if col >= colCount && i < len(data) && data[i] != '\n' {
				return
			}

		case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
			// something else found where marker was required
			return

		case data[i] == '\n':
			// marker is optional for the last column
			col++

		default:
			// trailing junk found after last column
			return
		}
	}
	// the underline must describe exactly as many columns as the header row
	if col != colCount {
		return
	}

	p.addBlock(TableHead, nil)
	p.tableRow(header, columns, true)
	size = i
	// consume the underline row's newline as well
	if size < len(data) && data[size] == '\n' {
		size++
	}
	return
}
897
898func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
899 p.addBlock(TableRow, nil)
900 i, col := 0, 0
901
902 if data[i] == '|' && !isBackslashEscaped(data, i) {
903 i++
904 }
905
906 for col = 0; col < len(columns) && i < len(data); col++ {
907 for i < len(data) && data[i] == ' ' {
908 i++
909 }
910
911 cellStart := i
912
913 for i < len(data) && (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
914 i++
915 }
916
917 cellEnd := i
918
919 // skip the end-of-cell marker, possibly taking us past end of buffer
920 i++
921
922 for cellEnd > cellStart && cellEnd-1 < len(data) && data[cellEnd-1] == ' ' {
923 cellEnd--
924 }
925
926 cell := p.addBlock(TableCell, data[cellStart:cellEnd])
927 cell.IsHeader = header
928 cell.Align = columns[col]
929 }
930
931 // pad it out with empty columns to get the right number
932 for ; col < len(columns); col++ {
933 cell := p.addBlock(TableCell, nil)
934 cell.IsHeader = header
935 cell.Align = columns[col]
936 }
937
938 // silently ignore rows with too many cells
939}
940
941// returns blockquote prefix length
942func (p *parser) quotePrefix(data []byte) int {
943 i := 0
944 for i < 3 && i < len(data) && data[i] == ' ' {
945 i++
946 }
947 if i < len(data) && data[i] == '>' {
948 if i+1 < len(data) && data[i+1] == ' ' {
949 return i + 2
950 }
951 return i + 1
952 }
953 return 0
954}
955
956// blockquote ends with at least one blank line
957// followed by something without a blockquote prefix
958func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
959 if p.isEmpty(data[beg:]) <= 0 {
960 return false
961 }
962 if end >= len(data) {
963 return true
964 }
965 return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
966}
967
// quote parses a blockquote fragment at the start of data. It adds a
// BlockQuote block, strips the quote prefix from each line, recursively
// parses the collected text with block, finalizes the BlockQuote and
// returns the number of bytes consumed.
func (p *parser) quote(data []byte) int {
	block := p.addBlock(BlockQuote, nil)
	// raw collects the quoted text with the prefixes removed
	var raw bytes.Buffer
	beg, end := 0, 0
	for beg < len(data) {
		end = beg
		// Step over whole lines, collecting them. While doing that, check for
		// fenced code and if one's found, incorporate it altogether,
		// regardless of any contents inside it
		for end < len(data) && data[end] != '\n' {
			if p.flags&FencedCode != 0 {
				if i := p.fencedCodeBlock(data[end:], false); i > 0 {
					// -1 to compensate for the extra end++ after the loop:
					end += i - 1
					break
				}
			}
			end++
		}
		if end < len(data) && data[end] == '\n' {
			end++
		}
		if pre := p.quotePrefix(data[beg:]); pre > 0 {
			// skip the prefix
			beg += pre
		} else if p.terminateBlockquote(data, beg, end) {
			break
		}
		// this line is part of the blockquote
		raw.Write(data[beg:end])
		beg = end
	}
	p.block(raw.Bytes())
	p.finalize(block)
	return end
}
1005
1006// returns prefix length for block code
1007func (p *parser) codePrefix(data []byte) int {
1008 if len(data) >= 1 && data[0] == '\t' {
1009 return 1
1010 }
1011 if len(data) >= 4 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1012 return 4
1013 }
1014 return 0
1015}
1016
1017func (p *parser) code(data []byte) int {
1018 var work bytes.Buffer
1019
1020 i := 0
1021 for i < len(data) {
1022 beg := i
1023 for i < len(data) && data[i] != '\n' {
1024 i++
1025 }
1026 if i < len(data) && data[i] == '\n' {
1027 i++
1028 }
1029
1030 blankline := p.isEmpty(data[beg:i]) > 0
1031 if pre := p.codePrefix(data[beg:i]); pre > 0 {
1032 beg += pre
1033 } else if !blankline {
1034 // non-empty, non-prefixed line breaks the pre
1035 i = beg
1036 break
1037 }
1038
1039 // verbatim copy to the working buffer
1040 if blankline {
1041 work.WriteByte('\n')
1042 } else {
1043 work.Write(data[beg:i])
1044 }
1045 }
1046
1047 // trim all the \n off the end of work
1048 workbytes := work.Bytes()
1049 eol := len(workbytes)
1050 for eol > 0 && workbytes[eol-1] == '\n' {
1051 eol--
1052 }
1053 if eol != len(workbytes) {
1054 work.Truncate(eol)
1055 }
1056
1057 work.WriteByte('\n')
1058
1059 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1060 block.IsFenced = false
1061 finalizeCodeBlock(block)
1062
1063 return i
1064}
1065
1066// returns unordered list item prefix
1067func (p *parser) uliPrefix(data []byte) int {
1068 i := 0
1069 // start with up to 3 spaces
1070 for i < len(data) && i < 3 && data[i] == ' ' {
1071 i++
1072 }
1073 if i >= len(data)-1 {
1074 return 0
1075 }
1076 // need one of {'*', '+', '-'} followed by a space or a tab
1077 if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1078 (data[i+1] != ' ' && data[i+1] != '\t') {
1079 return 0
1080 }
1081 return i + 2
1082}
1083
1084// returns ordered list item prefix
1085func (p *parser) oliPrefix(data []byte) int {
1086 i := 0
1087
1088 // start with up to 3 spaces
1089 for i < 3 && i < len(data) && data[i] == ' ' {
1090 i++
1091 }
1092
1093 // count the digits
1094 start := i
1095 for i < len(data) && data[i] >= '0' && data[i] <= '9' {
1096 i++
1097 }
1098 if start == i || i >= len(data)-1 {
1099 return 0
1100 }
1101
1102 // we need >= 1 digits followed by a dot and a space or a tab
1103 if data[i] != '.' || !(data[i+1] == ' ' || data[i+1] == '\t') {
1104 return 0
1105 }
1106 return i + 2
1107}
1108
1109// returns definition list item prefix
1110func (p *parser) dliPrefix(data []byte) int {
1111 if len(data) < 2 {
1112 return 0
1113 }
1114 i := 0
1115 // need a ':' followed by a space or a tab
1116 if data[i] != ':' || !(data[i+1] == ' ' || data[i+1] == '\t') {
1117 return 0
1118 }
1119 for i < len(data) && data[i] == ' ' {
1120 i++
1121 }
1122 return i + 2
1123}
1124
1125// parse ordered or unordered list block
1126func (p *parser) list(data []byte, flags ListType) int {
1127 i := 0
1128 flags |= ListItemBeginningOfList
1129 block := p.addBlock(List, nil)
1130 block.ListFlags = flags
1131 block.Tight = true
1132
1133 for i < len(data) {
1134 skip := p.listItem(data[i:], &flags)
1135 if flags&ListItemContainsBlock != 0 {
1136 block.ListData.Tight = false
1137 }
1138 i += skip
1139 if skip == 0 || flags&ListItemEndOfList != 0 {
1140 break
1141 }
1142 flags &= ^ListItemBeginningOfList
1143 }
1144
1145 above := block.Parent
1146 finalizeList(block)
1147 p.tip = above
1148 return i
1149}
1150
// endsWithBlankLine is intended to report whether block ends with a blank
// line, descending if needed into lists and sublists.
//
// NOTE: the blank-line check itself is commented out, so this function
// currently always returns false; it only walks down the last children of
// List and Item nodes.
func endsWithBlankLine(block *Node) bool {
	// TODO: figure this out. Always false now.
	for block != nil {
		//if block.lastLineBlank {
		//return true
		//}
		t := block.Type
		if t == List || t == Item {
			// descend into the last child of a list or list item
			block = block.LastChild
		} else {
			break
		}
	}
	return false
}
1168
1169func finalizeList(block *Node) {
1170 block.open = false
1171 item := block.FirstChild
1172 for item != nil {
1173 // check for non-final list item ending with blank line:
1174 if endsWithBlankLine(item) && item.Next != nil {
1175 block.ListData.Tight = false
1176 break
1177 }
1178 // recurse into children of list item, to see if there are spaces
1179 // between any of them:
1180 subItem := item.FirstChild
1181 for subItem != nil {
1182 if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1183 block.ListData.Tight = false
1184 break
1185 }
1186 subItem = subItem.Next
1187 }
1188 item = item.Next
1189 }
1190}
1191
// listItem parses a single list item at the start of data, adds an Item
// block for it and returns the number of bytes consumed. It returns 0 when
// data does not start with a list item prefix and the list is not a
// definition list.
//
// flags is both input and output: the item may set ListItemContainsBlock,
// ListItemEndOfList and ListTypeTerm on it for the caller to inspect.
// Assumes initial prefix is already removed if this is a sublist.
// data must be non-empty.
func (p *parser) listItem(data []byte, flags *ListType) int {
	// keep track of the indentation of the first line
	itemIndent := 0
	if data[0] == '\t' {
		itemIndent += 4
	} else {
		for itemIndent < 3 && data[itemIndent] == ' ' {
			itemIndent++
		}
	}

	// determine the prefix length, trying unordered, ordered and then
	// definition list prefixes; bulletChar defaults to '*' for the others
	var bulletChar byte = '*'
	i := p.uliPrefix(data)
	if i == 0 {
		i = p.oliPrefix(data)
	} else {
		bulletChar = data[i-2]
	}
	if i == 0 {
		i = p.dliPrefix(data)
		// reset definition term flag
		if i > 0 {
			*flags &= ^ListTypeTerm
		}
	}
	if i == 0 {
		// if in definition list, set term flag and continue
		if *flags&ListTypeDefinition != 0 {
			*flags |= ListTypeTerm
		} else {
			return 0
		}
	}

	// skip leading whitespace on first line
	for i < len(data) && data[i] == ' ' {
		i++
	}

	// find the end of the line
	line := i
	for i > 0 && i < len(data) && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[line:i])
	line = i

	// process the following lines
	containsBlankLine := false
	// sublist: offset in raw where the first nested list item starts (0 = none)
	sublist := 0

gatherlines:
	for line < len(data) {
		i++

		// find the end of this line
		for i < len(data) && data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[line:i]) > 0 {
			containsBlankLine = true
			line = i
			continue
		}

		// calculate the indentation
		// indent: indentation width (a tab counts as 4)
		// indentIndex: number of bytes that make up the indentation
		indent := 0
		indentIndex := 0
		if data[line] == '\t' {
			indentIndex++
			indent += 4
		} else {
			for indent < 4 && line+indent < i && data[line+indent] == ' ' {
				indent++
				indentIndex++
			}
		}

		chunk := data[line+indentIndex : i]

		// evaluate how this line fits in
		switch {
		// is this a nested list item?
		case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
			p.oliPrefix(chunk) > 0 ||
			p.dliPrefix(chunk) > 0:

			if containsBlankLine {
				*flags |= ListItemContainsBlock
			}

			// to be a nested list, it must be indented more
			// if not, it is the next item in the same list
			if indent <= itemIndent {
				break gatherlines
			}

			// is this the first item in the nested list?
			if sublist == 0 {
				sublist = raw.Len()
			}

		// is this a nested prefix header?
		case p.isPrefixHeader(chunk):
			// if the header is not indented, it is not nested in the list
			// and thus ends the list
			if containsBlankLine && indent < 4 {
				*flags |= ListItemEndOfList
				break gatherlines
			}
			*flags |= ListItemContainsBlock

		// anything following an empty line is only part
		// of this item if it is indented 4 spaces
		// (regardless of the indentation of the beginning of the item)
		case containsBlankLine && indent < 4:
			if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
				// is the next item still a part of this list?
				next := i
				for next < len(data) && data[next] != '\n' {
					next++
				}
				for next < len(data)-1 && data[next] == '\n' {
					next++
				}
				if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
					*flags |= ListItemEndOfList
				}
			} else {
				*flags |= ListItemEndOfList
			}
			break gatherlines

		// a blank line means this should be parsed as a block
		case containsBlankLine:
			raw.WriteByte('\n')
			*flags |= ListItemContainsBlock
		}

		// if this line was preceded by one or more blanks,
		// re-introduce the blank into the buffer
		if containsBlankLine {
			containsBlankLine = false
			raw.WriteByte('\n')
		}

		// add the line into the working buffer without prefix
		raw.Write(data[line+indentIndex : i])

		line = i
	}

	rawBytes := raw.Bytes()

	block := p.addBlock(Item, nil)
	block.ListFlags = *flags
	block.Tight = false
	block.BulletChar = bulletChar
	block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark

	// render the contents of the list item
	if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
		// intermediate render of block item, except for definition term
		if sublist > 0 {
			// parse the item's own text and the nested list separately
			p.block(rawBytes[:sublist])
			p.block(rawBytes[sublist:])
		} else {
			p.block(rawBytes)
		}
	} else {
		// intermediate render of inline item
		if sublist > 0 {
			child := p.addChild(Paragraph, 0)
			child.content = rawBytes[:sublist]
			p.block(rawBytes[sublist:])
		} else {
			child := p.addChild(Paragraph, 0)
			child.content = rawBytes
		}
	}
	return line
}
1384
1385// render a single paragraph that has already been parsed out
1386func (p *parser) renderParagraph(data []byte) {
1387 if len(data) == 0 {
1388 return
1389 }
1390
1391 // trim leading spaces
1392 beg := 0
1393 for data[beg] == ' ' {
1394 beg++
1395 }
1396
1397 end := len(data)
1398 // trim trailing newline
1399 if data[len(data)-1] == '\n' {
1400 end--
1401 }
1402
1403 // trim trailing spaces
1404 for end > beg && data[end-1] == ' ' {
1405 end--
1406 }
1407
1408 p.addBlock(Paragraph, data[beg:end])
1409}
1410
1411func (p *parser) paragraph(data []byte) int {
1412 // prev: index of 1st char of previous line
1413 // line: index of 1st char of current line
1414 // i: index of cursor/end of current line
1415 var prev, line, i int
1416 tabSize := TabSizeDefault
1417 if p.flags&TabSizeEight != 0 {
1418 tabSize = TabSizeDouble
1419 }
1420 // keep going until we find something to mark the end of the paragraph
1421 for i < len(data) {
1422 // mark the beginning of the current line
1423 prev = line
1424 current := data[i:]
1425 line = i
1426
1427 // did we find a reference or a footnote? If so, end a paragraph
1428 // preceding it and report that we have consumed up to the end of that
1429 // reference:
1430 if refEnd := isReference(p, current, tabSize); refEnd > 0 {
1431 p.renderParagraph(data[:i])
1432 return i + refEnd
1433 }
1434
1435 // did we find a blank line marking the end of the paragraph?
1436 if n := p.isEmpty(current); n > 0 {
1437 // did this blank line followed by a definition list item?
1438 if p.flags&DefinitionLists != 0 {
1439 if i < len(data)-1 && data[i+1] == ':' {
1440 return p.list(data[prev:], ListTypeDefinition)
1441 }
1442 }
1443
1444 p.renderParagraph(data[:i])
1445 return i + n
1446 }
1447
1448 // an underline under some text marks a header, so our paragraph ended on prev line
1449 if i > 0 {
1450 if level := p.isUnderlinedHeader(current); level > 0 {
1451 // render the paragraph
1452 p.renderParagraph(data[:prev])
1453
1454 // ignore leading and trailing whitespace
1455 eol := i - 1
1456 for prev < eol && data[prev] == ' ' {
1457 prev++
1458 }
1459 for eol > prev && data[eol-1] == ' ' {
1460 eol--
1461 }
1462
1463 id := ""
1464 if p.flags&AutoHeaderIDs != 0 {
1465 id = sanitized_anchor_name.Create(string(data[prev:eol]))
1466 }
1467
1468 block := p.addBlock(Header, data[prev:eol])
1469 block.Level = level
1470 block.HeaderID = id
1471
1472 // find the end of the underline
1473 for i < len(data) && data[i] != '\n' {
1474 i++
1475 }
1476 return i
1477 }
1478 }
1479
1480 // if the next line starts a block of HTML, then the paragraph ends here
1481 if p.flags&LaxHTMLBlocks != 0 {
1482 if data[i] == '<' && p.html(current, false) > 0 {
1483 // rewind to before the HTML block
1484 p.renderParagraph(data[:i])
1485 return i
1486 }
1487 }
1488
1489 // if there's a prefixed header or a horizontal rule after this, paragraph is over
1490 if p.isPrefixHeader(current) || p.isHRule(current) {
1491 p.renderParagraph(data[:i])
1492 return i
1493 }
1494
1495 // if there's a fenced code block, paragraph is over
1496 if p.flags&FencedCode != 0 {
1497 if p.fencedCodeBlock(current, false) > 0 {
1498 p.renderParagraph(data[:i])
1499 return i
1500 }
1501 }
1502
1503 // if there's a definition list item, prev line is a definition term
1504 if p.flags&DefinitionLists != 0 {
1505 if p.dliPrefix(current) != 0 {
1506 ret := p.list(data[prev:], ListTypeDefinition)
1507 return ret
1508 }
1509 }
1510
1511 // if there's a list after this, paragraph is over
1512 if p.flags&NoEmptyLineBeforeBlock != 0 {
1513 if p.uliPrefix(current) != 0 ||
1514 p.oliPrefix(current) != 0 ||
1515 p.quotePrefix(current) != 0 ||
1516 p.codePrefix(current) != 0 {
1517 p.renderParagraph(data[:i])
1518 return i
1519 }
1520 }
1521
1522 // otherwise, scan to the beginning of the next line
1523 nl := bytes.IndexByte(data[i:], '\n')
1524 if nl >= 0 {
1525 i += nl + 1
1526 } else {
1527 i += len(data[i:])
1528 }
1529 }
1530
1531 p.renderParagraph(data[:i])
1532 return i
1533}
1534
// skipChar returns the index of the first byte at or after start in data
// that is not equal to char. If every byte from start onward equals char,
// or start is already at or past the end of data, the scan stops there and
// that position is returned.
func skipChar(data []byte, start int, char byte) int {
	pos := start
	for ; pos < len(data); pos++ {
		if data[pos] != char {
			break
		}
	}
	return pos
}
1542
// skipUntilChar returns the index of the first occurrence of char in text
// at or after start. If char does not occur, the scan runs to len(text);
// if start is already at or past the end of text, start is returned as is.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for ; pos < len(text); pos++ {
		if text[pos] == char {
			break
		}
	}
	return pos
}