block.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse block-level elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18 "html"
19 "regexp"
20
21 "github.com/shurcooL/sanitized_anchor_name"
22)
23
// Regular-expression fragments from which reEntityOrEscapedChar is built.
const (
	// Entity matches an HTML character reference: hexadecimal (&#x..;),
	// decimal (&#..;) or named (&name;).
	Entity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
	// Escapable is a character class listing the ASCII punctuation that may
	// follow a backslash in an escape sequence.
	Escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
)

var (
	// reBackslashOrAmp is the cheap pre-check used by unescapeString: it
	// matches any input containing a backslash or an ampersand.
	// The pattern must be a raw string. Written as the interpreted literal
	// "[\\&]" it compiles to the class [\&], i.e. an escaped ampersand only,
	// so text containing backslash escapes but no '&' was never unescaped.
	reBackslashOrAmp = regexp.MustCompile(`[\\&]`)
	// reEntityOrEscapedChar matches, case-insensitively, either a
	// backslash followed by an Escapable character or an Entity.
	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + Escapable + "|" + Entity)
	// reTrailingWhitespace matches one or more trailing groups of a newline
	// followed by spaces, anchored at the end of the text.
	reTrailingWhitespace = regexp.MustCompile("(\n *)+$")
)
34
35// Parse block-level data.
36// Note: this function and many that it calls assume that
37// the input buffer ends with a newline.
38func (p *parser) block(data []byte) {
39 if len(data) == 0 || data[len(data)-1] != '\n' {
40 panic("block input is missing terminating newline")
41 }
42
43 // this is called recursively: enforce a maximum depth
44 if p.nesting >= p.maxNesting {
45 return
46 }
47 p.nesting++
48
49 // parse out one block-level construct at a time
50 for len(data) > 0 {
51 // prefixed header:
52 //
53 // # Header 1
54 // ## Header 2
55 // ...
56 // ###### Header 6
57 if p.isPrefixHeader(data) {
58 data = data[p.prefixHeader(data):]
59 continue
60 }
61
62 // block of preformatted HTML:
63 //
64 // <div>
65 // ...
66 // </div>
67 if data[0] == '<' {
68 if i := p.html(data, true); i > 0 {
69 data = data[i:]
70 continue
71 }
72 }
73
74 // title block
75 //
76 // % stuff
77 // % more stuff
78 // % even more stuff
79 if p.flags&Titleblock != 0 {
80 if data[0] == '%' {
81 if i := p.titleBlock(data, true); i > 0 {
82 data = data[i:]
83 continue
84 }
85 }
86 }
87
88 // blank lines. note: returns the # of bytes to skip
89 if i := p.isEmpty(data); i > 0 {
90 data = data[i:]
91 continue
92 }
93
94 // indented code block:
95 //
96 // func max(a, b int) int {
97 // if a > b {
98 // return a
99 // }
100 // return b
101 // }
102 if p.codePrefix(data) > 0 {
103 data = data[p.code(data):]
104 continue
105 }
106
107 // fenced code block:
108 //
109 // ``` go
110 // func fact(n int) int {
111 // if n <= 1 {
112 // return n
113 // }
114 // return n * fact(n-1)
115 // }
116 // ```
117 if p.flags&FencedCode != 0 {
118 if i := p.fencedCodeBlock(data, true); i > 0 {
119 data = data[i:]
120 continue
121 }
122 }
123
124 // horizontal rule:
125 //
126 // ------
127 // or
128 // ******
129 // or
130 // ______
131 if p.isHRule(data) {
132 p.addBlock(HorizontalRule, nil)
133 var i int
134 for i = 0; data[i] != '\n'; i++ {
135 }
136 data = data[i:]
137 continue
138 }
139
140 // block quote:
141 //
142 // > A big quote I found somewhere
143 // > on the web
144 if p.quotePrefix(data) > 0 {
145 data = data[p.quote(data):]
146 continue
147 }
148
149 // table:
150 //
151 // Name | Age | Phone
152 // ------|-----|---------
153 // Bob | 31 | 555-1234
154 // Alice | 27 | 555-4321
155 if p.flags&Tables != 0 {
156 if i := p.table(data); i > 0 {
157 data = data[i:]
158 continue
159 }
160 }
161
162 // an itemized/unordered list:
163 //
164 // * Item 1
165 // * Item 2
166 //
167 // also works with + or -
168 if p.uliPrefix(data) > 0 {
169 data = data[p.list(data, 0):]
170 continue
171 }
172
173 // a numbered/ordered list:
174 //
175 // 1. Item 1
176 // 2. Item 2
177 if p.oliPrefix(data) > 0 {
178 data = data[p.list(data, ListTypeOrdered):]
179 continue
180 }
181
182 // definition lists:
183 //
184 // Term 1
185 // : Definition a
186 // : Definition b
187 //
188 // Term 2
189 // : Definition c
190 if p.flags&DefinitionLists != 0 {
191 if p.dliPrefix(data) > 0 {
192 data = data[p.list(data, ListTypeDefinition):]
193 continue
194 }
195 }
196
197 // anything else must look like a normal paragraph
198 // note: this finds underlined headers, too
199 data = data[p.paragraph(data):]
200 }
201
202 p.nesting--
203}
204
205func (p *parser) addBlock(typ NodeType, content []byte) *Node {
206 p.closeUnmatchedBlocks()
207 container := p.addChild(typ, 0)
208 container.content = content
209 return container
210}
211
212func (p *parser) isPrefixHeader(data []byte) bool {
213 if data[0] != '#' {
214 return false
215 }
216
217 if p.flags&SpaceHeaders != 0 {
218 level := 0
219 for level < 6 && data[level] == '#' {
220 level++
221 }
222 if data[level] != ' ' {
223 return false
224 }
225 }
226 return true
227}
228
229func (p *parser) prefixHeader(data []byte) int {
230 level := 0
231 for level < 6 && data[level] == '#' {
232 level++
233 }
234 i := skipChar(data, level, ' ')
235 end := skipUntilChar(data, i, '\n')
236 skip := end
237 id := ""
238 if p.flags&HeaderIDs != 0 {
239 j, k := 0, 0
240 // find start/end of header id
241 for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
242 }
243 for k = j + 1; k < end && data[k] != '}'; k++ {
244 }
245 // extract header id iff found
246 if j < end && k < end {
247 id = string(data[j+2 : k])
248 end = j
249 skip = k + 1
250 for end > 0 && data[end-1] == ' ' {
251 end--
252 }
253 }
254 }
255 for end > 0 && data[end-1] == '#' {
256 if isBackslashEscaped(data, end-1) {
257 break
258 }
259 end--
260 }
261 for end > 0 && data[end-1] == ' ' {
262 end--
263 }
264 if end > i {
265 if id == "" && p.flags&AutoHeaderIDs != 0 {
266 id = sanitized_anchor_name.Create(string(data[i:end]))
267 }
268 block := p.addBlock(Header, data[i:end])
269 block.HeaderID = id
270 block.Level = level
271 }
272 return skip
273}
274
275func (p *parser) isUnderlinedHeader(data []byte) int {
276 // test of level 1 header
277 if data[0] == '=' {
278 i := skipChar(data, 1, '=')
279 i = skipChar(data, i, ' ')
280 if data[i] == '\n' {
281 return 1
282 } else {
283 return 0
284 }
285 }
286
287 // test of level 2 header
288 if data[0] == '-' {
289 i := skipChar(data, 1, '-')
290 i = skipChar(data, i, ' ')
291 if data[i] == '\n' {
292 return 2
293 } else {
294 return 0
295 }
296 }
297
298 return 0
299}
300
301func (p *parser) titleBlock(data []byte, doRender bool) int {
302 if data[0] != '%' {
303 return 0
304 }
305 splitData := bytes.Split(data, []byte("\n"))
306 var i int
307 for idx, b := range splitData {
308 if !bytes.HasPrefix(b, []byte("%")) {
309 i = idx // - 1
310 break
311 }
312 }
313
314 data = bytes.Join(splitData[0:i], []byte("\n"))
315 consumed := len(data)
316 data = bytes.TrimPrefix(data, []byte("% "))
317 data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
318 block := p.addBlock(Header, data)
319 block.Level = 1
320 block.IsTitleblock = true
321
322 return consumed
323}
324
325func (p *parser) html(data []byte, doRender bool) int {
326 var i, j int
327
328 // identify the opening tag
329 if data[0] != '<' {
330 return 0
331 }
332 curtag, tagfound := p.htmlFindTag(data[1:])
333
334 // handle special cases
335 if !tagfound {
336 // check for an HTML comment
337 if size := p.htmlComment(data, doRender); size > 0 {
338 return size
339 }
340
341 // check for an <hr> tag
342 if size := p.htmlHr(data, doRender); size > 0 {
343 return size
344 }
345
346 // no special case recognized
347 return 0
348 }
349
350 // look for an unindented matching closing tag
351 // followed by a blank line
352 found := false
353 /*
354 closetag := []byte("\n</" + curtag + ">")
355 j = len(curtag) + 1
356 for !found {
357 // scan for a closing tag at the beginning of a line
358 if skip := bytes.Index(data[j:], closetag); skip >= 0 {
359 j += skip + len(closetag)
360 } else {
361 break
362 }
363
364 // see if it is the only thing on the line
365 if skip := p.isEmpty(data[j:]); skip > 0 {
366 // see if it is followed by a blank line/eof
367 j += skip
368 if j >= len(data) {
369 found = true
370 i = j
371 } else {
372 if skip := p.isEmpty(data[j:]); skip > 0 {
373 j += skip
374 found = true
375 i = j
376 }
377 }
378 }
379 }
380 */
381
382 // if not found, try a second pass looking for indented match
383 // but not if tag is "ins" or "del" (following original Markdown.pl)
384 if !found && curtag != "ins" && curtag != "del" {
385 i = 1
386 for i < len(data) {
387 i++
388 for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
389 i++
390 }
391
392 if i+2+len(curtag) >= len(data) {
393 break
394 }
395
396 j = p.htmlFindEnd(curtag, data[i-1:])
397
398 if j > 0 {
399 i += j - 1
400 found = true
401 break
402 }
403 }
404 }
405
406 if !found {
407 return 0
408 }
409
410 // the end of the block has been found
411 if doRender {
412 // trim newlines
413 end := i
414 for end > 0 && data[end-1] == '\n' {
415 end--
416 }
417 finalizeHtmlBlock(p.addBlock(HTMLBlock, data[:end]))
418 }
419
420 return i
421}
422
423func finalizeHtmlBlock(block *Node) {
424 block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{})
425 block.content = []byte{}
426}
427
428// HTML comment, lax form
429func (p *parser) htmlComment(data []byte, doRender bool) int {
430 i := p.inlineHtmlComment(data)
431 // needs to end with a blank line
432 if j := p.isEmpty(data[i:]); j > 0 {
433 size := i + j
434 if doRender {
435 // trim trailing newlines
436 end := size
437 for end > 0 && data[end-1] == '\n' {
438 end--
439 }
440 block := p.addBlock(HTMLBlock, data[:end])
441 finalizeHtmlBlock(block)
442 }
443 return size
444 }
445 return 0
446}
447
448// HR, which is the only self-closing block tag considered
449func (p *parser) htmlHr(data []byte, doRender bool) int {
450 if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
451 return 0
452 }
453 if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
454 // not an <hr> tag after all; at least not a valid one
455 return 0
456 }
457
458 i := 3
459 for data[i] != '>' && data[i] != '\n' {
460 i++
461 }
462
463 if data[i] == '>' {
464 i++
465 if j := p.isEmpty(data[i:]); j > 0 {
466 size := i + j
467 if doRender {
468 // trim newlines
469 end := size
470 for end > 0 && data[end-1] == '\n' {
471 end--
472 }
473 finalizeHtmlBlock(p.addBlock(HTMLBlock, data[:end]))
474 }
475 return size
476 }
477 }
478
479 return 0
480}
481
482func (p *parser) htmlFindTag(data []byte) (string, bool) {
483 i := 0
484 for isalnum(data[i]) {
485 i++
486 }
487 key := string(data[:i])
488 if _, ok := blockTags[key]; ok {
489 return key, true
490 }
491 return "", false
492}
493
494func (p *parser) htmlFindEnd(tag string, data []byte) int {
495 // assume data[0] == '<' && data[1] == '/' already tested
496 if tag == "hr" {
497 return 2
498 }
499 // check if tag is a match
500 closetag := []byte("</" + tag + ">")
501 if !bytes.HasPrefix(data, closetag) {
502 return 0
503 }
504 i := len(closetag)
505
506 // check that the rest of the line is blank
507 skip := 0
508 if skip = p.isEmpty(data[i:]); skip == 0 {
509 return 0
510 }
511 i += skip
512 skip = 0
513
514 if i >= len(data) {
515 return i
516 }
517
518 if p.flags&LaxHTMLBlocks != 0 {
519 return i
520 }
521 if skip = p.isEmpty(data[i:]); skip == 0 {
522 // following line must be blank
523 return 0
524 }
525
526 return i + skip
527}
528
529func (*parser) isEmpty(data []byte) int {
530 // it is okay to call isEmpty on an empty buffer
531 if len(data) == 0 {
532 return 0
533 }
534
535 var i int
536 for i = 0; i < len(data) && data[i] != '\n'; i++ {
537 if data[i] != ' ' && data[i] != '\t' {
538 return 0
539 }
540 }
541 return i + 1
542}
543
544func (*parser) isHRule(data []byte) bool {
545 i := 0
546
547 // skip up to three spaces
548 for i < 3 && data[i] == ' ' {
549 i++
550 }
551
552 // look at the hrule char
553 if data[i] != '*' && data[i] != '-' && data[i] != '_' {
554 return false
555 }
556 c := data[i]
557
558 // the whole line must be the char or whitespace
559 n := 0
560 for data[i] != '\n' {
561 switch {
562 case data[i] == c:
563 n++
564 case data[i] != ' ':
565 return false
566 }
567 i++
568 }
569
570 return n >= 3
571}
572
573// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
574// and returns the end index if so, or 0 otherwise. It also returns the marker found.
575// If syntax is not nil, it gets set to the syntax specified in the fence line.
576// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
577func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
578 i, size := 0, 0
579
580 // skip up to three spaces
581 for i < len(data) && i < 3 && data[i] == ' ' {
582 i++
583 }
584
585 // check for the marker characters: ~ or `
586 if i >= len(data) {
587 return 0, ""
588 }
589 if data[i] != '~' && data[i] != '`' {
590 return 0, ""
591 }
592
593 c := data[i]
594
595 // the whole line must be the same char or whitespace
596 for i < len(data) && data[i] == c {
597 size++
598 i++
599 }
600
601 // the marker char must occur at least 3 times
602 if size < 3 {
603 return 0, ""
604 }
605 marker = string(data[i-size : i])
606
607 // if this is the end marker, it must match the beginning marker
608 if oldmarker != "" && marker != oldmarker {
609 return 0, ""
610 }
611
612 // TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
613 // into one, always get the syntax, and discard it if the caller doesn't care.
614 if syntax != nil {
615 syn := 0
616 i = skipChar(data, i, ' ')
617
618 if i >= len(data) {
619 if newlineOptional && i == len(data) {
620 return i, marker
621 }
622 return 0, ""
623 }
624
625 syntaxStart := i
626
627 if data[i] == '{' {
628 i++
629 syntaxStart++
630
631 for i < len(data) && data[i] != '}' && data[i] != '\n' {
632 syn++
633 i++
634 }
635
636 if i >= len(data) || data[i] != '}' {
637 return 0, ""
638 }
639
640 // strip all whitespace at the beginning and the end
641 // of the {} block
642 for syn > 0 && isspace(data[syntaxStart]) {
643 syntaxStart++
644 syn--
645 }
646
647 for syn > 0 && isspace(data[syntaxStart+syn-1]) {
648 syn--
649 }
650
651 i++
652 } else {
653 for i < len(data) && !isspace(data[i]) {
654 syn++
655 i++
656 }
657 }
658
659 *syntax = string(data[syntaxStart : syntaxStart+syn])
660 }
661
662 i = skipChar(data, i, ' ')
663 if i >= len(data) || data[i] != '\n' {
664 if newlineOptional && i == len(data) {
665 return i, marker
666 }
667 return 0, ""
668 }
669
670 return i + 1, marker // Take newline into account.
671}
672
673// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
674// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
675// If doRender is true, a final newline is mandatory to recognize the fenced code block.
676func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
677 var syntax string
678 beg, marker := isFenceLine(data, &syntax, "", false)
679 if beg == 0 || beg >= len(data) {
680 return 0
681 }
682
683 var work bytes.Buffer
684 work.Write([]byte(syntax))
685 work.WriteByte('\n')
686
687 for {
688 // safe to assume beg < len(data)
689
690 // check for the end of the code block
691 newlineOptional := !doRender
692 fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
693 if fenceEnd != 0 {
694 beg += fenceEnd
695 break
696 }
697
698 // copy the current line
699 end := skipUntilChar(data, beg, '\n') + 1
700
701 // did we reach the end of the buffer without a closing marker?
702 if end >= len(data) {
703 return 0
704 }
705
706 // verbatim copy to the working buffer
707 if doRender {
708 work.Write(data[beg:end])
709 }
710 beg = end
711 }
712
713 if doRender {
714 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
715 block.IsFenced = true
716 finalizeCodeBlock(block)
717 }
718
719 return beg
720}
721
// unescapeChar resolves a single escape: for a backslash escape it returns
// the escaped byte, for anything else it returns the text with HTML entities
// unescaped.
func unescapeChar(str []byte) []byte {
	if str[0] != '\\' {
		return []byte(html.UnescapeString(string(str)))
	}
	return []byte{str[1]}
}
728
729func unescapeString(str []byte) []byte {
730 if reBackslashOrAmp.Match(str) {
731 return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
732 } else {
733 return str
734 }
735}
736
737func finalizeCodeBlock(block *Node) {
738 if block.IsFenced {
739 newlinePos := bytes.IndexByte(block.content, '\n')
740 firstLine := block.content[:newlinePos]
741 rest := block.content[newlinePos+1:]
742 block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
743 block.Literal = rest
744 } else {
745 block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{'\n'})
746 }
747 block.content = nil
748}
749
750func (p *parser) table(data []byte) int {
751 table := p.addBlock(Table, nil)
752 i, columns := p.tableHeader(data)
753 if i == 0 {
754 p.tip = table.Parent
755 table.unlink()
756 return 0
757 }
758
759 p.addBlock(TableBody, nil)
760
761 for i < len(data) {
762 pipes, rowStart := 0, i
763 for ; data[i] != '\n'; i++ {
764 if data[i] == '|' {
765 pipes++
766 }
767 }
768
769 if pipes == 0 {
770 i = rowStart
771 break
772 }
773
774 // include the newline in data sent to tableRow
775 i++
776 p.tableRow(data[rowStart:i], columns, false)
777 }
778
779 return i
780}
781
// isBackslashEscaped reports whether position i in data is preceded by an
// odd number of consecutive backslashes.
func isBackslashEscaped(data []byte, i int) bool {
	count := 0
	for j := i - 1; j >= 0 && data[j] == '\\'; j-- {
		count++
	}
	return count%2 == 1
}
790
791func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
792 i := 0
793 colCount := 1
794 for i = 0; data[i] != '\n'; i++ {
795 if data[i] == '|' && !isBackslashEscaped(data, i) {
796 colCount++
797 }
798 }
799
800 // doesn't look like a table header
801 if colCount == 1 {
802 return
803 }
804
805 // include the newline in the data sent to tableRow
806 header := data[:i+1]
807
808 // column count ignores pipes at beginning or end of line
809 if data[0] == '|' {
810 colCount--
811 }
812 if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
813 colCount--
814 }
815
816 columns = make([]CellAlignFlags, colCount)
817
818 // move on to the header underline
819 i++
820 if i >= len(data) {
821 return
822 }
823
824 if data[i] == '|' && !isBackslashEscaped(data, i) {
825 i++
826 }
827 i = skipChar(data, i, ' ')
828
829 // each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
830 // and trailing | optional on last column
831 col := 0
832 for data[i] != '\n' {
833 dashes := 0
834
835 if data[i] == ':' {
836 i++
837 columns[col] |= TableAlignmentLeft
838 dashes++
839 }
840 for data[i] == '-' {
841 i++
842 dashes++
843 }
844 if data[i] == ':' {
845 i++
846 columns[col] |= TableAlignmentRight
847 dashes++
848 }
849 for data[i] == ' ' {
850 i++
851 }
852
853 // end of column test is messy
854 switch {
855 case dashes < 3:
856 // not a valid column
857 return
858
859 case data[i] == '|' && !isBackslashEscaped(data, i):
860 // marker found, now skip past trailing whitespace
861 col++
862 i++
863 for data[i] == ' ' {
864 i++
865 }
866
867 // trailing junk found after last column
868 if col >= colCount && data[i] != '\n' {
869 return
870 }
871
872 case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
873 // something else found where marker was required
874 return
875
876 case data[i] == '\n':
877 // marker is optional for the last column
878 col++
879
880 default:
881 // trailing junk found after last column
882 return
883 }
884 }
885 if col != colCount {
886 return
887 }
888
889 p.addBlock(TableHead, nil)
890 p.tableRow(header, columns, true)
891 size = i + 1
892 return
893}
894
895func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
896 p.addBlock(TableRow, nil)
897 i, col := 0, 0
898
899 if data[i] == '|' && !isBackslashEscaped(data, i) {
900 i++
901 }
902
903 for col = 0; col < len(columns) && i < len(data); col++ {
904 for data[i] == ' ' {
905 i++
906 }
907
908 cellStart := i
909
910 for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
911 i++
912 }
913
914 cellEnd := i
915
916 // skip the end-of-cell marker, possibly taking us past end of buffer
917 i++
918
919 for cellEnd > cellStart && data[cellEnd-1] == ' ' {
920 cellEnd--
921 }
922
923 cell := p.addBlock(TableCell, data[cellStart:cellEnd])
924 cell.IsHeader = header
925 cell.Align = columns[col]
926 }
927
928 // pad it out with empty columns to get the right number
929 for ; col < len(columns); col++ {
930 cell := p.addBlock(TableCell, nil)
931 cell.IsHeader = header
932 cell.Align = columns[col]
933 }
934
935 // silently ignore rows with too many cells
936}
937
938// returns blockquote prefix length
939func (p *parser) quotePrefix(data []byte) int {
940 i := 0
941 for i < 3 && data[i] == ' ' {
942 i++
943 }
944 if data[i] == '>' {
945 if data[i+1] == ' ' {
946 return i + 2
947 }
948 return i + 1
949 }
950 return 0
951}
952
953// blockquote ends with at least one blank line
954// followed by something without a blockquote prefix
955func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
956 if p.isEmpty(data[beg:]) <= 0 {
957 return false
958 }
959 if end >= len(data) {
960 return true
961 }
962 return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
963}
964
965// parse a blockquote fragment
966func (p *parser) quote(data []byte) int {
967 block := p.addBlock(BlockQuote, nil)
968 var raw bytes.Buffer
969 beg, end := 0, 0
970 for beg < len(data) {
971 end = beg
972 // Step over whole lines, collecting them. While doing that, check for
973 // fenced code and if one's found, incorporate it altogether,
974 // irregardless of any contents inside it
975 for data[end] != '\n' {
976 if p.flags&FencedCode != 0 {
977 if i := p.fencedCodeBlock(data[end:], false); i > 0 {
978 // -1 to compensate for the extra end++ after the loop:
979 end += i - 1
980 break
981 }
982 }
983 end++
984 }
985 end++
986 if pre := p.quotePrefix(data[beg:]); pre > 0 {
987 // skip the prefix
988 beg += pre
989 } else if p.terminateBlockquote(data, beg, end) {
990 break
991 }
992 // this line is part of the blockquote
993 raw.Write(data[beg:end])
994 beg = end
995 }
996 p.block(raw.Bytes())
997 p.finalize(block)
998 return end
999}
1000
1001// returns prefix length for block code
1002func (p *parser) codePrefix(data []byte) int {
1003 if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1004 return 4
1005 }
1006 return 0
1007}
1008
1009func (p *parser) code(data []byte) int {
1010 var work bytes.Buffer
1011
1012 i := 0
1013 for i < len(data) {
1014 beg := i
1015 for data[i] != '\n' {
1016 i++
1017 }
1018 i++
1019
1020 blankline := p.isEmpty(data[beg:i]) > 0
1021 if pre := p.codePrefix(data[beg:i]); pre > 0 {
1022 beg += pre
1023 } else if !blankline {
1024 // non-empty, non-prefixed line breaks the pre
1025 i = beg
1026 break
1027 }
1028
1029 // verbatim copy to the working buffeu
1030 if blankline {
1031 work.WriteByte('\n')
1032 } else {
1033 work.Write(data[beg:i])
1034 }
1035 }
1036
1037 // trim all the \n off the end of work
1038 workbytes := work.Bytes()
1039 eol := len(workbytes)
1040 for eol > 0 && workbytes[eol-1] == '\n' {
1041 eol--
1042 }
1043 if eol != len(workbytes) {
1044 work.Truncate(eol)
1045 }
1046
1047 work.WriteByte('\n')
1048
1049 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1050 block.IsFenced = false
1051 finalizeCodeBlock(block)
1052
1053 return i
1054}
1055
1056// returns unordered list item prefix
1057func (p *parser) uliPrefix(data []byte) int {
1058 i := 0
1059
1060 // start with up to 3 spaces
1061 for i < 3 && data[i] == ' ' {
1062 i++
1063 }
1064
1065 // need a *, +, or - followed by a space
1066 if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1067 data[i+1] != ' ' {
1068 return 0
1069 }
1070 return i + 2
1071}
1072
1073// returns ordered list item prefix
1074func (p *parser) oliPrefix(data []byte) int {
1075 i := 0
1076
1077 // start with up to 3 spaces
1078 for i < 3 && data[i] == ' ' {
1079 i++
1080 }
1081
1082 // count the digits
1083 start := i
1084 for data[i] >= '0' && data[i] <= '9' {
1085 i++
1086 }
1087
1088 // we need >= 1 digits followed by a dot and a space
1089 if start == i || data[i] != '.' || data[i+1] != ' ' {
1090 return 0
1091 }
1092 return i + 2
1093}
1094
1095// returns definition list item prefix
1096func (p *parser) dliPrefix(data []byte) int {
1097 i := 0
1098
1099 // need a : followed by a spaces
1100 if data[i] != ':' || data[i+1] != ' ' {
1101 return 0
1102 }
1103 for data[i] == ' ' {
1104 i++
1105 }
1106 return i + 2
1107}
1108
1109// parse ordered or unordered list block
1110func (p *parser) list(data []byte, flags ListType) int {
1111 i := 0
1112 flags |= ListItemBeginningOfList
1113 block := p.addBlock(List, nil)
1114 block.ListFlags = flags
1115 block.Tight = true
1116
1117 for i < len(data) {
1118 skip := p.listItem(data[i:], &flags)
1119 if flags&ListItemContainsBlock != 0 {
1120 block.ListData.Tight = false
1121 }
1122 i += skip
1123 if skip == 0 || flags&ListItemEndOfList != 0 {
1124 break
1125 }
1126 flags &= ^ListItemBeginningOfList
1127 }
1128
1129 above := block.Parent
1130 finalizeList(block)
1131 p.tip = above
1132 return i
1133}
1134
1135// Returns true if block ends with a blank line, descending if needed
1136// into lists and sublists.
1137func endsWithBlankLine(block *Node) bool {
1138 // TODO: figure this out. Always false now.
1139 for block != nil {
1140 //if block.lastLineBlank {
1141 //return true
1142 //}
1143 t := block.Type
1144 if t == List || t == Item {
1145 block = block.LastChild
1146 } else {
1147 break
1148 }
1149 }
1150 return false
1151}
1152
1153func finalizeList(block *Node) {
1154 block.open = false
1155 item := block.FirstChild
1156 for item != nil {
1157 // check for non-final list item ending with blank line:
1158 if endsWithBlankLine(item) && item.Next != nil {
1159 block.ListData.Tight = false
1160 break
1161 }
1162 // recurse into children of list item, to see if there are spaces
1163 // between any of them:
1164 subItem := item.FirstChild
1165 for subItem != nil {
1166 if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1167 block.ListData.Tight = false
1168 break
1169 }
1170 subItem = subItem.Next
1171 }
1172 item = item.Next
1173 }
1174}
1175
1176// Parse a single list item.
1177// Assumes initial prefix is already removed if this is a sublist.
1178func (p *parser) listItem(data []byte, flags *ListType) int {
1179 // keep track of the indentation of the first line
1180 itemIndent := 0
1181 for itemIndent < 3 && data[itemIndent] == ' ' {
1182 itemIndent++
1183 }
1184
1185 var bulletChar byte = '*'
1186 i := p.uliPrefix(data)
1187 if i == 0 {
1188 i = p.oliPrefix(data)
1189 } else {
1190 bulletChar = data[i-2]
1191 }
1192 if i == 0 {
1193 i = p.dliPrefix(data)
1194 // reset definition term flag
1195 if i > 0 {
1196 *flags &= ^ListTypeTerm
1197 }
1198 }
1199 if i == 0 {
1200 // if in definition list, set term flag and continue
1201 if *flags&ListTypeDefinition != 0 {
1202 *flags |= ListTypeTerm
1203 } else {
1204 return 0
1205 }
1206 }
1207
1208 // skip leading whitespace on first line
1209 for data[i] == ' ' {
1210 i++
1211 }
1212
1213 // find the end of the line
1214 line := i
1215 for i > 0 && data[i-1] != '\n' {
1216 i++
1217 }
1218
1219 // get working buffer
1220 var raw bytes.Buffer
1221
1222 // put the first line into the working buffer
1223 raw.Write(data[line:i])
1224 line = i
1225
1226 // process the following lines
1227 containsBlankLine := false
1228 sublist := 0
1229
1230gatherlines:
1231 for line < len(data) {
1232 i++
1233
1234 // find the end of this line
1235 for data[i-1] != '\n' {
1236 i++
1237 }
1238
1239 // if it is an empty line, guess that it is part of this item
1240 // and move on to the next line
1241 if p.isEmpty(data[line:i]) > 0 {
1242 containsBlankLine = true
1243 line = i
1244 continue
1245 }
1246
1247 // calculate the indentation
1248 indent := 0
1249 for indent < 4 && line+indent < i && data[line+indent] == ' ' {
1250 indent++
1251 }
1252
1253 chunk := data[line+indent : i]
1254
1255 // evaluate how this line fits in
1256 switch {
1257 // is this a nested list item?
1258 case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
1259 p.oliPrefix(chunk) > 0 ||
1260 p.dliPrefix(chunk) > 0:
1261
1262 if containsBlankLine {
1263 *flags |= ListItemContainsBlock
1264 }
1265
1266 // to be a nested list, it must be indented more
1267 // if not, it is the next item in the same list
1268 if indent <= itemIndent {
1269 break gatherlines
1270 }
1271
1272 // is this the first item in the nested list?
1273 if sublist == 0 {
1274 sublist = raw.Len()
1275 }
1276
1277 // is this a nested prefix header?
1278 case p.isPrefixHeader(chunk):
1279 // if the header is not indented, it is not nested in the list
1280 // and thus ends the list
1281 if containsBlankLine && indent < 4 {
1282 *flags |= ListItemEndOfList
1283 break gatherlines
1284 }
1285 *flags |= ListItemContainsBlock
1286
1287 // anything following an empty line is only part
1288 // of this item if it is indented 4 spaces
1289 // (regardless of the indentation of the beginning of the item)
1290 case containsBlankLine && indent < 4:
1291 if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
1292 // is the next item still a part of this list?
1293 next := i
1294 for data[next] != '\n' {
1295 next++
1296 }
1297 for next < len(data)-1 && data[next] == '\n' {
1298 next++
1299 }
1300 if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
1301 *flags |= ListItemEndOfList
1302 }
1303 } else {
1304 *flags |= ListItemEndOfList
1305 }
1306 break gatherlines
1307
1308 // a blank line means this should be parsed as a block
1309 case containsBlankLine:
1310 raw.WriteByte('\n')
1311 *flags |= ListItemContainsBlock
1312 }
1313
1314 // if this line was preceded by one or more blanks,
1315 // re-introduce the blank into the buffer
1316 if containsBlankLine {
1317 containsBlankLine = false
1318 raw.WriteByte('\n')
1319 }
1320
1321 // add the line into the working buffer without prefix
1322 raw.Write(data[line+indent : i])
1323
1324 line = i
1325 }
1326
1327 rawBytes := raw.Bytes()
1328
1329 block := p.addBlock(Item, nil)
1330 block.ListFlags = *flags
1331 block.Tight = false
1332 block.BulletChar = bulletChar
1333 block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark
1334
1335 // render the contents of the list item
1336 if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
1337 // intermediate render of block item, except for definition term
1338 if sublist > 0 {
1339 p.block(rawBytes[:sublist])
1340 p.block(rawBytes[sublist:])
1341 } else {
1342 p.block(rawBytes)
1343 }
1344 } else {
1345 // intermediate render of inline item
1346 if sublist > 0 {
1347 child := p.addChild(Paragraph, 0)
1348 child.content = rawBytes[:sublist]
1349 p.block(rawBytes[sublist:])
1350 } else {
1351 child := p.addChild(Paragraph, 0)
1352 child.content = rawBytes
1353 }
1354 }
1355 return line
1356}
1357
1358// render a single paragraph that has already been parsed out
1359func (p *parser) renderParagraph(data []byte) {
1360 if len(data) == 0 {
1361 return
1362 }
1363
1364 // trim leading spaces
1365 beg := 0
1366 for data[beg] == ' ' {
1367 beg++
1368 }
1369
1370 // trim trailing newline
1371 end := len(data) - 1
1372
1373 // trim trailing spaces
1374 for end > beg && data[end-1] == ' ' {
1375 end--
1376 }
1377
1378 p.addBlock(Paragraph, data[beg:end])
1379}
1380
1381func (p *parser) paragraph(data []byte) int {
1382 // prev: index of 1st char of previous line
1383 // line: index of 1st char of current line
1384 // i: index of cursor/end of current line
1385 var prev, line, i int
1386
1387 // keep going until we find something to mark the end of the paragraph
1388 for i < len(data) {
1389 // mark the beginning of the current line
1390 prev = line
1391 current := data[i:]
1392 line = i
1393
1394 // did we find a blank line marking the end of the paragraph?
1395 if n := p.isEmpty(current); n > 0 {
1396 // did this blank line followed by a definition list item?
1397 if p.flags&DefinitionLists != 0 {
1398 if i < len(data)-1 && data[i+1] == ':' {
1399 return p.list(data[prev:], ListTypeDefinition)
1400 }
1401 }
1402
1403 p.renderParagraph(data[:i])
1404 return i + n
1405 }
1406
1407 // an underline under some text marks a header, so our paragraph ended on prev line
1408 if i > 0 {
1409 if level := p.isUnderlinedHeader(current); level > 0 {
1410 // render the paragraph
1411 p.renderParagraph(data[:prev])
1412
1413 // ignore leading and trailing whitespace
1414 eol := i - 1
1415 for prev < eol && data[prev] == ' ' {
1416 prev++
1417 }
1418 for eol > prev && data[eol-1] == ' ' {
1419 eol--
1420 }
1421
1422 id := ""
1423 if p.flags&AutoHeaderIDs != 0 {
1424 id = sanitized_anchor_name.Create(string(data[prev:eol]))
1425 }
1426
1427 block := p.addBlock(Header, data[prev:eol])
1428 block.Level = level
1429 block.HeaderID = id
1430
1431 // find the end of the underline
1432 for data[i] != '\n' {
1433 i++
1434 }
1435 return i
1436 }
1437 }
1438
1439 // if the next line starts a block of HTML, then the paragraph ends here
1440 if p.flags&LaxHTMLBlocks != 0 {
1441 if data[i] == '<' && p.html(current, false) > 0 {
1442 // rewind to before the HTML block
1443 p.renderParagraph(data[:i])
1444 return i
1445 }
1446 }
1447
1448 // if there's a prefixed header or a horizontal rule after this, paragraph is over
1449 if p.isPrefixHeader(current) || p.isHRule(current) {
1450 p.renderParagraph(data[:i])
1451 return i
1452 }
1453
1454 // if there's a fenced code block, paragraph is over
1455 if p.flags&FencedCode != 0 {
1456 if p.fencedCodeBlock(current, false) > 0 {
1457 p.renderParagraph(data[:i])
1458 return i
1459 }
1460 }
1461
1462 // if there's a definition list item, prev line is a definition term
1463 if p.flags&DefinitionLists != 0 {
1464 if p.dliPrefix(current) != 0 {
1465 return p.list(data[prev:], ListTypeDefinition)
1466 }
1467 }
1468
1469 // if there's a list after this, paragraph is over
1470 if p.flags&NoEmptyLineBeforeBlock != 0 {
1471 if p.uliPrefix(current) != 0 ||
1472 p.oliPrefix(current) != 0 ||
1473 p.quotePrefix(current) != 0 ||
1474 p.codePrefix(current) != 0 {
1475 p.renderParagraph(data[:i])
1476 return i
1477 }
1478 }
1479
1480 // otherwise, scan to the beginning of the next line
1481 for data[i] != '\n' {
1482 i++
1483 }
1484 i++
1485 }
1486
1487 p.renderParagraph(data[:i])
1488 return i
1489}
1490
// skipChar returns the index of the first byte at or after start that is not
// char, or len(data) if every remaining byte is char. A start at or beyond
// len(data) is returned unchanged.
func skipChar(data []byte, start int, char byte) int {
	for ; start < len(data); start++ {
		if data[start] != char {
			break
		}
	}
	return start
}
1498
// skipUntilChar returns the index of the first occurrence of char in text at
// or after start, or len(text) if there is none. A start at or beyond
// len(text) is returned unchanged.
func skipUntilChar(text []byte, start int, char byte) int {
	for ; start < len(text); start++ {
		if text[start] == char {
			break
		}
	}
	return start
}
1505}