block.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse block-level elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18 "html"
19 "regexp"
20
21 "github.com/shurcooL/sanitized_anchor_name"
22)
23
const (
	// charEntity matches an HTML character reference: hexadecimal
	// (&#x1F;), decimal (&#31;) or named (&amp;).
	charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"
	// escapable is a character class of the ASCII punctuation characters
	// that may follow a backslash in an escape sequence.
	escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
)

var (
	// reBackslashOrAmp is a cheap pre-check used by unescapeString: it
	// reports whether a buffer contains a backslash or an ampersand at all.
	//
	// A raw string is used on purpose. Written as the interpreted string
	// "[\\&]" the pattern reaches the regexp engine as `[\&]`, i.e. an escaped
	// ampersand, and a lone backslash would never match.
	reBackslashOrAmp = regexp.MustCompile(`[\\&]`)
	// reEntityOrEscapedChar matches either a backslash followed by an
	// escapable punctuation character, or a character reference
	// (case-insensitively).
	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity)
)
33
34// Parse block-level data.
35// Note: this function and many that it calls assume that
36// the input buffer ends with a newline.
37func (p *parser) block(data []byte) {
38 if len(data) == 0 || data[len(data)-1] != '\n' {
39 panic("block input is missing terminating newline")
40 }
41
42 // this is called recursively: enforce a maximum depth
43 if p.nesting >= p.maxNesting {
44 return
45 }
46 p.nesting++
47
48 // parse out one block-level construct at a time
49 for len(data) > 0 {
50 // prefixed header:
51 //
52 // # Header 1
53 // ## Header 2
54 // ...
55 // ###### Header 6
56 if p.isPrefixHeader(data) {
57 data = data[p.prefixHeader(data):]
58 continue
59 }
60
61 // block of preformatted HTML:
62 //
63 // <div>
64 // ...
65 // </div>
66 if data[0] == '<' {
67 if i := p.html(data, true); i > 0 {
68 data = data[i:]
69 continue
70 }
71 }
72
73 // title block
74 //
75 // % stuff
76 // % more stuff
77 // % even more stuff
78 if p.flags&Titleblock != 0 {
79 if data[0] == '%' {
80 if i := p.titleBlock(data, true); i > 0 {
81 data = data[i:]
82 continue
83 }
84 }
85 }
86
87 // blank lines. note: returns the # of bytes to skip
88 if i := p.isEmpty(data); i > 0 {
89 data = data[i:]
90 continue
91 }
92
93 // indented code block:
94 //
95 // func max(a, b int) int {
96 // if a > b {
97 // return a
98 // }
99 // return b
100 // }
101 if p.codePrefix(data) > 0 {
102 data = data[p.code(data):]
103 continue
104 }
105
106 // fenced code block:
107 //
108 // ``` go
109 // func fact(n int) int {
110 // if n <= 1 {
111 // return n
112 // }
113 // return n * fact(n-1)
114 // }
115 // ```
116 if p.flags&FencedCode != 0 {
117 if i := p.fencedCodeBlock(data, true); i > 0 {
118 data = data[i:]
119 continue
120 }
121 }
122
123 // horizontal rule:
124 //
125 // ------
126 // or
127 // ******
128 // or
129 // ______
130 if p.isHRule(data) {
131 p.addBlock(HorizontalRule, nil)
132 var i int
133 for i = 0; data[i] != '\n'; i++ {
134 }
135 data = data[i:]
136 continue
137 }
138
139 // block quote:
140 //
141 // > A big quote I found somewhere
142 // > on the web
143 if p.quotePrefix(data) > 0 {
144 data = data[p.quote(data):]
145 continue
146 }
147
148 // table:
149 //
150 // Name | Age | Phone
151 // ------|-----|---------
152 // Bob | 31 | 555-1234
153 // Alice | 27 | 555-4321
154 if p.flags&Tables != 0 {
155 if i := p.table(data); i > 0 {
156 data = data[i:]
157 continue
158 }
159 }
160
161 // an itemized/unordered list:
162 //
163 // * Item 1
164 // * Item 2
165 //
166 // also works with + or -
167 if p.uliPrefix(data) > 0 {
168 data = data[p.list(data, 0):]
169 continue
170 }
171
172 // a numbered/ordered list:
173 //
174 // 1. Item 1
175 // 2. Item 2
176 if p.oliPrefix(data) > 0 {
177 data = data[p.list(data, ListTypeOrdered):]
178 continue
179 }
180
181 // definition lists:
182 //
183 // Term 1
184 // : Definition a
185 // : Definition b
186 //
187 // Term 2
188 // : Definition c
189 if p.flags&DefinitionLists != 0 {
190 if p.dliPrefix(data) > 0 {
191 data = data[p.list(data, ListTypeDefinition):]
192 continue
193 }
194 }
195
196 // anything else must look like a normal paragraph
197 // note: this finds underlined headers, too
198 data = data[p.paragraph(data):]
199 }
200
201 p.nesting--
202}
203
204func (p *parser) addBlock(typ NodeType, content []byte) *Node {
205 p.closeUnmatchedBlocks()
206 container := p.addChild(typ, 0)
207 container.content = content
208 return container
209}
210
211func (p *parser) isPrefixHeader(data []byte) bool {
212 if data[0] != '#' {
213 return false
214 }
215
216 if p.flags&SpaceHeaders != 0 {
217 level := 0
218 for level < 6 && data[level] == '#' {
219 level++
220 }
221 if data[level] != ' ' {
222 return false
223 }
224 }
225 return true
226}
227
228func (p *parser) prefixHeader(data []byte) int {
229 level := 0
230 for level < 6 && data[level] == '#' {
231 level++
232 }
233 i := skipChar(data, level, ' ')
234 end := skipUntilChar(data, i, '\n')
235 skip := end
236 id := ""
237 if p.flags&HeaderIDs != 0 {
238 j, k := 0, 0
239 // find start/end of header id
240 for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
241 }
242 for k = j + 1; k < end && data[k] != '}'; k++ {
243 }
244 // extract header id iff found
245 if j < end && k < end {
246 id = string(data[j+2 : k])
247 end = j
248 skip = k + 1
249 for end > 0 && data[end-1] == ' ' {
250 end--
251 }
252 }
253 }
254 for end > 0 && data[end-1] == '#' {
255 if isBackslashEscaped(data, end-1) {
256 break
257 }
258 end--
259 }
260 for end > 0 && data[end-1] == ' ' {
261 end--
262 }
263 if end > i {
264 if id == "" && p.flags&AutoHeaderIDs != 0 {
265 id = sanitized_anchor_name.Create(string(data[i:end]))
266 }
267 block := p.addBlock(Header, data[i:end])
268 block.HeaderID = id
269 block.Level = level
270 }
271 return skip
272}
273
274func (p *parser) isUnderlinedHeader(data []byte) int {
275 // test of level 1 header
276 if data[0] == '=' {
277 i := skipChar(data, 1, '=')
278 i = skipChar(data, i, ' ')
279 if data[i] == '\n' {
280 return 1
281 }
282 return 0
283 }
284
285 // test of level 2 header
286 if data[0] == '-' {
287 i := skipChar(data, 1, '-')
288 i = skipChar(data, i, ' ')
289 if data[i] == '\n' {
290 return 2
291 }
292 return 0
293 }
294
295 return 0
296}
297
298func (p *parser) titleBlock(data []byte, doRender bool) int {
299 if data[0] != '%' {
300 return 0
301 }
302 splitData := bytes.Split(data, []byte("\n"))
303 var i int
304 for idx, b := range splitData {
305 if !bytes.HasPrefix(b, []byte("%")) {
306 i = idx // - 1
307 break
308 }
309 }
310
311 data = bytes.Join(splitData[0:i], []byte("\n"))
312 consumed := len(data)
313 data = bytes.TrimPrefix(data, []byte("% "))
314 data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
315 block := p.addBlock(Header, data)
316 block.Level = 1
317 block.IsTitleblock = true
318
319 return consumed
320}
321
322func (p *parser) html(data []byte, doRender bool) int {
323 var i, j int
324
325 // identify the opening tag
326 if data[0] != '<' {
327 return 0
328 }
329 curtag, tagfound := p.htmlFindTag(data[1:])
330
331 // handle special cases
332 if !tagfound {
333 // check for an HTML comment
334 if size := p.htmlComment(data, doRender); size > 0 {
335 return size
336 }
337
338 // check for an <hr> tag
339 if size := p.htmlHr(data, doRender); size > 0 {
340 return size
341 }
342
343 // no special case recognized
344 return 0
345 }
346
347 // look for an unindented matching closing tag
348 // followed by a blank line
349 found := false
350 /*
351 closetag := []byte("\n</" + curtag + ">")
352 j = len(curtag) + 1
353 for !found {
354 // scan for a closing tag at the beginning of a line
355 if skip := bytes.Index(data[j:], closetag); skip >= 0 {
356 j += skip + len(closetag)
357 } else {
358 break
359 }
360
361 // see if it is the only thing on the line
362 if skip := p.isEmpty(data[j:]); skip > 0 {
363 // see if it is followed by a blank line/eof
364 j += skip
365 if j >= len(data) {
366 found = true
367 i = j
368 } else {
369 if skip := p.isEmpty(data[j:]); skip > 0 {
370 j += skip
371 found = true
372 i = j
373 }
374 }
375 }
376 }
377 */
378
379 // if not found, try a second pass looking for indented match
380 // but not if tag is "ins" or "del" (following original Markdown.pl)
381 if !found && curtag != "ins" && curtag != "del" {
382 i = 1
383 for i < len(data) {
384 i++
385 for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
386 i++
387 }
388
389 if i+2+len(curtag) >= len(data) {
390 break
391 }
392
393 j = p.htmlFindEnd(curtag, data[i-1:])
394
395 if j > 0 {
396 i += j - 1
397 found = true
398 break
399 }
400 }
401 }
402
403 if !found {
404 return 0
405 }
406
407 // the end of the block has been found
408 if doRender {
409 // trim newlines
410 end := i
411 for end > 0 && data[end-1] == '\n' {
412 end--
413 }
414 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
415 }
416
417 return i
418}
419
420func finalizeHTMLBlock(block *Node) {
421 block.Literal = block.content
422 block.content = nil
423}
424
425// HTML comment, lax form
426func (p *parser) htmlComment(data []byte, doRender bool) int {
427 i := p.inlineHTMLComment(data)
428 // needs to end with a blank line
429 if j := p.isEmpty(data[i:]); j > 0 {
430 size := i + j
431 if doRender {
432 // trim trailing newlines
433 end := size
434 for end > 0 && data[end-1] == '\n' {
435 end--
436 }
437 block := p.addBlock(HTMLBlock, data[:end])
438 finalizeHTMLBlock(block)
439 }
440 return size
441 }
442 return 0
443}
444
445// HR, which is the only self-closing block tag considered
446func (p *parser) htmlHr(data []byte, doRender bool) int {
447 if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
448 return 0
449 }
450 if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
451 // not an <hr> tag after all; at least not a valid one
452 return 0
453 }
454
455 i := 3
456 for data[i] != '>' && data[i] != '\n' {
457 i++
458 }
459
460 if data[i] == '>' {
461 i++
462 if j := p.isEmpty(data[i:]); j > 0 {
463 size := i + j
464 if doRender {
465 // trim newlines
466 end := size
467 for end > 0 && data[end-1] == '\n' {
468 end--
469 }
470 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
471 }
472 return size
473 }
474 }
475
476 return 0
477}
478
479func (p *parser) htmlFindTag(data []byte) (string, bool) {
480 i := 0
481 for isalnum(data[i]) {
482 i++
483 }
484 key := string(data[:i])
485 if _, ok := blockTags[key]; ok {
486 return key, true
487 }
488 return "", false
489}
490
491func (p *parser) htmlFindEnd(tag string, data []byte) int {
492 // assume data[0] == '<' && data[1] == '/' already tested
493 if tag == "hr" {
494 return 2
495 }
496 // check if tag is a match
497 closetag := []byte("</" + tag + ">")
498 if !bytes.HasPrefix(data, closetag) {
499 return 0
500 }
501 i := len(closetag)
502
503 // check that the rest of the line is blank
504 skip := 0
505 if skip = p.isEmpty(data[i:]); skip == 0 {
506 return 0
507 }
508 i += skip
509 skip = 0
510
511 if i >= len(data) {
512 return i
513 }
514
515 if p.flags&LaxHTMLBlocks != 0 {
516 return i
517 }
518 if skip = p.isEmpty(data[i:]); skip == 0 {
519 // following line must be blank
520 return 0
521 }
522
523 return i + skip
524}
525
526func (*parser) isEmpty(data []byte) int {
527 // it is okay to call isEmpty on an empty buffer
528 if len(data) == 0 {
529 return 0
530 }
531
532 var i int
533 for i = 0; i < len(data) && data[i] != '\n'; i++ {
534 if data[i] != ' ' && data[i] != '\t' {
535 return 0
536 }
537 }
538 return i + 1
539}
540
541func (*parser) isHRule(data []byte) bool {
542 i := 0
543
544 // skip up to three spaces
545 for i < 3 && data[i] == ' ' {
546 i++
547 }
548
549 // look at the hrule char
550 if data[i] != '*' && data[i] != '-' && data[i] != '_' {
551 return false
552 }
553 c := data[i]
554
555 // the whole line must be the char or whitespace
556 n := 0
557 for data[i] != '\n' {
558 switch {
559 case data[i] == c:
560 n++
561 case data[i] != ' ':
562 return false
563 }
564 i++
565 }
566
567 return n >= 3
568}
569
570// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
571// and returns the end index if so, or 0 otherwise. It also returns the marker found.
572// If syntax is not nil, it gets set to the syntax specified in the fence line.
573// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
574func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
575 i, size := 0, 0
576
577 // skip up to three spaces
578 for i < len(data) && i < 3 && data[i] == ' ' {
579 i++
580 }
581
582 // check for the marker characters: ~ or `
583 if i >= len(data) {
584 return 0, ""
585 }
586 if data[i] != '~' && data[i] != '`' {
587 return 0, ""
588 }
589
590 c := data[i]
591
592 // the whole line must be the same char or whitespace
593 for i < len(data) && data[i] == c {
594 size++
595 i++
596 }
597
598 // the marker char must occur at least 3 times
599 if size < 3 {
600 return 0, ""
601 }
602 marker = string(data[i-size : i])
603
604 // if this is the end marker, it must match the beginning marker
605 if oldmarker != "" && marker != oldmarker {
606 return 0, ""
607 }
608
609 // TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
610 // into one, always get the syntax, and discard it if the caller doesn't care.
611 if syntax != nil {
612 syn := 0
613 i = skipChar(data, i, ' ')
614
615 if i >= len(data) {
616 if newlineOptional && i == len(data) {
617 return i, marker
618 }
619 return 0, ""
620 }
621
622 syntaxStart := i
623
624 if data[i] == '{' {
625 i++
626 syntaxStart++
627
628 for i < len(data) && data[i] != '}' && data[i] != '\n' {
629 syn++
630 i++
631 }
632
633 if i >= len(data) || data[i] != '}' {
634 return 0, ""
635 }
636
637 // strip all whitespace at the beginning and the end
638 // of the {} block
639 for syn > 0 && isspace(data[syntaxStart]) {
640 syntaxStart++
641 syn--
642 }
643
644 for syn > 0 && isspace(data[syntaxStart+syn-1]) {
645 syn--
646 }
647
648 i++
649 } else {
650 for i < len(data) && !isspace(data[i]) {
651 syn++
652 i++
653 }
654 }
655
656 *syntax = string(data[syntaxStart : syntaxStart+syn])
657 }
658
659 i = skipChar(data, i, ' ')
660 if i >= len(data) || data[i] != '\n' {
661 if newlineOptional && i == len(data) {
662 return i, marker
663 }
664 return 0, ""
665 }
666
667 return i + 1, marker // Take newline into account.
668}
669
670// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
671// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
672// If doRender is true, a final newline is mandatory to recognize the fenced code block.
673func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
674 var syntax string
675 beg, marker := isFenceLine(data, &syntax, "", false)
676 if beg == 0 || beg >= len(data) {
677 return 0
678 }
679
680 var work bytes.Buffer
681 work.Write([]byte(syntax))
682 work.WriteByte('\n')
683
684 for {
685 // safe to assume beg < len(data)
686
687 // check for the end of the code block
688 newlineOptional := !doRender
689 fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
690 if fenceEnd != 0 {
691 beg += fenceEnd
692 break
693 }
694
695 // copy the current line
696 end := skipUntilChar(data, beg, '\n') + 1
697
698 // did we reach the end of the buffer without a closing marker?
699 if end >= len(data) {
700 return 0
701 }
702
703 // verbatim copy to the working buffer
704 if doRender {
705 work.Write(data[beg:end])
706 }
707 beg = end
708 }
709
710 if doRender {
711 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
712 block.IsFenced = true
713 finalizeCodeBlock(block)
714 }
715
716 return beg
717}
718
// unescapeChar converts a single escape sequence to the text it stands for.
// A sequence starting with a backslash yields the byte that follows it; any
// other input is decoded as an HTML entity via html.UnescapeString.
//
// str must be non-empty, and at least two bytes long when it starts with a
// backslash.
func unescapeChar(str []byte) []byte {
	if str[0] != '\\' {
		return []byte(html.UnescapeString(string(str)))
	}
	return []byte{str[1]}
}
725
726func unescapeString(str []byte) []byte {
727 if reBackslashOrAmp.Match(str) {
728 return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
729 }
730 return str
731}
732
733func finalizeCodeBlock(block *Node) {
734 if block.IsFenced {
735 newlinePos := bytes.IndexByte(block.content, '\n')
736 firstLine := block.content[:newlinePos]
737 rest := block.content[newlinePos+1:]
738 block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
739 block.Literal = rest
740 } else {
741 block.Literal = block.content
742 }
743 block.content = nil
744}
745
746func (p *parser) table(data []byte) int {
747 table := p.addBlock(Table, nil)
748 i, columns := p.tableHeader(data)
749 if i == 0 {
750 p.tip = table.Parent
751 table.Unlink()
752 return 0
753 }
754
755 p.addBlock(TableBody, nil)
756
757 for i < len(data) {
758 pipes, rowStart := 0, i
759 for ; data[i] != '\n'; i++ {
760 if data[i] == '|' {
761 pipes++
762 }
763 }
764
765 if pipes == 0 {
766 i = rowStart
767 break
768 }
769
770 // include the newline in data sent to tableRow
771 i++
772 p.tableRow(data[rowStart:i], columns, false)
773 }
774
775 return i
776}
777
// isBackslashEscaped reports whether the byte at index i of data is escaped,
// that is, whether it is immediately preceded by an odd number of
// backslashes.
func isBackslashEscaped(data []byte, i int) bool {
	count := 0
	for j := i - 1; j >= 0 && data[j] == '\\'; j-- {
		count++
	}
	return count%2 == 1
}
786
// tableHeader parses a table header row and the underline row beneath it.
//
// On success it adds a TableHead block, emits the header row through
// tableRow, and returns the number of bytes consumed (both lines including
// their newlines) together with the per-column alignment flags taken from
// the underline row. On failure size is 0; note that columns may already be
// allocated (non-nil) at that point, so callers must test size, not columns.
//
// The first line must be newline-terminated and contain at least one
// unescaped '|'.
func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
	i := 0
	// count the columns: one more than the number of unescaped pipes
	colCount := 1
	for i = 0; data[i] != '\n'; i++ {
		if data[i] == '|' && !isBackslashEscaped(data, i) {
			colCount++
		}
	}

	// doesn't look like a table header
	if colCount == 1 {
		return
	}

	// include the newline in the data sent to tableRow
	header := data[:i+1]

	// column count ignores pipes at beginning or end of line
	if data[0] == '|' {
		colCount--
	}
	if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
		colCount--
	}

	columns = make([]CellAlignFlags, colCount)

	// move on to the header underline
	i++
	if i >= len(data) {
		return
	}

	// an optional leading pipe and leading spaces are skipped
	if data[i] == '|' && !isBackslashEscaped(data, i) {
		i++
	}
	i = skipChar(data, i, ' ')

	// each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
	// and trailing | optional on last column
	col := 0
	for data[i] != '\n' {
		// counts dashes and colons alike for the ">= 3" rule
		dashes := 0

		// a leading colon requests left alignment
		if data[i] == ':' {
			i++
			columns[col] |= TableAlignmentLeft
			dashes++
		}
		for data[i] == '-' {
			i++
			dashes++
		}
		// a trailing colon requests right alignment; setting both flags
		// on one column is possible
		if data[i] == ':' {
			i++
			columns[col] |= TableAlignmentRight
			dashes++
		}
		for data[i] == ' ' {
			i++
		}

		// end of column test is messy
		// (the cases are order-dependent: the first one that holds wins)
		switch {
		case dashes < 3:
			// not a valid column
			return

		case data[i] == '|' && !isBackslashEscaped(data, i):
			// marker found, now skip past trailing whitespace
			col++
			i++
			for data[i] == ' ' {
				i++
			}

			// trailing junk found after last column
			if col >= colCount && data[i] != '\n' {
				return
			}

		case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
			// something else found where marker was required
			return

		case data[i] == '\n':
			// marker is optional for the last column
			col++

		default:
			// trailing junk found after last column
			return
		}
	}
	// the underline must describe exactly as many columns as the header
	if col != colCount {
		return
	}

	p.addBlock(TableHead, nil)
	p.tableRow(header, columns, true)
	// +1 to consume the underline's newline
	size = i + 1
	return
}
890
891func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
892 p.addBlock(TableRow, nil)
893 i, col := 0, 0
894
895 if data[i] == '|' && !isBackslashEscaped(data, i) {
896 i++
897 }
898
899 for col = 0; col < len(columns) && i < len(data); col++ {
900 for data[i] == ' ' {
901 i++
902 }
903
904 cellStart := i
905
906 for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
907 i++
908 }
909
910 cellEnd := i
911
912 // skip the end-of-cell marker, possibly taking us past end of buffer
913 i++
914
915 for cellEnd > cellStart && data[cellEnd-1] == ' ' {
916 cellEnd--
917 }
918
919 cell := p.addBlock(TableCell, data[cellStart:cellEnd])
920 cell.IsHeader = header
921 cell.Align = columns[col]
922 }
923
924 // pad it out with empty columns to get the right number
925 for ; col < len(columns); col++ {
926 cell := p.addBlock(TableCell, nil)
927 cell.IsHeader = header
928 cell.Align = columns[col]
929 }
930
931 // silently ignore rows with too many cells
932}
933
934// returns blockquote prefix length
935func (p *parser) quotePrefix(data []byte) int {
936 i := 0
937 for i < 3 && data[i] == ' ' {
938 i++
939 }
940 if data[i] == '>' {
941 if data[i+1] == ' ' {
942 return i + 2
943 }
944 return i + 1
945 }
946 return 0
947}
948
949// blockquote ends with at least one blank line
950// followed by something without a blockquote prefix
951func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
952 if p.isEmpty(data[beg:]) <= 0 {
953 return false
954 }
955 if end >= len(data) {
956 return true
957 }
958 return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
959}
960
961// parse a blockquote fragment
962func (p *parser) quote(data []byte) int {
963 block := p.addBlock(BlockQuote, nil)
964 var raw bytes.Buffer
965 beg, end := 0, 0
966 for beg < len(data) {
967 end = beg
968 // Step over whole lines, collecting them. While doing that, check for
969 // fenced code and if one's found, incorporate it altogether,
970 // irregardless of any contents inside it
971 for data[end] != '\n' {
972 if p.flags&FencedCode != 0 {
973 if i := p.fencedCodeBlock(data[end:], false); i > 0 {
974 // -1 to compensate for the extra end++ after the loop:
975 end += i - 1
976 break
977 }
978 }
979 end++
980 }
981 end++
982 if pre := p.quotePrefix(data[beg:]); pre > 0 {
983 // skip the prefix
984 beg += pre
985 } else if p.terminateBlockquote(data, beg, end) {
986 break
987 }
988 // this line is part of the blockquote
989 raw.Write(data[beg:end])
990 beg = end
991 }
992 p.block(raw.Bytes())
993 p.finalize(block)
994 return end
995}
996
997// returns prefix length for block code
998func (p *parser) codePrefix(data []byte) int {
999 if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1000 return 4
1001 }
1002 return 0
1003}
1004
1005func (p *parser) code(data []byte) int {
1006 var work bytes.Buffer
1007
1008 i := 0
1009 for i < len(data) {
1010 beg := i
1011 for data[i] != '\n' {
1012 i++
1013 }
1014 i++
1015
1016 blankline := p.isEmpty(data[beg:i]) > 0
1017 if pre := p.codePrefix(data[beg:i]); pre > 0 {
1018 beg += pre
1019 } else if !blankline {
1020 // non-empty, non-prefixed line breaks the pre
1021 i = beg
1022 break
1023 }
1024
1025 // verbatim copy to the working buffeu
1026 if blankline {
1027 work.WriteByte('\n')
1028 } else {
1029 work.Write(data[beg:i])
1030 }
1031 }
1032
1033 // trim all the \n off the end of work
1034 workbytes := work.Bytes()
1035 eol := len(workbytes)
1036 for eol > 0 && workbytes[eol-1] == '\n' {
1037 eol--
1038 }
1039 if eol != len(workbytes) {
1040 work.Truncate(eol)
1041 }
1042
1043 work.WriteByte('\n')
1044
1045 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1046 block.IsFenced = false
1047 finalizeCodeBlock(block)
1048
1049 return i
1050}
1051
1052// returns unordered list item prefix
1053func (p *parser) uliPrefix(data []byte) int {
1054 i := 0
1055
1056 // start with up to 3 spaces
1057 for i < 3 && data[i] == ' ' {
1058 i++
1059 }
1060
1061 // need a *, +, or - followed by a space
1062 if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1063 data[i+1] != ' ' {
1064 return 0
1065 }
1066 return i + 2
1067}
1068
1069// returns ordered list item prefix
1070func (p *parser) oliPrefix(data []byte) int {
1071 i := 0
1072
1073 // start with up to 3 spaces
1074 for i < 3 && data[i] == ' ' {
1075 i++
1076 }
1077
1078 // count the digits
1079 start := i
1080 for data[i] >= '0' && data[i] <= '9' {
1081 i++
1082 }
1083
1084 // we need >= 1 digits followed by a dot and a space
1085 if start == i || data[i] != '.' || data[i+1] != ' ' {
1086 return 0
1087 }
1088 return i + 2
1089}
1090
1091// returns definition list item prefix
1092func (p *parser) dliPrefix(data []byte) int {
1093 i := 0
1094
1095 // need a : followed by a spaces
1096 if data[i] != ':' || data[i+1] != ' ' {
1097 return 0
1098 }
1099 for data[i] == ' ' {
1100 i++
1101 }
1102 return i + 2
1103}
1104
1105// parse ordered or unordered list block
1106func (p *parser) list(data []byte, flags ListType) int {
1107 i := 0
1108 flags |= ListItemBeginningOfList
1109 block := p.addBlock(List, nil)
1110 block.ListFlags = flags
1111 block.Tight = true
1112
1113 for i < len(data) {
1114 skip := p.listItem(data[i:], &flags)
1115 if flags&ListItemContainsBlock != 0 {
1116 block.ListData.Tight = false
1117 }
1118 i += skip
1119 if skip == 0 || flags&ListItemEndOfList != 0 {
1120 break
1121 }
1122 flags &= ^ListItemBeginningOfList
1123 }
1124
1125 above := block.Parent
1126 finalizeList(block)
1127 p.tip = above
1128 return i
1129}
1130
1131// Returns true if block ends with a blank line, descending if needed
1132// into lists and sublists.
1133func endsWithBlankLine(block *Node) bool {
1134 // TODO: figure this out. Always false now.
1135 for block != nil {
1136 //if block.lastLineBlank {
1137 //return true
1138 //}
1139 t := block.Type
1140 if t == List || t == Item {
1141 block = block.LastChild
1142 } else {
1143 break
1144 }
1145 }
1146 return false
1147}
1148
1149func finalizeList(block *Node) {
1150 block.open = false
1151 item := block.FirstChild
1152 for item != nil {
1153 // check for non-final list item ending with blank line:
1154 if endsWithBlankLine(item) && item.Next != nil {
1155 block.ListData.Tight = false
1156 break
1157 }
1158 // recurse into children of list item, to see if there are spaces
1159 // between any of them:
1160 subItem := item.FirstChild
1161 for subItem != nil {
1162 if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1163 block.ListData.Tight = false
1164 break
1165 }
1166 subItem = subItem.Next
1167 }
1168 item = item.Next
1169 }
1170}
1171
// Parse a single list item.
// Assumes initial prefix is already removed if this is a sublist.
//
// listItem gathers the lines that belong to the item starting at data[0]
// into a working buffer (prefix and indentation removed), adds an Item block
// and parses the buffer as the item's contents. It returns the number of
// bytes of data consumed, or 0 if data does not start a list item (and the
// list is not a definition list, where an unprefixed line is a term).
//
// flags is both input and output: it carries the list type in, and
// ListItemContainsBlock, ListItemEndOfList and ListTypeTerm are updated for
// the caller.
func (p *parser) listItem(data []byte, flags *ListType) int {
	// keep track of the indentation of the first line
	itemIndent := 0
	for itemIndent < 3 && data[itemIndent] == ' ' {
		itemIndent++
	}

	// try the unordered, ordered and definition prefixes in turn;
	// bulletChar keeps its default unless an unordered prefix matched
	var bulletChar byte = '*'
	i := p.uliPrefix(data)
	if i == 0 {
		i = p.oliPrefix(data)
	} else {
		bulletChar = data[i-2]
	}
	if i == 0 {
		i = p.dliPrefix(data)
		// reset definition term flag
		if i > 0 {
			*flags &= ^ListTypeTerm
		}
	}
	if i == 0 {
		// if in definition list, set term flag and continue
		if *flags&ListTypeDefinition != 0 {
			*flags |= ListTypeTerm
		} else {
			return 0
		}
	}

	// skip leading whitespace on first line
	for data[i] == ' ' {
		i++
	}

	// find the end of the line
	// (when i is 0, i.e. an unprefixed definition term with no leading
	// spaces, this loop does not run and the first line is picked up by the
	// gathering loop below instead)
	line := i
	for i > 0 && data[i-1] != '\n' {
		i++
	}

	// get working buffer
	var raw bytes.Buffer

	// put the first line into the working buffer
	raw.Write(data[line:i])
	line = i

	// process the following lines
	containsBlankLine := false
	// offset in raw at which the first nested list item starts, 0 if none
	sublist := 0

gatherlines:
	for line < len(data) {
		i++

		// find the end of this line
		for data[i-1] != '\n' {
			i++
		}

		// if it is an empty line, guess that it is part of this item
		// and move on to the next line
		if p.isEmpty(data[line:i]) > 0 {
			containsBlankLine = true
			line = i
			continue
		}

		// calculate the indentation
		indent := 0
		for indent < 4 && line+indent < i && data[line+indent] == ' ' {
			indent++
		}

		// the current line without its (at most four spaces of) indentation
		chunk := data[line+indent : i]

		// evaluate how this line fits in
		switch {
		// is this a nested list item?
		case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
			p.oliPrefix(chunk) > 0 ||
			p.dliPrefix(chunk) > 0:

			if containsBlankLine {
				*flags |= ListItemContainsBlock
			}

			// to be a nested list, it must be indented more
			// if not, it is the next item in the same list
			if indent <= itemIndent {
				break gatherlines
			}

			// is this the first item in the nested list?
			if sublist == 0 {
				sublist = raw.Len()
			}

		// is this a nested prefix header?
		case p.isPrefixHeader(chunk):
			// if the header is not indented, it is not nested in the list
			// and thus ends the list
			if containsBlankLine && indent < 4 {
				*flags |= ListItemEndOfList
				break gatherlines
			}
			*flags |= ListItemContainsBlock

		// anything following an empty line is only part
		// of this item if it is indented 4 spaces
		// (regardless of the indentation of the beginning of the item)
		case containsBlankLine && indent < 4:
			if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
				// is the next item still a part of this list?
				// (look at the first byte after the current line, and at
				// the first byte after the newline run that follows the
				// next line)
				next := i
				for data[next] != '\n' {
					next++
				}
				for next < len(data)-1 && data[next] == '\n' {
					next++
				}
				if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
					*flags |= ListItemEndOfList
				}
			} else {
				*flags |= ListItemEndOfList
			}
			break gatherlines

		// a blank line means this should be parsed as a block
		case containsBlankLine:
			raw.WriteByte('\n')
			*flags |= ListItemContainsBlock
		}

		// if this line was preceded by one or more blanks,
		// re-introduce the blank into the buffer
		if containsBlankLine {
			containsBlankLine = false
			raw.WriteByte('\n')
		}

		// add the line into the working buffer without prefix
		raw.Write(data[line+indent : i])

		line = i
	}

	rawBytes := raw.Bytes()

	block := p.addBlock(Item, nil)
	block.ListFlags = *flags
	block.Tight = false
	block.BulletChar = bulletChar
	block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark

	// render the contents of the list item
	if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
		// intermediate render of block item, except for definition term
		// (text before the nested list and the nested list itself are
		// parsed as separate block runs)
		if sublist > 0 {
			p.block(rawBytes[:sublist])
			p.block(rawBytes[sublist:])
		} else {
			p.block(rawBytes)
		}
	} else {
		// intermediate render of inline item
		// (the item text becomes a single Paragraph child; only a nested
		// list, if any, goes through block parsing)
		if sublist > 0 {
			child := p.addChild(Paragraph, 0)
			child.content = rawBytes[:sublist]
			p.block(rawBytes[sublist:])
		} else {
			child := p.addChild(Paragraph, 0)
			child.content = rawBytes
		}
	}
	// line is the offset of the first line that does not belong to this item
	return line
}
1353
1354// render a single paragraph that has already been parsed out
1355func (p *parser) renderParagraph(data []byte) {
1356 if len(data) == 0 {
1357 return
1358 }
1359
1360 // trim leading spaces
1361 beg := 0
1362 for data[beg] == ' ' {
1363 beg++
1364 }
1365
1366 // trim trailing newline
1367 end := len(data) - 1
1368
1369 // trim trailing spaces
1370 for end > beg && data[end-1] == ' ' {
1371 end--
1372 }
1373
1374 p.addBlock(Paragraph, data[beg:end])
1375}
1376
1377func (p *parser) paragraph(data []byte) int {
1378 // prev: index of 1st char of previous line
1379 // line: index of 1st char of current line
1380 // i: index of cursor/end of current line
1381 var prev, line, i int
1382 tabSize := TabSizeDefault
1383 if p.flags&TabSizeEight != 0 {
1384 tabSize = TabSizeDouble
1385 }
1386 // keep going until we find something to mark the end of the paragraph
1387 for i < len(data) {
1388 // mark the beginning of the current line
1389 prev = line
1390 current := data[i:]
1391 line = i
1392
1393 // did we find a reference or a footnote? If so, end a paragraph
1394 // preceding it and report that we have consumed up to the end of that
1395 // reference:
1396 if refEnd := isReference(p, current, tabSize); refEnd > 0 {
1397 p.renderParagraph(data[:i])
1398 return i + refEnd
1399 }
1400
1401 // did we find a blank line marking the end of the paragraph?
1402 if n := p.isEmpty(current); n > 0 {
1403 // did this blank line followed by a definition list item?
1404 if p.flags&DefinitionLists != 0 {
1405 if i < len(data)-1 && data[i+1] == ':' {
1406 return p.list(data[prev:], ListTypeDefinition)
1407 }
1408 }
1409
1410 p.renderParagraph(data[:i])
1411 return i + n
1412 }
1413
1414 // an underline under some text marks a header, so our paragraph ended on prev line
1415 if i > 0 {
1416 if level := p.isUnderlinedHeader(current); level > 0 {
1417 // render the paragraph
1418 p.renderParagraph(data[:prev])
1419
1420 // ignore leading and trailing whitespace
1421 eol := i - 1
1422 for prev < eol && data[prev] == ' ' {
1423 prev++
1424 }
1425 for eol > prev && data[eol-1] == ' ' {
1426 eol--
1427 }
1428
1429 id := ""
1430 if p.flags&AutoHeaderIDs != 0 {
1431 id = sanitized_anchor_name.Create(string(data[prev:eol]))
1432 }
1433
1434 block := p.addBlock(Header, data[prev:eol])
1435 block.Level = level
1436 block.HeaderID = id
1437
1438 // find the end of the underline
1439 for data[i] != '\n' {
1440 i++
1441 }
1442 return i
1443 }
1444 }
1445
1446 // if the next line starts a block of HTML, then the paragraph ends here
1447 if p.flags&LaxHTMLBlocks != 0 {
1448 if data[i] == '<' && p.html(current, false) > 0 {
1449 // rewind to before the HTML block
1450 p.renderParagraph(data[:i])
1451 return i
1452 }
1453 }
1454
1455 // if there's a prefixed header or a horizontal rule after this, paragraph is over
1456 if p.isPrefixHeader(current) || p.isHRule(current) {
1457 p.renderParagraph(data[:i])
1458 return i
1459 }
1460
1461 // if there's a fenced code block, paragraph is over
1462 if p.flags&FencedCode != 0 {
1463 if p.fencedCodeBlock(current, false) > 0 {
1464 p.renderParagraph(data[:i])
1465 return i
1466 }
1467 }
1468
1469 // if there's a definition list item, prev line is a definition term
1470 if p.flags&DefinitionLists != 0 {
1471 if p.dliPrefix(current) != 0 {
1472 return p.list(data[prev:], ListTypeDefinition)
1473 }
1474 }
1475
1476 // if there's a list after this, paragraph is over
1477 if p.flags&NoEmptyLineBeforeBlock != 0 {
1478 if p.uliPrefix(current) != 0 ||
1479 p.oliPrefix(current) != 0 ||
1480 p.quotePrefix(current) != 0 ||
1481 p.codePrefix(current) != 0 {
1482 p.renderParagraph(data[:i])
1483 return i
1484 }
1485 }
1486
1487 // otherwise, scan to the beginning of the next line
1488 for data[i] != '\n' {
1489 i++
1490 }
1491 i++
1492 }
1493
1494 p.renderParagraph(data[:i])
1495 return i
1496}
1497
// skipChar returns the index of the first byte at or after start that is
// not char, or len(data) if every remaining byte is char. If start is
// already at or past the end of data, start is returned unchanged.
func skipChar(data []byte, start int, char byte) int {
	pos := start
	for ; pos < len(data); pos++ {
		if data[pos] != char {
			break
		}
	}
	return pos
}
1505
// skipUntilChar returns the index of the first occurrence of char in text at
// or after start, or len(text) if there is none. If start is already at or
// past the end of text, start is returned unchanged.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for ; pos < len(text); pos++ {
		if text[pos] == char {
			break
		}
	}
	return pos
}