block.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse block-level elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18 "html"
19 "regexp"
20
21 "github.com/shurcooL/sanitized_anchor_name"
22)
23
const (
	// charEntity is a regular expression fragment matching an HTML character
	// reference: hexadecimal (&#x..;), decimal (&#..;) or named (&name;).
	charEntity = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"

	// escapable is a regular expression character class listing the ASCII
	// punctuation characters that may follow a backslash in an escape.
	escapable = "[!\"#$%&'()*+,./:;<=>?@[\\\\\\]^_`{|}~-]"
)

var (
	// reBackslashOrAmp is a cheap pre-check used by unescapeString: it
	// matches when the input contains a backslash or an ampersand, the only
	// characters that can start an escape or an entity.
	//
	// The pattern needs a doubled backslash inside the class. The previous
	// pattern compiled to the class [\&], in which `\&` is just an escaped
	// '&', so inputs containing only backslash escapes were never unescaped.
	reBackslashOrAmp = regexp.MustCompile(`[\\&]`)

	// reEntityOrEscapedChar matches one backslash-escaped punctuation
	// character or one HTML character reference (case-insensitively).
	reEntityOrEscapedChar = regexp.MustCompile("(?i)\\\\" + escapable + "|" + charEntity)

	// reTrailingWhitespace matches a trailing run of newlines, each
	// optionally followed by spaces, at the very end of the input.
	reTrailingWhitespace = regexp.MustCompile("(\n *)+$")
)
34
35// Parse block-level data.
36// Note: this function and many that it calls assume that
37// the input buffer ends with a newline.
38func (p *parser) block(data []byte) {
39 if len(data) == 0 || data[len(data)-1] != '\n' {
40 panic("block input is missing terminating newline")
41 }
42
43 // this is called recursively: enforce a maximum depth
44 if p.nesting >= p.maxNesting {
45 return
46 }
47 p.nesting++
48
49 // parse out one block-level construct at a time
50 for len(data) > 0 {
51 // prefixed header:
52 //
53 // # Header 1
54 // ## Header 2
55 // ...
56 // ###### Header 6
57 if p.isPrefixHeader(data) {
58 data = data[p.prefixHeader(data):]
59 continue
60 }
61
62 // block of preformatted HTML:
63 //
64 // <div>
65 // ...
66 // </div>
67 if data[0] == '<' {
68 if i := p.html(data, true); i > 0 {
69 data = data[i:]
70 continue
71 }
72 }
73
74 // title block
75 //
76 // % stuff
77 // % more stuff
78 // % even more stuff
79 if p.flags&Titleblock != 0 {
80 if data[0] == '%' {
81 if i := p.titleBlock(data, true); i > 0 {
82 data = data[i:]
83 continue
84 }
85 }
86 }
87
88 // blank lines. note: returns the # of bytes to skip
89 if i := p.isEmpty(data); i > 0 {
90 data = data[i:]
91 continue
92 }
93
94 // indented code block:
95 //
96 // func max(a, b int) int {
97 // if a > b {
98 // return a
99 // }
100 // return b
101 // }
102 if p.codePrefix(data) > 0 {
103 data = data[p.code(data):]
104 continue
105 }
106
107 // fenced code block:
108 //
109 // ``` go
110 // func fact(n int) int {
111 // if n <= 1 {
112 // return n
113 // }
114 // return n * fact(n-1)
115 // }
116 // ```
117 if p.flags&FencedCode != 0 {
118 if i := p.fencedCodeBlock(data, true); i > 0 {
119 data = data[i:]
120 continue
121 }
122 }
123
124 // horizontal rule:
125 //
126 // ------
127 // or
128 // ******
129 // or
130 // ______
131 if p.isHRule(data) {
132 p.addBlock(HorizontalRule, nil)
133 var i int
134 for i = 0; data[i] != '\n'; i++ {
135 }
136 data = data[i:]
137 continue
138 }
139
140 // block quote:
141 //
142 // > A big quote I found somewhere
143 // > on the web
144 if p.quotePrefix(data) > 0 {
145 data = data[p.quote(data):]
146 continue
147 }
148
149 // table:
150 //
151 // Name | Age | Phone
152 // ------|-----|---------
153 // Bob | 31 | 555-1234
154 // Alice | 27 | 555-4321
155 if p.flags&Tables != 0 {
156 if i := p.table(data); i > 0 {
157 data = data[i:]
158 continue
159 }
160 }
161
162 // an itemized/unordered list:
163 //
164 // * Item 1
165 // * Item 2
166 //
167 // also works with + or -
168 if p.uliPrefix(data) > 0 {
169 data = data[p.list(data, 0):]
170 continue
171 }
172
173 // a numbered/ordered list:
174 //
175 // 1. Item 1
176 // 2. Item 2
177 if p.oliPrefix(data) > 0 {
178 data = data[p.list(data, ListTypeOrdered):]
179 continue
180 }
181
182 // definition lists:
183 //
184 // Term 1
185 // : Definition a
186 // : Definition b
187 //
188 // Term 2
189 // : Definition c
190 if p.flags&DefinitionLists != 0 {
191 if p.dliPrefix(data) > 0 {
192 data = data[p.list(data, ListTypeDefinition):]
193 continue
194 }
195 }
196
197 // anything else must look like a normal paragraph
198 // note: this finds underlined headers, too
199 data = data[p.paragraph(data):]
200 }
201
202 p.nesting--
203}
204
205func (p *parser) addBlock(typ NodeType, content []byte) *Node {
206 p.closeUnmatchedBlocks()
207 container := p.addChild(typ, 0)
208 container.content = content
209 return container
210}
211
212func (p *parser) isPrefixHeader(data []byte) bool {
213 if data[0] != '#' {
214 return false
215 }
216
217 if p.flags&SpaceHeaders != 0 {
218 level := 0
219 for level < 6 && data[level] == '#' {
220 level++
221 }
222 if data[level] != ' ' {
223 return false
224 }
225 }
226 return true
227}
228
229func (p *parser) prefixHeader(data []byte) int {
230 level := 0
231 for level < 6 && data[level] == '#' {
232 level++
233 }
234 i := skipChar(data, level, ' ')
235 end := skipUntilChar(data, i, '\n')
236 skip := end
237 id := ""
238 if p.flags&HeaderIDs != 0 {
239 j, k := 0, 0
240 // find start/end of header id
241 for j = i; j < end-1 && (data[j] != '{' || data[j+1] != '#'); j++ {
242 }
243 for k = j + 1; k < end && data[k] != '}'; k++ {
244 }
245 // extract header id iff found
246 if j < end && k < end {
247 id = string(data[j+2 : k])
248 end = j
249 skip = k + 1
250 for end > 0 && data[end-1] == ' ' {
251 end--
252 }
253 }
254 }
255 for end > 0 && data[end-1] == '#' {
256 if isBackslashEscaped(data, end-1) {
257 break
258 }
259 end--
260 }
261 for end > 0 && data[end-1] == ' ' {
262 end--
263 }
264 if end > i {
265 if id == "" && p.flags&AutoHeaderIDs != 0 {
266 id = sanitized_anchor_name.Create(string(data[i:end]))
267 }
268 block := p.addBlock(Header, data[i:end])
269 block.HeaderID = id
270 block.Level = level
271 }
272 return skip
273}
274
275func (p *parser) isUnderlinedHeader(data []byte) int {
276 // test of level 1 header
277 if data[0] == '=' {
278 i := skipChar(data, 1, '=')
279 i = skipChar(data, i, ' ')
280 if data[i] == '\n' {
281 return 1
282 }
283 return 0
284 }
285
286 // test of level 2 header
287 if data[0] == '-' {
288 i := skipChar(data, 1, '-')
289 i = skipChar(data, i, ' ')
290 if data[i] == '\n' {
291 return 2
292 }
293 return 0
294 }
295
296 return 0
297}
298
299func (p *parser) titleBlock(data []byte, doRender bool) int {
300 if data[0] != '%' {
301 return 0
302 }
303 splitData := bytes.Split(data, []byte("\n"))
304 var i int
305 for idx, b := range splitData {
306 if !bytes.HasPrefix(b, []byte("%")) {
307 i = idx // - 1
308 break
309 }
310 }
311
312 data = bytes.Join(splitData[0:i], []byte("\n"))
313 consumed := len(data)
314 data = bytes.TrimPrefix(data, []byte("% "))
315 data = bytes.Replace(data, []byte("\n% "), []byte("\n"), -1)
316 block := p.addBlock(Header, data)
317 block.Level = 1
318 block.IsTitleblock = true
319
320 return consumed
321}
322
323func (p *parser) html(data []byte, doRender bool) int {
324 var i, j int
325
326 // identify the opening tag
327 if data[0] != '<' {
328 return 0
329 }
330 curtag, tagfound := p.htmlFindTag(data[1:])
331
332 // handle special cases
333 if !tagfound {
334 // check for an HTML comment
335 if size := p.htmlComment(data, doRender); size > 0 {
336 return size
337 }
338
339 // check for an <hr> tag
340 if size := p.htmlHr(data, doRender); size > 0 {
341 return size
342 }
343
344 // no special case recognized
345 return 0
346 }
347
348 // look for an unindented matching closing tag
349 // followed by a blank line
350 found := false
351 /*
352 closetag := []byte("\n</" + curtag + ">")
353 j = len(curtag) + 1
354 for !found {
355 // scan for a closing tag at the beginning of a line
356 if skip := bytes.Index(data[j:], closetag); skip >= 0 {
357 j += skip + len(closetag)
358 } else {
359 break
360 }
361
362 // see if it is the only thing on the line
363 if skip := p.isEmpty(data[j:]); skip > 0 {
364 // see if it is followed by a blank line/eof
365 j += skip
366 if j >= len(data) {
367 found = true
368 i = j
369 } else {
370 if skip := p.isEmpty(data[j:]); skip > 0 {
371 j += skip
372 found = true
373 i = j
374 }
375 }
376 }
377 }
378 */
379
380 // if not found, try a second pass looking for indented match
381 // but not if tag is "ins" or "del" (following original Markdown.pl)
382 if !found && curtag != "ins" && curtag != "del" {
383 i = 1
384 for i < len(data) {
385 i++
386 for i < len(data) && !(data[i-1] == '<' && data[i] == '/') {
387 i++
388 }
389
390 if i+2+len(curtag) >= len(data) {
391 break
392 }
393
394 j = p.htmlFindEnd(curtag, data[i-1:])
395
396 if j > 0 {
397 i += j - 1
398 found = true
399 break
400 }
401 }
402 }
403
404 if !found {
405 return 0
406 }
407
408 // the end of the block has been found
409 if doRender {
410 // trim newlines
411 end := i
412 for end > 0 && data[end-1] == '\n' {
413 end--
414 }
415 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
416 }
417
418 return i
419}
420
421func finalizeHTMLBlock(block *Node) {
422 block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{})
423 block.content = []byte{}
424}
425
426// HTML comment, lax form
427func (p *parser) htmlComment(data []byte, doRender bool) int {
428 i := p.inlineHTMLComment(data)
429 // needs to end with a blank line
430 if j := p.isEmpty(data[i:]); j > 0 {
431 size := i + j
432 if doRender {
433 // trim trailing newlines
434 end := size
435 for end > 0 && data[end-1] == '\n' {
436 end--
437 }
438 block := p.addBlock(HTMLBlock, data[:end])
439 finalizeHTMLBlock(block)
440 }
441 return size
442 }
443 return 0
444}
445
446// HR, which is the only self-closing block tag considered
447func (p *parser) htmlHr(data []byte, doRender bool) int {
448 if data[0] != '<' || (data[1] != 'h' && data[1] != 'H') || (data[2] != 'r' && data[2] != 'R') {
449 return 0
450 }
451 if data[3] != ' ' && data[3] != '/' && data[3] != '>' {
452 // not an <hr> tag after all; at least not a valid one
453 return 0
454 }
455
456 i := 3
457 for data[i] != '>' && data[i] != '\n' {
458 i++
459 }
460
461 if data[i] == '>' {
462 i++
463 if j := p.isEmpty(data[i:]); j > 0 {
464 size := i + j
465 if doRender {
466 // trim newlines
467 end := size
468 for end > 0 && data[end-1] == '\n' {
469 end--
470 }
471 finalizeHTMLBlock(p.addBlock(HTMLBlock, data[:end]))
472 }
473 return size
474 }
475 }
476
477 return 0
478}
479
480func (p *parser) htmlFindTag(data []byte) (string, bool) {
481 i := 0
482 for isalnum(data[i]) {
483 i++
484 }
485 key := string(data[:i])
486 if _, ok := blockTags[key]; ok {
487 return key, true
488 }
489 return "", false
490}
491
492func (p *parser) htmlFindEnd(tag string, data []byte) int {
493 // assume data[0] == '<' && data[1] == '/' already tested
494 if tag == "hr" {
495 return 2
496 }
497 // check if tag is a match
498 closetag := []byte("</" + tag + ">")
499 if !bytes.HasPrefix(data, closetag) {
500 return 0
501 }
502 i := len(closetag)
503
504 // check that the rest of the line is blank
505 skip := 0
506 if skip = p.isEmpty(data[i:]); skip == 0 {
507 return 0
508 }
509 i += skip
510 skip = 0
511
512 if i >= len(data) {
513 return i
514 }
515
516 if p.flags&LaxHTMLBlocks != 0 {
517 return i
518 }
519 if skip = p.isEmpty(data[i:]); skip == 0 {
520 // following line must be blank
521 return 0
522 }
523
524 return i + skip
525}
526
527func (*parser) isEmpty(data []byte) int {
528 // it is okay to call isEmpty on an empty buffer
529 if len(data) == 0 {
530 return 0
531 }
532
533 var i int
534 for i = 0; i < len(data) && data[i] != '\n'; i++ {
535 if data[i] != ' ' && data[i] != '\t' {
536 return 0
537 }
538 }
539 return i + 1
540}
541
542func (*parser) isHRule(data []byte) bool {
543 i := 0
544
545 // skip up to three spaces
546 for i < 3 && data[i] == ' ' {
547 i++
548 }
549
550 // look at the hrule char
551 if data[i] != '*' && data[i] != '-' && data[i] != '_' {
552 return false
553 }
554 c := data[i]
555
556 // the whole line must be the char or whitespace
557 n := 0
558 for data[i] != '\n' {
559 switch {
560 case data[i] == c:
561 n++
562 case data[i] != ' ':
563 return false
564 }
565 i++
566 }
567
568 return n >= 3
569}
570
571// isFenceLine checks if there's a fence line (e.g., ``` or ``` go) at the beginning of data,
572// and returns the end index if so, or 0 otherwise. It also returns the marker found.
573// If syntax is not nil, it gets set to the syntax specified in the fence line.
574// A final newline is mandatory to recognize the fence line, unless newlineOptional is true.
575func isFenceLine(data []byte, syntax *string, oldmarker string, newlineOptional bool) (end int, marker string) {
576 i, size := 0, 0
577
578 // skip up to three spaces
579 for i < len(data) && i < 3 && data[i] == ' ' {
580 i++
581 }
582
583 // check for the marker characters: ~ or `
584 if i >= len(data) {
585 return 0, ""
586 }
587 if data[i] != '~' && data[i] != '`' {
588 return 0, ""
589 }
590
591 c := data[i]
592
593 // the whole line must be the same char or whitespace
594 for i < len(data) && data[i] == c {
595 size++
596 i++
597 }
598
599 // the marker char must occur at least 3 times
600 if size < 3 {
601 return 0, ""
602 }
603 marker = string(data[i-size : i])
604
605 // if this is the end marker, it must match the beginning marker
606 if oldmarker != "" && marker != oldmarker {
607 return 0, ""
608 }
609
610 // TODO(shurcooL): It's probably a good idea to simplify the 2 code paths here
611 // into one, always get the syntax, and discard it if the caller doesn't care.
612 if syntax != nil {
613 syn := 0
614 i = skipChar(data, i, ' ')
615
616 if i >= len(data) {
617 if newlineOptional && i == len(data) {
618 return i, marker
619 }
620 return 0, ""
621 }
622
623 syntaxStart := i
624
625 if data[i] == '{' {
626 i++
627 syntaxStart++
628
629 for i < len(data) && data[i] != '}' && data[i] != '\n' {
630 syn++
631 i++
632 }
633
634 if i >= len(data) || data[i] != '}' {
635 return 0, ""
636 }
637
638 // strip all whitespace at the beginning and the end
639 // of the {} block
640 for syn > 0 && isspace(data[syntaxStart]) {
641 syntaxStart++
642 syn--
643 }
644
645 for syn > 0 && isspace(data[syntaxStart+syn-1]) {
646 syn--
647 }
648
649 i++
650 } else {
651 for i < len(data) && !isspace(data[i]) {
652 syn++
653 i++
654 }
655 }
656
657 *syntax = string(data[syntaxStart : syntaxStart+syn])
658 }
659
660 i = skipChar(data, i, ' ')
661 if i >= len(data) || data[i] != '\n' {
662 if newlineOptional && i == len(data) {
663 return i, marker
664 }
665 return 0, ""
666 }
667
668 return i + 1, marker // Take newline into account.
669}
670
671// fencedCodeBlock returns the end index if data contains a fenced code block at the beginning,
672// or 0 otherwise. It writes to out if doRender is true, otherwise it has no side effects.
673// If doRender is true, a final newline is mandatory to recognize the fenced code block.
674func (p *parser) fencedCodeBlock(data []byte, doRender bool) int {
675 var syntax string
676 beg, marker := isFenceLine(data, &syntax, "", false)
677 if beg == 0 || beg >= len(data) {
678 return 0
679 }
680
681 var work bytes.Buffer
682 work.Write([]byte(syntax))
683 work.WriteByte('\n')
684
685 for {
686 // safe to assume beg < len(data)
687
688 // check for the end of the code block
689 newlineOptional := !doRender
690 fenceEnd, _ := isFenceLine(data[beg:], nil, marker, newlineOptional)
691 if fenceEnd != 0 {
692 beg += fenceEnd
693 break
694 }
695
696 // copy the current line
697 end := skipUntilChar(data, beg, '\n') + 1
698
699 // did we reach the end of the buffer without a closing marker?
700 if end >= len(data) {
701 return 0
702 }
703
704 // verbatim copy to the working buffer
705 if doRender {
706 work.Write(data[beg:end])
707 }
708 beg = end
709 }
710
711 if doRender {
712 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
713 block.IsFenced = true
714 finalizeCodeBlock(block)
715 }
716
717 return beg
718}
719
// unescapeChar resolves a single match: for a backslash escape it returns
// the escaped byte; anything else is treated as an HTML entity and decoded
// with html.UnescapeString.
func unescapeChar(str []byte) []byte {
	if str[0] != '\\' {
		decoded := html.UnescapeString(string(str))
		return []byte(decoded)
	}
	return []byte{str[1]}
}
726
727func unescapeString(str []byte) []byte {
728 if reBackslashOrAmp.Match(str) {
729 return reEntityOrEscapedChar.ReplaceAllFunc(str, unescapeChar)
730 }
731 return str
732}
733
734func finalizeCodeBlock(block *Node) {
735 if block.IsFenced {
736 newlinePos := bytes.IndexByte(block.content, '\n')
737 firstLine := block.content[:newlinePos]
738 rest := block.content[newlinePos+1:]
739 block.Info = unescapeString(bytes.Trim(firstLine, "\n"))
740 block.Literal = rest
741 } else {
742 block.Literal = reTrailingWhitespace.ReplaceAll(block.content, []byte{'\n'})
743 }
744 block.content = nil
745}
746
747func (p *parser) table(data []byte) int {
748 table := p.addBlock(Table, nil)
749 i, columns := p.tableHeader(data)
750 if i == 0 {
751 p.tip = table.Parent
752 table.Unlink()
753 return 0
754 }
755
756 p.addBlock(TableBody, nil)
757
758 for i < len(data) {
759 pipes, rowStart := 0, i
760 for ; data[i] != '\n'; i++ {
761 if data[i] == '|' {
762 pipes++
763 }
764 }
765
766 if pipes == 0 {
767 i = rowStart
768 break
769 }
770
771 // include the newline in data sent to tableRow
772 i++
773 p.tableRow(data[rowStart:i], columns, false)
774 }
775
776 return i
777}
778
// isBackslashEscaped reports whether position i in data is preceded by an
// odd number of consecutive backslashes.
func isBackslashEscaped(data []byte, i int) bool {
	count := 0
	for pos := i - 1; pos >= 0 && data[pos] == '\\'; pos-- {
		count++
	}
	return count%2 == 1
}
787
788func (p *parser) tableHeader(data []byte) (size int, columns []CellAlignFlags) {
789 i := 0
790 colCount := 1
791 for i = 0; data[i] != '\n'; i++ {
792 if data[i] == '|' && !isBackslashEscaped(data, i) {
793 colCount++
794 }
795 }
796
797 // doesn't look like a table header
798 if colCount == 1 {
799 return
800 }
801
802 // include the newline in the data sent to tableRow
803 header := data[:i+1]
804
805 // column count ignores pipes at beginning or end of line
806 if data[0] == '|' {
807 colCount--
808 }
809 if i > 2 && data[i-1] == '|' && !isBackslashEscaped(data, i-1) {
810 colCount--
811 }
812
813 columns = make([]CellAlignFlags, colCount)
814
815 // move on to the header underline
816 i++
817 if i >= len(data) {
818 return
819 }
820
821 if data[i] == '|' && !isBackslashEscaped(data, i) {
822 i++
823 }
824 i = skipChar(data, i, ' ')
825
826 // each column header is of form: / *:?-+:? *|/ with # dashes + # colons >= 3
827 // and trailing | optional on last column
828 col := 0
829 for data[i] != '\n' {
830 dashes := 0
831
832 if data[i] == ':' {
833 i++
834 columns[col] |= TableAlignmentLeft
835 dashes++
836 }
837 for data[i] == '-' {
838 i++
839 dashes++
840 }
841 if data[i] == ':' {
842 i++
843 columns[col] |= TableAlignmentRight
844 dashes++
845 }
846 for data[i] == ' ' {
847 i++
848 }
849
850 // end of column test is messy
851 switch {
852 case dashes < 3:
853 // not a valid column
854 return
855
856 case data[i] == '|' && !isBackslashEscaped(data, i):
857 // marker found, now skip past trailing whitespace
858 col++
859 i++
860 for data[i] == ' ' {
861 i++
862 }
863
864 // trailing junk found after last column
865 if col >= colCount && data[i] != '\n' {
866 return
867 }
868
869 case (data[i] != '|' || isBackslashEscaped(data, i)) && col+1 < colCount:
870 // something else found where marker was required
871 return
872
873 case data[i] == '\n':
874 // marker is optional for the last column
875 col++
876
877 default:
878 // trailing junk found after last column
879 return
880 }
881 }
882 if col != colCount {
883 return
884 }
885
886 p.addBlock(TableHead, nil)
887 p.tableRow(header, columns, true)
888 size = i + 1
889 return
890}
891
892func (p *parser) tableRow(data []byte, columns []CellAlignFlags, header bool) {
893 p.addBlock(TableRow, nil)
894 i, col := 0, 0
895
896 if data[i] == '|' && !isBackslashEscaped(data, i) {
897 i++
898 }
899
900 for col = 0; col < len(columns) && i < len(data); col++ {
901 for data[i] == ' ' {
902 i++
903 }
904
905 cellStart := i
906
907 for (data[i] != '|' || isBackslashEscaped(data, i)) && data[i] != '\n' {
908 i++
909 }
910
911 cellEnd := i
912
913 // skip the end-of-cell marker, possibly taking us past end of buffer
914 i++
915
916 for cellEnd > cellStart && data[cellEnd-1] == ' ' {
917 cellEnd--
918 }
919
920 cell := p.addBlock(TableCell, data[cellStart:cellEnd])
921 cell.IsHeader = header
922 cell.Align = columns[col]
923 }
924
925 // pad it out with empty columns to get the right number
926 for ; col < len(columns); col++ {
927 cell := p.addBlock(TableCell, nil)
928 cell.IsHeader = header
929 cell.Align = columns[col]
930 }
931
932 // silently ignore rows with too many cells
933}
934
935// returns blockquote prefix length
936func (p *parser) quotePrefix(data []byte) int {
937 i := 0
938 for i < 3 && data[i] == ' ' {
939 i++
940 }
941 if data[i] == '>' {
942 if data[i+1] == ' ' {
943 return i + 2
944 }
945 return i + 1
946 }
947 return 0
948}
949
950// blockquote ends with at least one blank line
951// followed by something without a blockquote prefix
952func (p *parser) terminateBlockquote(data []byte, beg, end int) bool {
953 if p.isEmpty(data[beg:]) <= 0 {
954 return false
955 }
956 if end >= len(data) {
957 return true
958 }
959 return p.quotePrefix(data[end:]) == 0 && p.isEmpty(data[end:]) == 0
960}
961
962// parse a blockquote fragment
963func (p *parser) quote(data []byte) int {
964 block := p.addBlock(BlockQuote, nil)
965 var raw bytes.Buffer
966 beg, end := 0, 0
967 for beg < len(data) {
968 end = beg
969 // Step over whole lines, collecting them. While doing that, check for
970 // fenced code and if one's found, incorporate it altogether,
971 // irregardless of any contents inside it
972 for data[end] != '\n' {
973 if p.flags&FencedCode != 0 {
974 if i := p.fencedCodeBlock(data[end:], false); i > 0 {
975 // -1 to compensate for the extra end++ after the loop:
976 end += i - 1
977 break
978 }
979 }
980 end++
981 }
982 end++
983 if pre := p.quotePrefix(data[beg:]); pre > 0 {
984 // skip the prefix
985 beg += pre
986 } else if p.terminateBlockquote(data, beg, end) {
987 break
988 }
989 // this line is part of the blockquote
990 raw.Write(data[beg:end])
991 beg = end
992 }
993 p.block(raw.Bytes())
994 p.finalize(block)
995 return end
996}
997
998// returns prefix length for block code
999func (p *parser) codePrefix(data []byte) int {
1000 if data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' {
1001 return 4
1002 }
1003 return 0
1004}
1005
1006func (p *parser) code(data []byte) int {
1007 var work bytes.Buffer
1008
1009 i := 0
1010 for i < len(data) {
1011 beg := i
1012 for data[i] != '\n' {
1013 i++
1014 }
1015 i++
1016
1017 blankline := p.isEmpty(data[beg:i]) > 0
1018 if pre := p.codePrefix(data[beg:i]); pre > 0 {
1019 beg += pre
1020 } else if !blankline {
1021 // non-empty, non-prefixed line breaks the pre
1022 i = beg
1023 break
1024 }
1025
1026 // verbatim copy to the working buffeu
1027 if blankline {
1028 work.WriteByte('\n')
1029 } else {
1030 work.Write(data[beg:i])
1031 }
1032 }
1033
1034 // trim all the \n off the end of work
1035 workbytes := work.Bytes()
1036 eol := len(workbytes)
1037 for eol > 0 && workbytes[eol-1] == '\n' {
1038 eol--
1039 }
1040 if eol != len(workbytes) {
1041 work.Truncate(eol)
1042 }
1043
1044 work.WriteByte('\n')
1045
1046 block := p.addBlock(CodeBlock, work.Bytes()) // TODO: get rid of temp buffer
1047 block.IsFenced = false
1048 finalizeCodeBlock(block)
1049
1050 return i
1051}
1052
1053// returns unordered list item prefix
1054func (p *parser) uliPrefix(data []byte) int {
1055 i := 0
1056
1057 // start with up to 3 spaces
1058 for i < 3 && data[i] == ' ' {
1059 i++
1060 }
1061
1062 // need a *, +, or - followed by a space
1063 if (data[i] != '*' && data[i] != '+' && data[i] != '-') ||
1064 data[i+1] != ' ' {
1065 return 0
1066 }
1067 return i + 2
1068}
1069
1070// returns ordered list item prefix
1071func (p *parser) oliPrefix(data []byte) int {
1072 i := 0
1073
1074 // start with up to 3 spaces
1075 for i < 3 && data[i] == ' ' {
1076 i++
1077 }
1078
1079 // count the digits
1080 start := i
1081 for data[i] >= '0' && data[i] <= '9' {
1082 i++
1083 }
1084
1085 // we need >= 1 digits followed by a dot and a space
1086 if start == i || data[i] != '.' || data[i+1] != ' ' {
1087 return 0
1088 }
1089 return i + 2
1090}
1091
1092// returns definition list item prefix
1093func (p *parser) dliPrefix(data []byte) int {
1094 i := 0
1095
1096 // need a : followed by a spaces
1097 if data[i] != ':' || data[i+1] != ' ' {
1098 return 0
1099 }
1100 for data[i] == ' ' {
1101 i++
1102 }
1103 return i + 2
1104}
1105
1106// parse ordered or unordered list block
1107func (p *parser) list(data []byte, flags ListType) int {
1108 i := 0
1109 flags |= ListItemBeginningOfList
1110 block := p.addBlock(List, nil)
1111 block.ListFlags = flags
1112 block.Tight = true
1113
1114 for i < len(data) {
1115 skip := p.listItem(data[i:], &flags)
1116 if flags&ListItemContainsBlock != 0 {
1117 block.ListData.Tight = false
1118 }
1119 i += skip
1120 if skip == 0 || flags&ListItemEndOfList != 0 {
1121 break
1122 }
1123 flags &= ^ListItemBeginningOfList
1124 }
1125
1126 above := block.Parent
1127 finalizeList(block)
1128 p.tip = above
1129 return i
1130}
1131
1132// Returns true if block ends with a blank line, descending if needed
1133// into lists and sublists.
1134func endsWithBlankLine(block *Node) bool {
1135 // TODO: figure this out. Always false now.
1136 for block != nil {
1137 //if block.lastLineBlank {
1138 //return true
1139 //}
1140 t := block.Type
1141 if t == List || t == Item {
1142 block = block.LastChild
1143 } else {
1144 break
1145 }
1146 }
1147 return false
1148}
1149
1150func finalizeList(block *Node) {
1151 block.open = false
1152 item := block.FirstChild
1153 for item != nil {
1154 // check for non-final list item ending with blank line:
1155 if endsWithBlankLine(item) && item.Next != nil {
1156 block.ListData.Tight = false
1157 break
1158 }
1159 // recurse into children of list item, to see if there are spaces
1160 // between any of them:
1161 subItem := item.FirstChild
1162 for subItem != nil {
1163 if endsWithBlankLine(subItem) && (item.Next != nil || subItem.Next != nil) {
1164 block.ListData.Tight = false
1165 break
1166 }
1167 subItem = subItem.Next
1168 }
1169 item = item.Next
1170 }
1171}
1172
1173// Parse a single list item.
1174// Assumes initial prefix is already removed if this is a sublist.
1175func (p *parser) listItem(data []byte, flags *ListType) int {
1176 // keep track of the indentation of the first line
1177 itemIndent := 0
1178 for itemIndent < 3 && data[itemIndent] == ' ' {
1179 itemIndent++
1180 }
1181
1182 var bulletChar byte = '*'
1183 i := p.uliPrefix(data)
1184 if i == 0 {
1185 i = p.oliPrefix(data)
1186 } else {
1187 bulletChar = data[i-2]
1188 }
1189 if i == 0 {
1190 i = p.dliPrefix(data)
1191 // reset definition term flag
1192 if i > 0 {
1193 *flags &= ^ListTypeTerm
1194 }
1195 }
1196 if i == 0 {
1197 // if in definition list, set term flag and continue
1198 if *flags&ListTypeDefinition != 0 {
1199 *flags |= ListTypeTerm
1200 } else {
1201 return 0
1202 }
1203 }
1204
1205 // skip leading whitespace on first line
1206 for data[i] == ' ' {
1207 i++
1208 }
1209
1210 // find the end of the line
1211 line := i
1212 for i > 0 && data[i-1] != '\n' {
1213 i++
1214 }
1215
1216 // get working buffer
1217 var raw bytes.Buffer
1218
1219 // put the first line into the working buffer
1220 raw.Write(data[line:i])
1221 line = i
1222
1223 // process the following lines
1224 containsBlankLine := false
1225 sublist := 0
1226
1227gatherlines:
1228 for line < len(data) {
1229 i++
1230
1231 // find the end of this line
1232 for data[i-1] != '\n' {
1233 i++
1234 }
1235
1236 // if it is an empty line, guess that it is part of this item
1237 // and move on to the next line
1238 if p.isEmpty(data[line:i]) > 0 {
1239 containsBlankLine = true
1240 line = i
1241 continue
1242 }
1243
1244 // calculate the indentation
1245 indent := 0
1246 for indent < 4 && line+indent < i && data[line+indent] == ' ' {
1247 indent++
1248 }
1249
1250 chunk := data[line+indent : i]
1251
1252 // evaluate how this line fits in
1253 switch {
1254 // is this a nested list item?
1255 case (p.uliPrefix(chunk) > 0 && !p.isHRule(chunk)) ||
1256 p.oliPrefix(chunk) > 0 ||
1257 p.dliPrefix(chunk) > 0:
1258
1259 if containsBlankLine {
1260 *flags |= ListItemContainsBlock
1261 }
1262
1263 // to be a nested list, it must be indented more
1264 // if not, it is the next item in the same list
1265 if indent <= itemIndent {
1266 break gatherlines
1267 }
1268
1269 // is this the first item in the nested list?
1270 if sublist == 0 {
1271 sublist = raw.Len()
1272 }
1273
1274 // is this a nested prefix header?
1275 case p.isPrefixHeader(chunk):
1276 // if the header is not indented, it is not nested in the list
1277 // and thus ends the list
1278 if containsBlankLine && indent < 4 {
1279 *flags |= ListItemEndOfList
1280 break gatherlines
1281 }
1282 *flags |= ListItemContainsBlock
1283
1284 // anything following an empty line is only part
1285 // of this item if it is indented 4 spaces
1286 // (regardless of the indentation of the beginning of the item)
1287 case containsBlankLine && indent < 4:
1288 if *flags&ListTypeDefinition != 0 && i < len(data)-1 {
1289 // is the next item still a part of this list?
1290 next := i
1291 for data[next] != '\n' {
1292 next++
1293 }
1294 for next < len(data)-1 && data[next] == '\n' {
1295 next++
1296 }
1297 if i < len(data)-1 && data[i] != ':' && data[next] != ':' {
1298 *flags |= ListItemEndOfList
1299 }
1300 } else {
1301 *flags |= ListItemEndOfList
1302 }
1303 break gatherlines
1304
1305 // a blank line means this should be parsed as a block
1306 case containsBlankLine:
1307 raw.WriteByte('\n')
1308 *flags |= ListItemContainsBlock
1309 }
1310
1311 // if this line was preceded by one or more blanks,
1312 // re-introduce the blank into the buffer
1313 if containsBlankLine {
1314 containsBlankLine = false
1315 raw.WriteByte('\n')
1316 }
1317
1318 // add the line into the working buffer without prefix
1319 raw.Write(data[line+indent : i])
1320
1321 line = i
1322 }
1323
1324 rawBytes := raw.Bytes()
1325
1326 block := p.addBlock(Item, nil)
1327 block.ListFlags = *flags
1328 block.Tight = false
1329 block.BulletChar = bulletChar
1330 block.Delimiter = '.' // Only '.' is possible in Markdown, but ')' will also be possible in CommonMark
1331
1332 // render the contents of the list item
1333 if *flags&ListItemContainsBlock != 0 && *flags&ListTypeTerm == 0 {
1334 // intermediate render of block item, except for definition term
1335 if sublist > 0 {
1336 p.block(rawBytes[:sublist])
1337 p.block(rawBytes[sublist:])
1338 } else {
1339 p.block(rawBytes)
1340 }
1341 } else {
1342 // intermediate render of inline item
1343 if sublist > 0 {
1344 child := p.addChild(Paragraph, 0)
1345 child.content = rawBytes[:sublist]
1346 p.block(rawBytes[sublist:])
1347 } else {
1348 child := p.addChild(Paragraph, 0)
1349 child.content = rawBytes
1350 }
1351 }
1352 return line
1353}
1354
1355// render a single paragraph that has already been parsed out
1356func (p *parser) renderParagraph(data []byte) {
1357 if len(data) == 0 {
1358 return
1359 }
1360
1361 // trim leading spaces
1362 beg := 0
1363 for data[beg] == ' ' {
1364 beg++
1365 }
1366
1367 // trim trailing newline
1368 end := len(data) - 1
1369
1370 // trim trailing spaces
1371 for end > beg && data[end-1] == ' ' {
1372 end--
1373 }
1374
1375 p.addBlock(Paragraph, data[beg:end])
1376}
1377
1378func (p *parser) paragraph(data []byte) int {
1379 // prev: index of 1st char of previous line
1380 // line: index of 1st char of current line
1381 // i: index of cursor/end of current line
1382 var prev, line, i int
1383
1384 // keep going until we find something to mark the end of the paragraph
1385 for i < len(data) {
1386 // mark the beginning of the current line
1387 prev = line
1388 current := data[i:]
1389 line = i
1390
1391 // did we find a blank line marking the end of the paragraph?
1392 if n := p.isEmpty(current); n > 0 {
1393 // did this blank line followed by a definition list item?
1394 if p.flags&DefinitionLists != 0 {
1395 if i < len(data)-1 && data[i+1] == ':' {
1396 return p.list(data[prev:], ListTypeDefinition)
1397 }
1398 }
1399
1400 p.renderParagraph(data[:i])
1401 return i + n
1402 }
1403
1404 // an underline under some text marks a header, so our paragraph ended on prev line
1405 if i > 0 {
1406 if level := p.isUnderlinedHeader(current); level > 0 {
1407 // render the paragraph
1408 p.renderParagraph(data[:prev])
1409
1410 // ignore leading and trailing whitespace
1411 eol := i - 1
1412 for prev < eol && data[prev] == ' ' {
1413 prev++
1414 }
1415 for eol > prev && data[eol-1] == ' ' {
1416 eol--
1417 }
1418
1419 id := ""
1420 if p.flags&AutoHeaderIDs != 0 {
1421 id = sanitized_anchor_name.Create(string(data[prev:eol]))
1422 }
1423
1424 block := p.addBlock(Header, data[prev:eol])
1425 block.Level = level
1426 block.HeaderID = id
1427
1428 // find the end of the underline
1429 for data[i] != '\n' {
1430 i++
1431 }
1432 return i
1433 }
1434 }
1435
1436 // if the next line starts a block of HTML, then the paragraph ends here
1437 if p.flags&LaxHTMLBlocks != 0 {
1438 if data[i] == '<' && p.html(current, false) > 0 {
1439 // rewind to before the HTML block
1440 p.renderParagraph(data[:i])
1441 return i
1442 }
1443 }
1444
1445 // if there's a prefixed header or a horizontal rule after this, paragraph is over
1446 if p.isPrefixHeader(current) || p.isHRule(current) {
1447 p.renderParagraph(data[:i])
1448 return i
1449 }
1450
1451 // if there's a fenced code block, paragraph is over
1452 if p.flags&FencedCode != 0 {
1453 if p.fencedCodeBlock(current, false) > 0 {
1454 p.renderParagraph(data[:i])
1455 return i
1456 }
1457 }
1458
1459 // if there's a definition list item, prev line is a definition term
1460 if p.flags&DefinitionLists != 0 {
1461 if p.dliPrefix(current) != 0 {
1462 return p.list(data[prev:], ListTypeDefinition)
1463 }
1464 }
1465
1466 // if there's a list after this, paragraph is over
1467 if p.flags&NoEmptyLineBeforeBlock != 0 {
1468 if p.uliPrefix(current) != 0 ||
1469 p.oliPrefix(current) != 0 ||
1470 p.quotePrefix(current) != 0 ||
1471 p.codePrefix(current) != 0 {
1472 p.renderParagraph(data[:i])
1473 return i
1474 }
1475 }
1476
1477 // otherwise, scan to the beginning of the next line
1478 for data[i] != '\n' {
1479 i++
1480 }
1481 i++
1482 }
1483
1484 p.renderParagraph(data[:i])
1485 return i
1486}
1487
// skipChar returns the index of the first byte at or after start that is
// not char, or start itself when start is already at or past the end of
// data.
func skipChar(data []byte, start int, char byte) int {
	pos := start
	for pos < len(data) {
		if data[pos] != char {
			break
		}
		pos++
	}
	return pos
}
1495
// skipUntilChar returns the index of the first occurrence of char in text
// at or after start. When char does not occur there, it returns len(text),
// or start itself when start is already at or past the end of text.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for pos < len(text) {
		if text[pos] == char {
			break
		}
		pos++
	}
	return pos
}