inline.go (view raw)
1//
2// Black Friday Markdown Processor
3// Originally based on http://github.com/tanoku/upskirt
4// by Russ Ross <russ@russross.com>
5//
6
7//
8// Functions to parse inline elements.
9//
10
11package blackfriday
12
13import (
14 "bytes"
15)
16
17// Functions to parse text within a block
18// Each function returns the number of chars taken care of
19// data is the complete block being rendered
20// offset is the number of valid chars before the current cursor
21
22func parseInline(out *bytes.Buffer, rndr *render, data []byte) {
23 if rndr.nesting >= rndr.maxNesting {
24 return
25 }
26 rndr.nesting++
27
28 i, end := 0, 0
29 for i < len(data) {
30 // copy inactive chars into the output
31 for end < len(data) && rndr.inline[data[end]] == nil {
32 end++
33 }
34
35 if rndr.mk.normalText != nil {
36 rndr.mk.normalText(out, data[i:end], rndr.mk.opaque)
37 } else {
38 out.Write(data[i:end])
39 }
40
41 if end >= len(data) {
42 break
43 }
44 i = end
45
46 // call the trigger
47 parser := rndr.inline[data[end]]
48 end = parser(out, rndr, data, i)
49
50 if end == 0 { // no action from the callback
51 end = i + 1
52 } else {
53 i += end
54 end = i
55 }
56 }
57
58 rndr.nesting--
59}
60
61// single and double emphasis parsing
62func inlineEmphasis(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
63 data = data[offset:]
64 c := data[0]
65 ret := 0
66
67 if len(data) > 2 && data[1] != c {
68 // whitespace cannot follow an opening emphasis;
69 // strikethrough only takes two characters '~~'
70 if c == '~' || isspace(data[1]) {
71 return 0
72 }
73 if ret = inlineHelperEmph1(out, rndr, data[1:], c); ret == 0 {
74 return 0
75 }
76
77 return ret + 1
78 }
79
80 if len(data) > 3 && data[1] == c && data[2] != c {
81 if isspace(data[2]) {
82 return 0
83 }
84 if ret = inlineHelperEmph2(out, rndr, data[2:], c); ret == 0 {
85 return 0
86 }
87
88 return ret + 2
89 }
90
91 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
92 if c == '~' || isspace(data[3]) {
93 return 0
94 }
95 if ret = inlineHelperEmph3(out, rndr, data, 3, c); ret == 0 {
96 return 0
97 }
98
99 return ret + 3
100 }
101
102 return 0
103}
104
105func inlineCodespan(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
106 data = data[offset:]
107
108 nb := 0
109
110 // count the number of backticks in the delimiter
111 for nb < len(data) && data[nb] == '`' {
112 nb++
113 }
114
115 // find the next delimiter
116 i, end := 0, 0
117 for end = nb; end < len(data) && i < nb; end++ {
118 if data[end] == '`' {
119 i++
120 } else {
121 i = 0
122 }
123 }
124
125 if i < nb && end >= len(data) {
126 return 0 // no matching delimiter
127 }
128
129 // trim outside whitespace
130 f_begin := nb
131 for f_begin < end && (data[f_begin] == ' ' || data[f_begin] == '\t') {
132 f_begin++
133 }
134
135 f_end := end - nb
136 for f_end > nb && (data[f_end-1] == ' ' || data[f_end-1] == '\t') {
137 f_end--
138 }
139
140 // real code span
141 if rndr.mk.codespan == nil {
142 return 0
143 }
144 if f_begin < f_end {
145 if rndr.mk.codespan(out, data[f_begin:f_end], rndr.mk.opaque) == 0 {
146 end = 0
147 }
148 } else {
149 if rndr.mk.codespan(out, nil, rndr.mk.opaque) == 0 {
150 end = 0
151 }
152 }
153
154 return end
155
156}
157
158// '\n' preceded by two spaces
159func inlineLinebreak(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
160 if offset < 2 || data[offset-1] != ' ' || data[offset-2] != ' ' {
161 return 0
162 }
163
164 // remove trailing spaces from out and render
165 outBytes := out.Bytes()
166 end := len(outBytes)
167 for end > 0 && outBytes[end-1] == ' ' {
168 end--
169 }
170 out.Truncate(end)
171
172 if rndr.mk.linebreak == nil {
173 return 0
174 }
175 if rndr.mk.linebreak(out, rndr.mk.opaque) > 0 {
176 return 1
177 } else {
178 return 0
179 }
180
181 return 0
182}
183
184// '[': parse a link or an image
185func inlineLink(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
186 isImg := offset > 0 && data[offset-1] == '!'
187
188 data = data[offset:]
189
190 i := 1
191 var title, link []byte
192 text_has_nl := false
193
194 // check whether the correct renderer exists
195 if (isImg && rndr.mk.image == nil) || (!isImg && rndr.mk.link == nil) {
196 return 0
197 }
198
199 // look for the matching closing bracket
200 for level := 1; level > 0 && i < len(data); i++ {
201 switch {
202 case data[i] == '\n':
203 text_has_nl = true
204
205 case data[i-1] == '\\':
206 continue
207
208 case data[i] == '[':
209 level++
210
211 case data[i] == ']':
212 level--
213 if level <= 0 {
214 i-- // compensate for extra i++ in for loop
215 }
216 }
217 }
218
219 if i >= len(data) {
220 return 0
221 }
222
223 txt_e := i
224 i++
225
226 // skip any amount of whitespace or newline
227 // (this is much more lax than original markdown syntax)
228 for i < len(data) && isspace(data[i]) {
229 i++
230 }
231
232 // inline style link
233 switch {
234 case i < len(data) && data[i] == '(':
235 // skip initial whitespace
236 i++
237
238 for i < len(data) && isspace(data[i]) {
239 i++
240 }
241
242 link_b := i
243
244 // look for link end: ' " )
245 for i < len(data) {
246 if data[i] == '\\' {
247 i += 2
248 } else {
249 if data[i] == ')' || data[i] == '\'' || data[i] == '"' {
250 break
251 }
252 i++
253 }
254 }
255
256 if i >= len(data) {
257 return 0
258 }
259 link_e := i
260
261 // look for title end if present
262 title_b, title_e := 0, 0
263 if data[i] == '\'' || data[i] == '"' {
264 i++
265 title_b = i
266
267 for i < len(data) {
268 if data[i] == '\\' {
269 i += 2
270 } else {
271 if data[i] == ')' {
272 break
273 }
274 i++
275 }
276 }
277
278 if i >= len(data) {
279 return 0
280 }
281
282 // skip whitespace after title
283 title_e = i - 1
284 for title_e > title_b && isspace(data[title_e]) {
285 title_e--
286 }
287
288 // check for closing quote presence
289 if data[title_e] != '\'' && data[title_e] != '"' {
290 title_b, title_e = 0, 0
291 link_e = i
292 }
293 }
294
295 // remove whitespace at the end of the link
296 for link_e > link_b && isspace(data[link_e-1]) {
297 link_e--
298 }
299
300 // remove optional angle brackets around the link
301 if data[link_b] == '<' {
302 link_b++
303 }
304 if data[link_e-1] == '>' {
305 link_e--
306 }
307
308 // build escaped link and title
309 if link_e > link_b {
310 link = data[link_b:link_e]
311 }
312
313 if title_e > title_b {
314 title = data[title_b:title_e]
315 }
316
317 i++
318
319 // reference style link
320 case i < len(data) && data[i] == '[':
321 var id []byte
322
323 // look for the id
324 i++
325 link_b := i
326 for i < len(data) && data[i] != ']' {
327 i++
328 }
329 if i >= len(data) {
330 return 0
331 }
332 link_e := i
333
334 // find the reference
335 if link_b == link_e {
336 if text_has_nl {
337 b := bytes.NewBuffer(nil)
338
339 for j := 1; j < txt_e; j++ {
340 switch {
341 case data[j] != '\n':
342 b.WriteByte(data[j])
343 case data[j-1] != ' ':
344 b.WriteByte(' ')
345 }
346 }
347
348 id = b.Bytes()
349 } else {
350 id = data[1:txt_e]
351 }
352 } else {
353 id = data[link_b:link_e]
354 }
355
356 // find the reference with matching id (ids are case-insensitive)
357 key := string(bytes.ToLower(id))
358 lr, ok := rndr.refs[key]
359 if !ok {
360 return 0
361 }
362
363 // keep link and title from reference
364 link = lr.link
365 title = lr.title
366 i++
367
368 // shortcut reference style link
369 default:
370 var id []byte
371
372 // craft the id
373 if text_has_nl {
374 b := bytes.NewBuffer(nil)
375
376 for j := 1; j < txt_e; j++ {
377 switch {
378 case data[j] != '\n':
379 b.WriteByte(data[j])
380 case data[j-1] != ' ':
381 b.WriteByte(' ')
382 }
383 }
384
385 id = b.Bytes()
386 } else {
387 id = data[1:txt_e]
388 }
389
390 // find the reference with matching id
391 key := string(bytes.ToLower(id))
392 lr, ok := rndr.refs[key]
393 if !ok {
394 return 0
395 }
396
397 // keep link and title from reference
398 link = lr.link
399 title = lr.title
400
401 // rewind the whitespace
402 i = txt_e + 1
403 }
404
405 // build content: img alt is escaped, link content is parsed
406 content := bytes.NewBuffer(nil)
407 if txt_e > 1 {
408 if isImg {
409 content.Write(data[1:txt_e])
410 } else {
411 parseInline(content, rndr, data[1:txt_e])
412 }
413 }
414
415 var u_link []byte
416 if len(link) > 0 {
417 u_link_buf := bytes.NewBuffer(nil)
418 unescapeText(u_link_buf, link)
419 u_link = u_link_buf.Bytes()
420 }
421
422 // call the relevant rendering function
423 ret := 0
424 if isImg {
425 outSize := out.Len()
426 outBytes := out.Bytes()
427 if outSize > 0 && outBytes[outSize-1] == '!' {
428 out.Truncate(outSize - 1)
429 }
430
431 ret = rndr.mk.image(out, u_link, title, content.Bytes(), rndr.mk.opaque)
432 } else {
433 ret = rndr.mk.link(out, u_link, title, content.Bytes(), rndr.mk.opaque)
434 }
435
436 if ret > 0 {
437 return i
438 }
439 return 0
440}
441
442// '<' when tags or autolinks are allowed
443func inlineLangle(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
444 data = data[offset:]
445 altype := LINK_TYPE_NOT_AUTOLINK
446 end := tagLength(data, &altype)
447 ret := 0
448
449 if end > 2 {
450 switch {
451 case rndr.mk.autolink != nil && altype != LINK_TYPE_NOT_AUTOLINK:
452 u_link := bytes.NewBuffer(nil)
453 unescapeText(u_link, data[1:end+1-2])
454 ret = rndr.mk.autolink(out, u_link.Bytes(), altype, rndr.mk.opaque)
455 case rndr.mk.rawHtmlTag != nil:
456 ret = rndr.mk.rawHtmlTag(out, data[:end], rndr.mk.opaque)
457 }
458 }
459
460 if ret == 0 {
461 return 0
462 }
463 return end
464}
465
466// '\\' backslash escape
467var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
468
469func inlineEscape(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
470 data = data[offset:]
471
472 if len(data) > 1 {
473 if bytes.IndexByte(escapeChars, data[1]) < 0 {
474 return 0
475 }
476
477 if rndr.mk.normalText != nil {
478 rndr.mk.normalText(out, data[1:2], rndr.mk.opaque)
479 } else {
480 out.WriteByte(data[1])
481 }
482 }
483
484 return 2
485}
486
// unescapeText appends src to ob with backslash escapes removed: every
// backslash is dropped and the byte following it is copied verbatim. A
// backslash that is the last byte of src is dropped.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for len(src) > 0 {
		slash := bytes.IndexByte(src, '\\')
		if slash < 0 {
			// no more escapes: copy the remainder
			ob.Write(src)
			return
		}

		if slash > 0 {
			ob.Write(src[:slash])
		}

		if slash+1 >= len(src) {
			// trailing backslash with nothing to escape
			return
		}

		ob.WriteByte(src[slash+1])
		src = src[slash+2:]
	}
}
507
508// '&' escaped when it doesn't belong to an entity
509// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
510func inlineEntity(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
511 data = data[offset:]
512
513 end := 1
514
515 if end < len(data) && data[end] == '#' {
516 end++
517 }
518
519 for end < len(data) && isalnum(data[end]) {
520 end++
521 }
522
523 if end < len(data) && data[end] == ';' {
524 end++ // real entity
525 } else {
526 return 0 // lone '&'
527 }
528
529 if rndr.mk.entity != nil {
530 rndr.mk.entity(out, data[:end], rndr.mk.opaque)
531 } else {
532 out.Write(data[:end])
533 }
534
535 return end
536}
537
538func inlineAutolink(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
539 orig_data := data
540 data = data[offset:]
541
542 if offset > 0 {
543 if !isspace(orig_data[offset-1]) && !ispunct(orig_data[offset-1]) {
544 return 0
545 }
546 }
547
548 if !isSafeLink(data) {
549 return 0
550 }
551
552 link_end := 0
553 for link_end < len(data) && !isspace(data[link_end]) {
554 link_end++
555 }
556
557 // Skip punctuation at the end of the link
558 if (data[link_end-1] == '.' || data[link_end-1] == ',' || data[link_end-1] == ';') && data[link_end-2] != '\\' {
559 link_end--
560 }
561
562 // See if the link finishes with a punctuation sign that can be closed.
563 var copen byte
564 switch data[link_end-1] {
565 case '"':
566 copen = '"'
567 case '\'':
568 copen = '\''
569 case ')':
570 copen = '('
571 case ']':
572 copen = '['
573 case '}':
574 copen = '{'
575 default:
576 copen = 0
577 }
578
579 if copen != 0 {
580 buf_end := offset + link_end - 2
581
582 open_delim := 1
583
584 /* Try to close the final punctuation sign in this same line;
585 * if we managed to close it outside of the URL, that means that it's
586 * not part of the URL. If it closes inside the URL, that means it
587 * is part of the URL.
588 *
589 * Examples:
590 *
591 * foo http://www.pokemon.com/Pikachu_(Electric) bar
592 * => http://www.pokemon.com/Pikachu_(Electric)
593 *
594 * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
595 * => http://www.pokemon.com/Pikachu_(Electric)
596 *
597 * foo http://www.pokemon.com/Pikachu_(Electric)) bar
598 * => http://www.pokemon.com/Pikachu_(Electric))
599 *
600 * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
601 * => foo http://www.pokemon.com/Pikachu_(Electric)
602 */
603
604 for buf_end >= 0 && orig_data[buf_end] != '\n' && open_delim != 0 {
605 if orig_data[buf_end] == data[link_end-1] {
606 open_delim++
607 }
608
609 if orig_data[buf_end] == copen {
610 open_delim--
611 }
612
613 buf_end--
614 }
615
616 if open_delim == 0 {
617 link_end--
618 }
619 }
620
621 if rndr.mk.autolink != nil {
622 u_link := bytes.NewBuffer(nil)
623 unescapeText(u_link, data[:link_end])
624
625 rndr.mk.autolink(out, u_link.Bytes(), LINK_TYPE_NORMAL, rndr.mk.opaque)
626 }
627
628 return link_end
629}
630
631var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
632
633func isSafeLink(link []byte) bool {
634 for _, prefix := range validUris {
635 // TODO: handle unicode here
636 // case-insensitive prefix test
637 if len(link) > len(prefix) && !less(link[:len(prefix)], prefix) && !less(prefix, link[:len(prefix)]) && isalnum(link[len(prefix)]) {
638 return true
639 }
640 }
641
642 return false
643}
644
645// return the length of the given tag, or 0 is it's not valid
646func tagLength(data []byte, autolink *int) int {
647 var i, j int
648
649 // a valid tag can't be shorter than 3 chars
650 if len(data) < 3 {
651 return 0
652 }
653
654 // begins with a '<' optionally followed by '/', followed by letter or number
655 if data[0] != '<' {
656 return 0
657 }
658 if data[1] == '/' {
659 i = 2
660 } else {
661 i = 1
662 }
663
664 if !isalnum(data[i]) {
665 return 0
666 }
667
668 // scheme test
669 *autolink = LINK_TYPE_NOT_AUTOLINK
670
671 // try to find the beggining of an URI
672 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
673 i++
674 }
675
676 if i > 1 && data[i] == '@' {
677 if j = isMailtoAutolink(data[i:]); j != 0 {
678 *autolink = LINK_TYPE_EMAIL
679 return i + j
680 }
681 }
682
683 if i > 2 && data[i] == ':' {
684 *autolink = LINK_TYPE_NORMAL
685 i++
686 }
687
688 // complete autolink test: no whitespace or ' or "
689 switch {
690 case i >= len(data):
691 *autolink = LINK_TYPE_NOT_AUTOLINK
692 case *autolink != 0:
693 j = i
694
695 for i < len(data) {
696 if data[i] == '\\' {
697 i += 2
698 } else {
699 if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
700 break
701 } else {
702 i++
703 }
704 }
705
706 }
707
708 if i >= len(data) {
709 return 0
710 }
711 if i > j && data[i] == '>' {
712 return i + 1
713 }
714
715 // one of the forbidden chars has been found
716 *autolink = LINK_TYPE_NOT_AUTOLINK
717 }
718
719 // look for something looking like a tag end
720 for i < len(data) && data[i] != '>' {
721 i++
722 }
723 if i >= len(data) {
724 return 0
725 }
726 return i + 1
727}
728
729// look for the address part of a mail autolink and '>'
730// this is less strict than the original markdown e-mail address matching
731func isMailtoAutolink(data []byte) int {
732 nb := 0
733
734 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
735 for i := 0; i < len(data); i++ {
736 if isalnum(data[i]) {
737 continue
738 }
739
740 switch data[i] {
741 case '@':
742 nb++
743
744 case '-', '.', '_':
745 break
746
747 case '>':
748 if nb == 1 {
749 return i + 1
750 } else {
751 return 0
752 }
753 default:
754 return 0
755 }
756 }
757
758 return 0
759}
760
// inlineHelperFindEmphChar looks for the next occurrence of the emphasis
// char c in data, skipping over code spans and links. The search starts at
// index 1. It returns the index of c within data, or 0 if none is found.
//
// When a code span or link is left unterminated, the first c seen inside it
// (if any) is returned instead, since the construct was not a real one.
//
// A link target is skipped up to its matching closer: ')' for '(' and ']'
// for '['. A c located inside a complete link target is therefore never
// reported as the emphasis char.
func inlineHelperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		if data[i] == c {
			return i
		}

		// do not count escaped chars (i is always >= 1 here); this only
		// applies to '`' and '[' since c has already been returned above
		if data[i-1] == '\\' {
			i++
			continue
		}

		if data[i] == '`' {
			// skip a code span
			fallback := 0
			i++
			for i < len(data) && data[i] != '`' {
				if fallback == 0 && data[i] == c {
					fallback = i
				}
				i++
			}
			if i >= len(data) {
				return fallback
			}
			i++
			continue
		}

		// data[i] == '[': skip a link, starting with its text
		fallback := 0
		i++
		for i < len(data) && data[i] != ']' {
			if fallback == 0 && data[i] == c {
				fallback = i
			}
			i++
		}
		i++
		for i < len(data) && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n') {
			i++
		}
		if i >= len(data) {
			return fallback
		}

		// pick the closer that matches the opener of the link target
		var closer byte
		switch data[i] {
		case '[':
			closer = ']'
		case '(':
			closer = ')'
		default:
			// not a link
			if fallback > 0 {
				return fallback
			}
			continue
		}

		i++
		for i < len(data) && data[i] != closer {
			if fallback == 0 && data[i] == c {
				fallback = i
			}
			i++
		}
		if i >= len(data) {
			return fallback
		}
		i++
	}
	return 0
}
838
839func inlineHelperEmph1(out *bytes.Buffer, rndr *render, data []byte, c byte) int {
840 i := 0
841
842 if rndr.mk.emphasis == nil {
843 return 0
844 }
845
846 // skip one symbol if coming from emph3
847 if len(data) > 1 && data[0] == c && data[1] == c {
848 i = 1
849 }
850
851 for i < len(data) {
852 length := inlineHelperFindEmphChar(data[i:], c)
853 if length == 0 {
854 return 0
855 }
856 i += length
857 if i >= len(data) {
858 return 0
859 }
860
861 if i+1 < len(data) && data[i+1] == c {
862 i++
863 continue
864 }
865
866 if data[i] == c && !isspace(data[i-1]) {
867
868 if rndr.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
869 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
870 continue
871 }
872 }
873
874 work := bytes.NewBuffer(nil)
875 parseInline(work, rndr, data[:i])
876 r := rndr.mk.emphasis(out, work.Bytes(), rndr.mk.opaque)
877 if r > 0 {
878 return i + 1
879 } else {
880 return 0
881 }
882 }
883 }
884
885 return 0
886}
887
888func inlineHelperEmph2(out *bytes.Buffer, rndr *render, data []byte, c byte) int {
889 render_method := rndr.mk.doubleEmphasis
890 if c == '~' {
891 render_method = rndr.mk.strikethrough
892 }
893
894 if render_method == nil {
895 return 0
896 }
897
898 i := 0
899
900 for i < len(data) {
901 length := inlineHelperFindEmphChar(data[i:], c)
902 if length == 0 {
903 return 0
904 }
905 i += length
906
907 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
908 work := bytes.NewBuffer(nil)
909 parseInline(work, rndr, data[:i])
910 r := render_method(out, work.Bytes(), rndr.mk.opaque)
911 if r > 0 {
912 return i + 2
913 } else {
914 return 0
915 }
916 }
917 i++
918 }
919 return 0
920}
921
922func inlineHelperEmph3(out *bytes.Buffer, rndr *render, data []byte, offset int, c byte) int {
923 i := 0
924 orig_data := data
925 data = data[offset:]
926
927 for i < len(data) {
928 length := inlineHelperFindEmphChar(data[i:], c)
929 if length == 0 {
930 return 0
931 }
932 i += length
933
934 // skip whitespace preceded symbols
935 if data[i] != c || isspace(data[i-1]) {
936 continue
937 }
938
939 switch {
940 case (i+2 < len(data) && data[i+1] == c && data[i+2] == c && rndr.mk.tripleEmphasis != nil):
941 // triple symbol found
942 work := bytes.NewBuffer(nil)
943
944 parseInline(work, rndr, data[:i])
945 r := rndr.mk.tripleEmphasis(out, work.Bytes(), rndr.mk.opaque)
946 if r > 0 {
947 return i + 3
948 } else {
949 return 0
950 }
951 case (i+1 < len(data) && data[i+1] == c):
952 // double symbol found, hand over to emph1
953 length = inlineHelperEmph1(out, rndr, orig_data[offset-2:], c)
954 if length == 0 {
955 return 0
956 } else {
957 return length - 2
958 }
959 default:
960 // single symbol found, hand over to emph2
961 length = inlineHelperEmph2(out, rndr, orig_data[offset-1:], c)
962 if length == 0 {
963 return 0
964 } else {
965 return length - 1
966 }
967 }
968 }
969 return 0
970}