inline.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse inline elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18)
19
// Functions to parse inline elements within a block.
// Each handler returns the number of bytes it consumed; 0 means the
// trigger byte was not handled and is emitted as normal text.
// data is the complete block being rendered.
// offset is the index in data of the byte that triggered the handler.
24
25func (parser *Parser) parseInline(out *bytes.Buffer, data []byte) {
26 // this is called recursively: enforce a maximum depth
27 if parser.nesting >= parser.maxNesting {
28 return
29 }
30 parser.nesting++
31
32 i, end := 0, 0
33 for i < len(data) {
34 // copy inactive chars into the output
35 for end < len(data) && parser.inline[data[end]] == nil {
36 end++
37 }
38
39 parser.r.NormalText(out, data[i:end])
40
41 if end >= len(data) {
42 break
43 }
44 i = end
45
46 // call the trigger
47 handler := parser.inline[data[end]]
48 if consumed := handler(parser, out, data, i); consumed == 0 {
49 // no action from the callback; buffer the byte for later
50 end = i + 1
51 } else {
52 // skip past whatever the callback used
53 i += consumed
54 end = i
55 }
56 }
57
58 parser.nesting--
59}
60
61// single and double emphasis parsing
62func inlineEmphasis(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
63 data = data[offset:]
64 c := data[0]
65 ret := 0
66
67 if len(data) > 2 && data[1] != c {
68 // whitespace cannot follow an opening emphasis;
69 // strikethrough only takes two characters '~~'
70 if c == '~' || isspace(data[1]) {
71 return 0
72 }
73 if ret = inlineHelperEmph1(parser, out, data[1:], c); ret == 0 {
74 return 0
75 }
76
77 return ret + 1
78 }
79
80 if len(data) > 3 && data[1] == c && data[2] != c {
81 if isspace(data[2]) {
82 return 0
83 }
84 if ret = inlineHelperEmph2(parser, out, data[2:], c); ret == 0 {
85 return 0
86 }
87
88 return ret + 2
89 }
90
91 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
92 if c == '~' || isspace(data[3]) {
93 return 0
94 }
95 if ret = inlineHelperEmph3(parser, out, data, 3, c); ret == 0 {
96 return 0
97 }
98
99 return ret + 3
100 }
101
102 return 0
103}
104
105func inlineCodeSpan(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
106 data = data[offset:]
107
108 nb := 0
109
110 // count the number of backticks in the delimiter
111 for nb < len(data) && data[nb] == '`' {
112 nb++
113 }
114
115 // find the next delimiter
116 i, end := 0, 0
117 for end = nb; end < len(data) && i < nb; end++ {
118 if data[end] == '`' {
119 i++
120 } else {
121 i = 0
122 }
123 }
124
125 // no matching delimiter?
126 if i < nb && end >= len(data) {
127 return 0
128 }
129
130 // trim outside whitespace
131 fBegin := nb
132 for fBegin < end && (data[fBegin] == ' ' || data[fBegin] == '\t') {
133 fBegin++
134 }
135
136 fEnd := end - nb
137 for fEnd > fBegin && (data[fEnd-1] == ' ' || data[fEnd-1] == '\t') {
138 fEnd--
139 }
140
141 // render the code span
142 parser.r.CodeSpan(out, data[fBegin:fEnd])
143
144 return end
145
146}
147
148// newline preceded by two spaces becomes <br>
149// newline without two spaces works when EXTENSION_HARD_LINE_BREAK is enabled
150func inlineLineBreak(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
151 // remove trailing spaces from out
152 outBytes := out.Bytes()
153 end := len(outBytes)
154 eol := end
155 for eol > 0 && (outBytes[eol-1] == ' ' || outBytes[eol-1] == '\t') {
156 eol--
157 }
158 out.Truncate(eol)
159
160 // should there be a hard line break here?
161 if parser.flags&EXTENSION_HARD_LINE_BREAK == 0 && end-eol < 2 {
162 return 0
163 }
164
165 parser.r.LineBreak(out)
166 return 1
167}
168
169// '[': parse a link or an image
170func inlineLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
171 // no links allowed inside other links
172 if parser.insideLink {
173 return 0
174 }
175
176 isImg := offset > 0 && data[offset-1] == '!'
177
178 data = data[offset:]
179
180 i := 1
181 var title, link []byte
182 textHasNl := false
183
184 // look for the matching closing bracket
185 for level := 1; level > 0 && i < len(data); i++ {
186 switch {
187 case data[i] == '\n':
188 textHasNl = true
189
190 case data[i-1] == '\\':
191 continue
192
193 case data[i] == '[':
194 level++
195
196 case data[i] == ']':
197 level--
198 if level <= 0 {
199 i-- // compensate for extra i++ in for loop
200 }
201 }
202 }
203
204 if i >= len(data) {
205 return 0
206 }
207
208 txtE := i
209 i++
210
211 // skip any amount of whitespace or newline
212 // (this is much more lax than original markdown syntax)
213 for i < len(data) && isspace(data[i]) {
214 i++
215 }
216
217 // inline style link
218 switch {
219 case i < len(data) && data[i] == '(':
220 // skip initial whitespace
221 i++
222
223 for i < len(data) && isspace(data[i]) {
224 i++
225 }
226
227 linkB := i
228
229 // look for link end: ' " )
230 for i < len(data) {
231 if data[i] == '\\' {
232 i += 2
233 } else {
234 if data[i] == ')' || data[i] == '\'' || data[i] == '"' {
235 break
236 }
237 i++
238 }
239 }
240
241 if i >= len(data) {
242 return 0
243 }
244 linkE := i
245
246 // look for title end if present
247 titleB, titleE := 0, 0
248 if data[i] == '\'' || data[i] == '"' {
249 i++
250 titleB = i
251
252 for i < len(data) {
253 if data[i] == '\\' {
254 i += 2
255 } else {
256 if data[i] == ')' {
257 break
258 }
259 i++
260 }
261 }
262
263 if i >= len(data) {
264 return 0
265 }
266
267 // skip whitespace after title
268 titleE = i - 1
269 for titleE > titleB && isspace(data[titleE]) {
270 titleE--
271 }
272
273 // check for closing quote presence
274 if data[titleE] != '\'' && data[titleE] != '"' {
275 titleB, titleE = 0, 0
276 linkE = i
277 }
278 }
279
280 // remove whitespace at the end of the link
281 for linkE > linkB && isspace(data[linkE-1]) {
282 linkE--
283 }
284
285 // remove optional angle brackets around the link
286 if data[linkB] == '<' {
287 linkB++
288 }
289 if data[linkE-1] == '>' {
290 linkE--
291 }
292
293 // build escaped link and title
294 if linkE > linkB {
295 link = data[linkB:linkE]
296 }
297
298 if titleE > titleB {
299 title = data[titleB:titleE]
300 }
301
302 i++
303
304 // reference style link
305 case i < len(data) && data[i] == '[':
306 var id []byte
307
308 // look for the id
309 i++
310 linkB := i
311 for i < len(data) && data[i] != ']' {
312 i++
313 }
314 if i >= len(data) {
315 return 0
316 }
317 linkE := i
318
319 // find the reference
320 if linkB == linkE {
321 if textHasNl {
322 var b bytes.Buffer
323
324 for j := 1; j < txtE; j++ {
325 switch {
326 case data[j] != '\n':
327 b.WriteByte(data[j])
328 case data[j-1] != ' ':
329 b.WriteByte(' ')
330 }
331 }
332
333 id = b.Bytes()
334 } else {
335 id = data[1:txtE]
336 }
337 } else {
338 id = data[linkB:linkE]
339 }
340
341 // find the reference with matching id (ids are case-insensitive)
342 key := string(bytes.ToLower(id))
343 lr, ok := parser.refs[key]
344 if !ok {
345 return 0
346 }
347
348 // keep link and title from reference
349 link = lr.link
350 title = lr.title
351 i++
352
353 // shortcut reference style link
354 default:
355 var id []byte
356
357 // craft the id
358 if textHasNl {
359 var b bytes.Buffer
360
361 for j := 1; j < txtE; j++ {
362 switch {
363 case data[j] != '\n':
364 b.WriteByte(data[j])
365 case data[j-1] != ' ':
366 b.WriteByte(' ')
367 }
368 }
369
370 id = b.Bytes()
371 } else {
372 id = data[1:txtE]
373 }
374
375 // find the reference with matching id
376 key := string(bytes.ToLower(id))
377 lr, ok := parser.refs[key]
378 if !ok {
379 return 0
380 }
381
382 // keep link and title from reference
383 link = lr.link
384 title = lr.title
385
386 // rewind the whitespace
387 i = txtE + 1
388 }
389
390 // build content: img alt is escaped, link content is parsed
391 var content bytes.Buffer
392 if txtE > 1 {
393 if isImg {
394 content.Write(data[1:txtE])
395 } else {
396 // links cannot contain other links, so turn off link parsing temporarily
397 insideLink := parser.insideLink
398 parser.insideLink = true
399 parser.parseInline(&content, data[1:txtE])
400 parser.insideLink = insideLink
401 }
402 }
403
404 var uLink []byte
405 if len(link) > 0 {
406 var uLinkBuf bytes.Buffer
407 unescapeText(&uLinkBuf, link)
408 uLink = uLinkBuf.Bytes()
409 }
410
411 // links need something to click on and somewhere to go
412 if len(uLink) == 0 || content.Len() == 0 {
413 return 0
414 }
415
416 // call the relevant rendering function
417 if isImg {
418 outSize := out.Len()
419 outBytes := out.Bytes()
420 if outSize > 0 && outBytes[outSize-1] == '!' {
421 out.Truncate(outSize - 1)
422 }
423
424 parser.r.Image(out, uLink, title, content.Bytes())
425 } else {
426 parser.r.Link(out, uLink, title, content.Bytes())
427 }
428
429 return i
430}
431
432// '<' when tags or autolinks are allowed
433func inlineLAngle(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
434 data = data[offset:]
435 altype := LINK_TYPE_NOT_AUTOLINK
436 end := tagLength(data, &altype)
437
438 if end > 2 {
439 if altype != LINK_TYPE_NOT_AUTOLINK {
440 var uLink bytes.Buffer
441 unescapeText(&uLink, data[1:end+1-2])
442 parser.r.AutoLink(out, uLink.Bytes(), altype)
443 } else {
444 parser.r.RawHtmlTag(out, data[:end])
445 }
446 }
447
448 return end
449}
450
451// '\\' backslash escape
452var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
453
454func inlineEscape(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
455 data = data[offset:]
456
457 if len(data) > 1 {
458 if bytes.IndexByte(escapeChars, data[1]) < 0 {
459 return 0
460 }
461
462 parser.r.NormalText(out, data[1:2])
463 }
464
465 return 2
466}
467
// unescapeText copies src into ob with backslash escapes removed: each
// backslash is dropped and the byte after it is copied verbatim (so "\\\\"
// yields a single backslash). A backslash that is the final byte of src is
// dropped.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for len(src) > 0 {
		slash := bytes.IndexByte(src, '\\')
		if slash < 0 {
			// no more escapes: the remainder is literal
			ob.Write(src)
			return
		}

		if slash > 0 {
			ob.Write(src[:slash])
		}

		// backslash with nothing after it
		if slash+1 >= len(src) {
			return
		}

		ob.WriteByte(src[slash+1])
		src = src[slash+2:]
	}
}
488
489// '&' escaped when it doesn't belong to an entity
490// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
491func inlineEntity(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
492 data = data[offset:]
493
494 end := 1
495
496 if end < len(data) && data[end] == '#' {
497 end++
498 }
499
500 for end < len(data) && isalnum(data[end]) {
501 end++
502 }
503
504 if end < len(data) && data[end] == ';' {
505 end++ // real entity
506 } else {
507 return 0 // lone '&'
508 }
509
510 parser.r.Entity(out, data[:end])
511
512 return end
513}
514
515func inlineAutoLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
516 // quick check to rule out most false hits on ':'
517 if parser.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
518 return 0
519 }
520
521 // scan backward for a word boundary
522 rewind := 0
523 for offset-rewind > 0 && rewind <= 7 && !isspace(data[offset-rewind-1]) && !isspace(data[offset-rewind-1]) {
524 rewind++
525 }
526 if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
527 return 0
528 }
529
530 origData := data
531 data = data[offset-rewind:]
532
533 if !isSafeLink(data) {
534 return 0
535 }
536
537 linkEnd := 0
538 for linkEnd < len(data) && !isspace(data[linkEnd]) {
539 linkEnd++
540 }
541
542 // Skip punctuation at the end of the link
543 if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
544 linkEnd--
545 }
546
547 // See if the link finishes with a punctuation sign that can be closed.
548 var copen byte
549 switch data[linkEnd-1] {
550 case '"':
551 copen = '"'
552 case '\'':
553 copen = '\''
554 case ')':
555 copen = '('
556 case ']':
557 copen = '['
558 case '}':
559 copen = '{'
560 default:
561 copen = 0
562 }
563
564 if copen != 0 {
565 bufEnd := offset - rewind + linkEnd - 2
566
567 openDelim := 1
568
569 /* Try to close the final punctuation sign in this same line;
570 * if we managed to close it outside of the URL, that means that it's
571 * not part of the URL. If it closes inside the URL, that means it
572 * is part of the URL.
573 *
574 * Examples:
575 *
576 * foo http://www.pokemon.com/Pikachu_(Electric) bar
577 * => http://www.pokemon.com/Pikachu_(Electric)
578 *
579 * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
580 * => http://www.pokemon.com/Pikachu_(Electric)
581 *
582 * foo http://www.pokemon.com/Pikachu_(Electric)) bar
583 * => http://www.pokemon.com/Pikachu_(Electric))
584 *
585 * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
586 * => foo http://www.pokemon.com/Pikachu_(Electric)
587 */
588
589 for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
590 if origData[bufEnd] == data[linkEnd-1] {
591 openDelim++
592 }
593
594 if origData[bufEnd] == copen {
595 openDelim--
596 }
597
598 bufEnd--
599 }
600
601 if openDelim == 0 {
602 linkEnd--
603 }
604 }
605
606 // we were triggered on the ':', so we need to rewind the output a bit
607 if out.Len() >= rewind {
608 out.Truncate(len(out.Bytes()) - rewind)
609 }
610
611 var uLink bytes.Buffer
612 unescapeText(&uLink, data[:linkEnd])
613
614 parser.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)
615
616 return linkEnd - rewind
617}
618
619var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
620
621func isSafeLink(link []byte) bool {
622 for _, prefix := range validUris {
623 // TODO: handle unicode here
624 // case-insensitive prefix test
625 if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
626 return true
627 }
628 }
629
630 return false
631}
632
633// return the length of the given tag, or 0 is it's not valid
634func tagLength(data []byte, autolink *int) int {
635 var i, j int
636
637 // a valid tag can't be shorter than 3 chars
638 if len(data) < 3 {
639 return 0
640 }
641
642 // begins with a '<' optionally followed by '/', followed by letter or number
643 if data[0] != '<' {
644 return 0
645 }
646 if data[1] == '/' {
647 i = 2
648 } else {
649 i = 1
650 }
651
652 if !isalnum(data[i]) {
653 return 0
654 }
655
656 // scheme test
657 *autolink = LINK_TYPE_NOT_AUTOLINK
658
659 // try to find the beginning of an URI
660 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
661 i++
662 }
663
664 if i > 1 && data[i] == '@' {
665 if j = isMailtoAutoLink(data[i:]); j != 0 {
666 *autolink = LINK_TYPE_EMAIL
667 return i + j
668 }
669 }
670
671 if i > 2 && data[i] == ':' {
672 *autolink = LINK_TYPE_NORMAL
673 i++
674 }
675
676 // complete autolink test: no whitespace or ' or "
677 switch {
678 case i >= len(data):
679 *autolink = LINK_TYPE_NOT_AUTOLINK
680 case *autolink != 0:
681 j = i
682
683 for i < len(data) {
684 if data[i] == '\\' {
685 i += 2
686 } else {
687 if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
688 break
689 } else {
690 i++
691 }
692 }
693
694 }
695
696 if i >= len(data) {
697 return 0
698 }
699 if i > j && data[i] == '>' {
700 return i + 1
701 }
702
703 // one of the forbidden chars has been found
704 *autolink = LINK_TYPE_NOT_AUTOLINK
705 }
706
707 // look for something looking like a tag end
708 for i < len(data) && data[i] != '>' {
709 i++
710 }
711 if i >= len(data) {
712 return 0
713 }
714 return i + 1
715}
716
717// look for the address part of a mail autolink and '>'
718// this is less strict than the original markdown e-mail address matching
719func isMailtoAutoLink(data []byte) int {
720 nb := 0
721
722 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
723 for i := 0; i < len(data); i++ {
724 if isalnum(data[i]) {
725 continue
726 }
727
728 switch data[i] {
729 case '@':
730 nb++
731
732 case '-', '.', '_':
733 break
734
735 case '>':
736 if nb == 1 {
737 return i + 1
738 } else {
739 return 0
740 }
741 default:
742 return 0
743 }
744 }
745
746 return 0
747}
748
// look for the next emph char, skipping other constructs
//
// inlineHelperFindEmphChar returns the index (>= 1) of the next occurrence
// of c in data, or 0 if there is none. The scan starts at index 1. Code spans
// (`...`) and links ([text](target) or [text][id]) are skipped so that an
// occurrence of c inside them is not reported, unless the construct is left
// unterminated, in which case the first c seen inside it is returned. A '`'
// or '[' preceded by a backslash is not treated as the start of a construct.
func inlineHelperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		if data[i] == c {
			return i
		}

		// do not count escaped chars (i is always >= 1 here)
		if data[i-1] == '\\' {
			i++
			continue
		}

		switch data[i] {
		case '`':
			// skip a code span, remembering the first c seen inside it
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++

		case '[':
			// skip a link: first the bracketed text...
			tmpI := 0
			i++
			for i < len(data) && data[i] != ']' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			i++
			for i < len(data) && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n') {
				i++
			}
			if i >= len(data) {
				return tmpI
			}

			// ...then the target, which ends at the delimiter that
			// closes the one that opened it
			var cc byte
			switch data[i] {
			case '[':
				cc = ']'
			case '(':
				cc = ')'
			default:
				// not a link
				if tmpI > 0 {
					return tmpI
				}
				continue
			}

			i++
			for i < len(data) && data[i] != cc {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		}
	}
	return 0
}
826
827func inlineHelperEmph1(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
828 i := 0
829
830 // skip one symbol if coming from emph3
831 if len(data) > 1 && data[0] == c && data[1] == c {
832 i = 1
833 }
834
835 for i < len(data) {
836 length := inlineHelperFindEmphChar(data[i:], c)
837 if length == 0 {
838 return 0
839 }
840 i += length
841 if i >= len(data) {
842 return 0
843 }
844
845 if i+1 < len(data) && data[i+1] == c {
846 i++
847 continue
848 }
849
850 if data[i] == c && !isspace(data[i-1]) {
851
852 if parser.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
853 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
854 continue
855 }
856 }
857
858 var work bytes.Buffer
859 parser.parseInline(&work, data[:i])
860 parser.r.Emphasis(out, work.Bytes())
861 return i + 1
862 }
863 }
864
865 return 0
866}
867
868func inlineHelperEmph2(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
869 i := 0
870
871 for i < len(data) {
872 length := inlineHelperFindEmphChar(data[i:], c)
873 if length == 0 {
874 return 0
875 }
876 i += length
877
878 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
879 var work bytes.Buffer
880 parser.parseInline(&work, data[:i])
881
882 // pick the right renderer
883 if c == '~' {
884 parser.r.StrikeThrough(out, work.Bytes())
885 } else {
886 parser.r.DoubleEmphasis(out, work.Bytes())
887 }
888 return i + 2
889 }
890 i++
891 }
892 return 0
893}
894
895func inlineHelperEmph3(parser *Parser, out *bytes.Buffer, data []byte, offset int, c byte) int {
896 i := 0
897 origData := data
898 data = data[offset:]
899
900 for i < len(data) {
901 length := inlineHelperFindEmphChar(data[i:], c)
902 if length == 0 {
903 return 0
904 }
905 i += length
906
907 // skip whitespace preceded symbols
908 if data[i] != c || isspace(data[i-1]) {
909 continue
910 }
911
912 switch {
913 case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
914 // triple symbol found
915 var work bytes.Buffer
916
917 parser.parseInline(&work, data[:i])
918 parser.r.TripleEmphasis(out, work.Bytes())
919 return i + 3
920 case (i+1 < len(data) && data[i+1] == c):
921 // double symbol found, hand over to emph1
922 length = inlineHelperEmph1(parser, out, origData[offset-2:], c)
923 if length == 0 {
924 return 0
925 } else {
926 return length - 2
927 }
928 default:
929 // single symbol found, hand over to emph2
930 length = inlineHelperEmph2(parser, out, origData[offset-1:], c)
931 if length == 0 {
932 return 0
933 } else {
934 return length - 1
935 }
936 }
937 }
938 return 0
939}