inline.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse inline elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18)
19
20// Functions to parse text within a block
21// Each function returns the number of chars taken care of
22// data is the complete block being rendered
23// offset is the number of valid chars before the current cursor
24
25func (p *parser) inline(out *bytes.Buffer, data []byte) {
26 // this is called recursively: enforce a maximum depth
27 if p.nesting >= p.maxNesting {
28 return
29 }
30 p.nesting++
31
32 i, end := 0, 0
33 for i < len(data) {
34 // copy inactive chars into the output
35 for end < len(data) && p.inlineCallback[data[end]] == nil {
36 end++
37 }
38
39 p.r.NormalText(out, data[i:end])
40
41 if end >= len(data) {
42 break
43 }
44 i = end
45
46 // call the trigger
47 handler := p.inlineCallback[data[end]]
48 if consumed := handler(p, out, data, i); consumed == 0 {
49 // no action from the callback; buffer the byte for later
50 end = i + 1
51 } else {
52 // skip past whatever the callback used
53 i += consumed
54 end = i
55 }
56 }
57
58 p.nesting--
59}
60
61// single and double emphasis parsing
62func emphasis(p *parser, out *bytes.Buffer, data []byte, offset int) int {
63 data = data[offset:]
64 c := data[0]
65 ret := 0
66
67 if len(data) > 2 && data[1] != c {
68 // whitespace cannot follow an opening emphasis;
69 // strikethrough only takes two characters '~~'
70 if c == '~' || isspace(data[1]) {
71 return 0
72 }
73 if ret = helperEmphasis(p, out, data[1:], c); ret == 0 {
74 return 0
75 }
76
77 return ret + 1
78 }
79
80 if len(data) > 3 && data[1] == c && data[2] != c {
81 if isspace(data[2]) {
82 return 0
83 }
84 if ret = helperDoubleEmphasis(p, out, data[2:], c); ret == 0 {
85 return 0
86 }
87
88 return ret + 2
89 }
90
91 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
92 if c == '~' || isspace(data[3]) {
93 return 0
94 }
95 if ret = helperTripleEmphasis(p, out, data, 3, c); ret == 0 {
96 return 0
97 }
98
99 return ret + 3
100 }
101
102 return 0
103}
104
105func codeSpan(p *parser, out *bytes.Buffer, data []byte, offset int) int {
106 data = data[offset:]
107
108 nb := 0
109
110 // count the number of backticks in the delimiter
111 for nb < len(data) && data[nb] == '`' {
112 nb++
113 }
114
115 // find the next delimiter
116 i, end := 0, 0
117 for end = nb; end < len(data) && i < nb; end++ {
118 if data[end] == '`' {
119 i++
120 } else {
121 i = 0
122 }
123 }
124
125 // no matching delimiter?
126 if i < nb && end >= len(data) {
127 return 0
128 }
129
130 // trim outside whitespace
131 fBegin := nb
132 for fBegin < end && data[fBegin] == ' ' {
133 fBegin++
134 }
135
136 fEnd := end - nb
137 for fEnd > fBegin && data[fEnd-1] == ' ' {
138 fEnd--
139 }
140
141 // render the code span
142 if fBegin != fEnd {
143 p.r.CodeSpan(out, data[fBegin:fEnd])
144 }
145
146 return end
147
148}
149
150// newline preceded by two spaces becomes <br>
151// newline without two spaces works when EXTENSION_HARD_LINE_BREAK is enabled
152func lineBreak(p *parser, out *bytes.Buffer, data []byte, offset int) int {
153 // remove trailing spaces from out
154 outBytes := out.Bytes()
155 end := len(outBytes)
156 eol := end
157 for eol > 0 && outBytes[eol-1] == ' ' {
158 eol--
159 }
160 out.Truncate(eol)
161
162 // should there be a hard line break here?
163 if p.flags&EXTENSION_HARD_LINE_BREAK == 0 && end-eol < 2 {
164 return 0
165 }
166
167 p.r.LineBreak(out)
168 return 1
169}
170
171// '[': parse a link or an image
172func link(p *parser, out *bytes.Buffer, data []byte, offset int) int {
173 // no links allowed inside other links
174 if p.insideLink {
175 return 0
176 }
177
178 isImg := offset > 0 && data[offset-1] == '!'
179
180 data = data[offset:]
181
182 i := 1
183 var title, link []byte
184 textHasNl := false
185
186 // look for the matching closing bracket
187 for level := 1; level > 0 && i < len(data); i++ {
188 switch {
189 case data[i] == '\n':
190 textHasNl = true
191
192 case data[i-1] == '\\':
193 continue
194
195 case data[i] == '[':
196 level++
197
198 case data[i] == ']':
199 level--
200 if level <= 0 {
201 i-- // compensate for extra i++ in for loop
202 }
203 }
204 }
205
206 if i >= len(data) {
207 return 0
208 }
209
210 txtE := i
211 i++
212
213 // skip any amount of whitespace or newline
214 // (this is much more lax than original markdown syntax)
215 for i < len(data) && isspace(data[i]) {
216 i++
217 }
218
219 // inline style link
220 switch {
221 case i < len(data) && data[i] == '(':
222 // skip initial whitespace
223 i++
224
225 for i < len(data) && isspace(data[i]) {
226 i++
227 }
228
229 linkB := i
230
231 // look for link end: ' " )
232 findlinkend:
233 for i < len(data) {
234 switch {
235 case data[i] == '\\':
236 i += 2
237
238 case data[i] == ')' || data[i] == '\'' || data[i] == '"':
239 break findlinkend
240
241 default:
242 i++
243 }
244 }
245
246 if i >= len(data) {
247 return 0
248 }
249 linkE := i
250
251 // look for title end if present
252 titleB, titleE := 0, 0
253 if data[i] == '\'' || data[i] == '"' {
254 i++
255 titleB = i
256
257 findtitleend:
258 for i < len(data) {
259 switch {
260 case data[i] == '\\':
261 i += 2
262
263 case data[i] == ')':
264 break findtitleend
265
266 default:
267 i++
268 }
269 }
270
271 if i >= len(data) {
272 return 0
273 }
274
275 // skip whitespace after title
276 titleE = i - 1
277 for titleE > titleB && isspace(data[titleE]) {
278 titleE--
279 }
280
281 // check for closing quote presence
282 if data[titleE] != '\'' && data[titleE] != '"' {
283 titleB, titleE = 0, 0
284 linkE = i
285 }
286 }
287
288 // remove whitespace at the end of the link
289 for linkE > linkB && isspace(data[linkE-1]) {
290 linkE--
291 }
292
293 // remove optional angle brackets around the link
294 if data[linkB] == '<' {
295 linkB++
296 }
297 if data[linkE-1] == '>' {
298 linkE--
299 }
300
301 // build escaped link and title
302 if linkE > linkB {
303 link = data[linkB:linkE]
304 }
305
306 if titleE > titleB {
307 title = data[titleB:titleE]
308 }
309
310 i++
311
312 // reference style link
313 case i < len(data) && data[i] == '[':
314 var id []byte
315
316 // look for the id
317 i++
318 linkB := i
319 for i < len(data) && data[i] != ']' {
320 i++
321 }
322 if i >= len(data) {
323 return 0
324 }
325 linkE := i
326
327 // find the reference
328 if linkB == linkE {
329 if textHasNl {
330 var b bytes.Buffer
331
332 for j := 1; j < txtE; j++ {
333 switch {
334 case data[j] != '\n':
335 b.WriteByte(data[j])
336 case data[j-1] != ' ':
337 b.WriteByte(' ')
338 }
339 }
340
341 id = b.Bytes()
342 } else {
343 id = data[1:txtE]
344 }
345 } else {
346 id = data[linkB:linkE]
347 }
348
349 // find the reference with matching id (ids are case-insensitive)
350 key := string(bytes.ToLower(id))
351 lr, ok := p.refs[key]
352 if !ok {
353 return 0
354 }
355
356 // keep link and title from reference
357 link = lr.link
358 title = lr.title
359 i++
360
361 // shortcut reference style link
362 default:
363 var id []byte
364
365 // craft the id
366 if textHasNl {
367 var b bytes.Buffer
368
369 for j := 1; j < txtE; j++ {
370 switch {
371 case data[j] != '\n':
372 b.WriteByte(data[j])
373 case data[j-1] != ' ':
374 b.WriteByte(' ')
375 }
376 }
377
378 id = b.Bytes()
379 } else {
380 id = data[1:txtE]
381 }
382
383 // find the reference with matching id
384 key := string(bytes.ToLower(id))
385 lr, ok := p.refs[key]
386 if !ok {
387 return 0
388 }
389
390 // keep link and title from reference
391 link = lr.link
392 title = lr.title
393
394 // rewind the whitespace
395 i = txtE + 1
396 }
397
398 // build content: img alt is escaped, link content is parsed
399 var content bytes.Buffer
400 if txtE > 1 {
401 if isImg {
402 content.Write(data[1:txtE])
403 } else {
404 // links cannot contain other links, so turn off link parsing temporarily
405 insideLink := p.insideLink
406 p.insideLink = true
407 p.inline(&content, data[1:txtE])
408 p.insideLink = insideLink
409 }
410 }
411
412 var uLink []byte
413 if len(link) > 0 {
414 var uLinkBuf bytes.Buffer
415 unescapeText(&uLinkBuf, link)
416 uLink = uLinkBuf.Bytes()
417 }
418
419 // links need something to click on and somewhere to go
420 if len(uLink) == 0 || (!isImg && content.Len() == 0) {
421 return 0
422 }
423
424 // call the relevant rendering function
425 if isImg {
426 outSize := out.Len()
427 outBytes := out.Bytes()
428 if outSize > 0 && outBytes[outSize-1] == '!' {
429 out.Truncate(outSize - 1)
430 }
431
432 p.r.Image(out, uLink, title, content.Bytes())
433 } else {
434 p.r.Link(out, uLink, title, content.Bytes())
435 }
436
437 return i
438}
439
440// '<' when tags or autolinks are allowed
441func leftAngle(p *parser, out *bytes.Buffer, data []byte, offset int) int {
442 data = data[offset:]
443 altype := LINK_TYPE_NOT_AUTOLINK
444 end := tagLength(data, &altype)
445
446 if end > 2 {
447 if altype != LINK_TYPE_NOT_AUTOLINK {
448 var uLink bytes.Buffer
449 unescapeText(&uLink, data[1:end+1-2])
450 if uLink.Len() > 0 {
451 p.r.AutoLink(out, uLink.Bytes(), altype)
452 }
453 } else {
454 p.r.RawHtmlTag(out, data[:end])
455 }
456 }
457
458 return end
459}
460
461// '\\' backslash escape
462var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
463
464func escape(p *parser, out *bytes.Buffer, data []byte, offset int) int {
465 data = data[offset:]
466
467 if len(data) > 1 {
468 if bytes.IndexByte(escapeChars, data[1]) < 0 {
469 return 0
470 }
471
472 p.r.NormalText(out, data[1:2])
473 }
474
475 return 2
476}
477
// unescapeText copies src into ob with backslash escapes removed: each
// backslash is dropped and the byte after it is copied verbatim. A backslash
// that is the final byte of src is dropped without a replacement.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for pos := 0; pos < len(src); {
		// locate the next backslash, if any
		run := bytes.IndexByte(src[pos:], '\\')
		if run < 0 {
			// no more escapes: the rest is plain text
			ob.Write(src[pos:])
			return
		}

		// copy the plain bytes in front of the backslash
		if run > 0 {
			ob.Write(src[pos : pos+run])
		}

		escaped := pos + run + 1
		if escaped >= len(src) {
			// dangling backslash at the very end
			return
		}

		ob.WriteByte(src[escaped])
		pos = escaped + 1
	}
}
498
499// '&' escaped when it doesn't belong to an entity
500// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
501func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int {
502 data = data[offset:]
503
504 end := 1
505
506 if end < len(data) && data[end] == '#' {
507 end++
508 }
509
510 for end < len(data) && isalnum(data[end]) {
511 end++
512 }
513
514 if end < len(data) && data[end] == ';' {
515 end++ // real entity
516 } else {
517 return 0 // lone '&'
518 }
519
520 p.r.Entity(out, data[:end])
521
522 return end
523}
524
525func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
526 // quick check to rule out most false hits on ':'
527 if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
528 return 0
529 }
530
531 // scan backward for a word boundary
532 rewind := 0
533 for offset-rewind > 0 && rewind <= 7 && !isspace(data[offset-rewind-1]) && !isspace(data[offset-rewind-1]) {
534 rewind++
535 }
536 if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
537 return 0
538 }
539
540 origData := data
541 data = data[offset-rewind:]
542
543 if !isSafeLink(data) {
544 return 0
545 }
546
547 linkEnd := 0
548 for linkEnd < len(data) && !isspace(data[linkEnd]) {
549 linkEnd++
550 }
551
552 // Skip punctuation at the end of the link
553 if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
554 linkEnd--
555 }
556
557 // See if the link finishes with a punctuation sign that can be closed.
558 var copen byte
559 switch data[linkEnd-1] {
560 case '"':
561 copen = '"'
562 case '\'':
563 copen = '\''
564 case ')':
565 copen = '('
566 case ']':
567 copen = '['
568 case '}':
569 copen = '{'
570 default:
571 copen = 0
572 }
573
574 if copen != 0 {
575 bufEnd := offset - rewind + linkEnd - 2
576
577 openDelim := 1
578
579 /* Try to close the final punctuation sign in this same line;
580 * if we managed to close it outside of the URL, that means that it's
581 * not part of the URL. If it closes inside the URL, that means it
582 * is part of the URL.
583 *
584 * Examples:
585 *
586 * foo http://www.pokemon.com/Pikachu_(Electric) bar
587 * => http://www.pokemon.com/Pikachu_(Electric)
588 *
589 * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
590 * => http://www.pokemon.com/Pikachu_(Electric)
591 *
592 * foo http://www.pokemon.com/Pikachu_(Electric)) bar
593 * => http://www.pokemon.com/Pikachu_(Electric))
594 *
595 * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
596 * => foo http://www.pokemon.com/Pikachu_(Electric)
597 */
598
599 for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
600 if origData[bufEnd] == data[linkEnd-1] {
601 openDelim++
602 }
603
604 if origData[bufEnd] == copen {
605 openDelim--
606 }
607
608 bufEnd--
609 }
610
611 if openDelim == 0 {
612 linkEnd--
613 }
614 }
615
616 // we were triggered on the ':', so we need to rewind the output a bit
617 if out.Len() >= rewind {
618 out.Truncate(len(out.Bytes()) - rewind)
619 }
620
621 var uLink bytes.Buffer
622 unescapeText(&uLink, data[:linkEnd])
623
624 if uLink.Len() > 0 {
625 p.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)
626 }
627
628 return linkEnd - rewind
629}
630
631var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
632
633func isSafeLink(link []byte) bool {
634 for _, prefix := range validUris {
635 // TODO: handle unicode here
636 // case-insensitive prefix test
637 if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
638 return true
639 }
640 }
641
642 return false
643}
644
645// return the length of the given tag, or 0 is it's not valid
646func tagLength(data []byte, autolink *int) int {
647 var i, j int
648
649 // a valid tag can't be shorter than 3 chars
650 if len(data) < 3 {
651 return 0
652 }
653
654 // begins with a '<' optionally followed by '/', followed by letter or number
655 if data[0] != '<' {
656 return 0
657 }
658 if data[1] == '/' {
659 i = 2
660 } else {
661 i = 1
662 }
663
664 if !isalnum(data[i]) {
665 return 0
666 }
667
668 // scheme test
669 *autolink = LINK_TYPE_NOT_AUTOLINK
670
671 // try to find the beginning of an URI
672 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
673 i++
674 }
675
676 if i > 1 && i < len(data) && data[i] == '@' {
677 if j = isMailtoAutoLink(data[i:]); j != 0 {
678 *autolink = LINK_TYPE_EMAIL
679 return i + j
680 }
681 }
682
683 if i > 2 && i < len(data) && data[i] == ':' {
684 *autolink = LINK_TYPE_NORMAL
685 i++
686 }
687
688 // complete autolink test: no whitespace or ' or "
689 switch {
690 case i >= len(data):
691 *autolink = LINK_TYPE_NOT_AUTOLINK
692 case *autolink != 0:
693 j = i
694
695 for i < len(data) {
696 if data[i] == '\\' {
697 i += 2
698 } else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
699 break
700 } else {
701 i++
702 }
703
704 }
705
706 if i >= len(data) {
707 return 0
708 }
709 if i > j && data[i] == '>' {
710 return i + 1
711 }
712
713 // one of the forbidden chars has been found
714 *autolink = LINK_TYPE_NOT_AUTOLINK
715 }
716
717 // look for something looking like a tag end
718 for i < len(data) && data[i] != '>' {
719 i++
720 }
721 if i >= len(data) {
722 return 0
723 }
724 return i + 1
725}
726
727// look for the address part of a mail autolink and '>'
728// this is less strict than the original markdown e-mail address matching
729func isMailtoAutoLink(data []byte) int {
730 nb := 0
731
732 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
733 for i := 0; i < len(data); i++ {
734 if isalnum(data[i]) {
735 continue
736 }
737
738 switch data[i] {
739 case '@':
740 nb++
741
742 case '-', '.', '_':
743 break
744
745 case '>':
746 if nb == 1 {
747 return i + 1
748 } else {
749 return 0
750 }
751 default:
752 return 0
753 }
754 }
755
756 return 0
757}
758
// helperFindEmphChar looks for the next occurrence of the emphasis byte c in
// data, skipping code spans and links, and returns its index. The search
// starts at index 1; 0 means nothing was found.
//
// A byte preceded by an odd number of backslashes is escaped and is never
// reported, nor does it open a code span or link. When a code span or link
// turns out to be unterminated, the first c seen inside it is returned
// instead (0 if there was none).
func helperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		// run ahead to the next byte of interest
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}

		// do not count escaped chars: an odd run of backslashes directly in
		// front escapes the byte, an even run is itself escaped backslashes
		slashes := 0
		for slashes < i && data[i-1-slashes] == '\\' {
			slashes++
		}
		if slashes%2 == 1 {
			i++
			continue
		}

		if data[i] == c {
			return i
		}

		if data[i] == '`' {
			// skip a code span
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		} else if data[i] == '[' {
			// skip a link: first the bracketed text
			tmpI := 0
			i++
			for i < len(data) && data[i] != ']' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			i++
			for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			if data[i] != '[' && data[i] != '(' { // not a link
				if tmpI > 0 {
					return tmpI
				}
				continue
			}

			// then the target, which ends at the matching closing byte
			closer := byte(')')
			if data[i] == '[' {
				closer = ']'
			}
			i++
			for i < len(data) && data[i] != closer {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		}
	}
	return 0
}
834
835func helperEmphasis(p *parser, out *bytes.Buffer, data []byte, c byte) int {
836 i := 0
837
838 // skip one symbol if coming from emph3
839 if len(data) > 1 && data[0] == c && data[1] == c {
840 i = 1
841 }
842
843 for i < len(data) {
844 length := helperFindEmphChar(data[i:], c)
845 if length == 0 {
846 return 0
847 }
848 i += length
849 if i >= len(data) {
850 return 0
851 }
852
853 if i+1 < len(data) && data[i+1] == c {
854 i++
855 continue
856 }
857
858 if data[i] == c && !isspace(data[i-1]) {
859
860 if p.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
861 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
862 continue
863 }
864 }
865
866 var work bytes.Buffer
867 p.inline(&work, data[:i])
868 p.r.Emphasis(out, work.Bytes())
869 return i + 1
870 }
871 }
872
873 return 0
874}
875
876func helperDoubleEmphasis(p *parser, out *bytes.Buffer, data []byte, c byte) int {
877 i := 0
878
879 for i < len(data) {
880 length := helperFindEmphChar(data[i:], c)
881 if length == 0 {
882 return 0
883 }
884 i += length
885
886 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
887 var work bytes.Buffer
888 p.inline(&work, data[:i])
889
890 if work.Len() > 0 {
891 // pick the right renderer
892 if c == '~' {
893 p.r.StrikeThrough(out, work.Bytes())
894 } else {
895 p.r.DoubleEmphasis(out, work.Bytes())
896 }
897 }
898 return i + 2
899 }
900 i++
901 }
902 return 0
903}
904
905func helperTripleEmphasis(p *parser, out *bytes.Buffer, data []byte, offset int, c byte) int {
906 i := 0
907 origData := data
908 data = data[offset:]
909
910 for i < len(data) {
911 length := helperFindEmphChar(data[i:], c)
912 if length == 0 {
913 return 0
914 }
915 i += length
916
917 // skip whitespace preceded symbols
918 if data[i] != c || isspace(data[i-1]) {
919 continue
920 }
921
922 switch {
923 case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
924 // triple symbol found
925 var work bytes.Buffer
926
927 p.inline(&work, data[:i])
928 if work.Len() > 0 {
929 p.r.TripleEmphasis(out, work.Bytes())
930 }
931 return i + 3
932 case (i+1 < len(data) && data[i+1] == c):
933 // double symbol found, hand over to emph1
934 length = helperEmphasis(p, out, origData[offset-2:], c)
935 if length == 0 {
936 return 0
937 } else {
938 return length - 2
939 }
940 default:
941 // single symbol found, hand over to emph2
942 length = helperDoubleEmphasis(p, out, origData[offset-1:], c)
943 if length == 0 {
944 return 0
945 } else {
946 return length - 1
947 }
948 }
949 }
950 return 0
951}