inline.go (view raw)
//
// Blackfriday Markdown Processor
// Available at http://github.com/russross/blackfriday
//
// Copyright © 2011 Russ Ross <russ@russross.com>.
// Distributed under the Simplified BSD License.
// See README.md for details.
//

//
// Functions to parse inline elements.
//
13
14package blackfriday
15
16import (
17 "bytes"
18)
19
// Functions to parse text within a block.
// Each function returns the number of chars taken care of.
// data is the complete block being rendered.
// offset is the number of valid chars before the current cursor.
24
25func (p *parser) inline(out *bytes.Buffer, data []byte) {
26 // this is called recursively: enforce a maximum depth
27 if p.nesting >= p.maxNesting {
28 return
29 }
30 p.nesting++
31
32 i, end := 0, 0
33 for i < len(data) {
34 // copy inactive chars into the output
35 for end < len(data) && p.inlineCallback[data[end]] == nil {
36 end++
37 }
38
39 p.r.NormalText(out, data[i:end])
40
41 if end >= len(data) {
42 break
43 }
44 i = end
45
46 // call the trigger
47 handler := p.inlineCallback[data[end]]
48 if consumed := handler(p, out, data, i); consumed == 0 {
49 // no action from the callback; buffer the byte for later
50 end = i + 1
51 } else {
52 // skip past whatever the callback used
53 i += consumed
54 end = i
55 }
56 }
57
58 p.nesting--
59}
60
61// single and double emphasis parsing
62func emphasis(p *parser, out *bytes.Buffer, data []byte, offset int) int {
63 data = data[offset:]
64 c := data[0]
65 ret := 0
66
67 if len(data) > 2 && data[1] != c {
68 // whitespace cannot follow an opening emphasis;
69 // strikethrough only takes two characters '~~'
70 if c == '~' || isspace(data[1]) {
71 return 0
72 }
73 if ret = helperEmphasis(p, out, data[1:], c); ret == 0 {
74 return 0
75 }
76
77 return ret + 1
78 }
79
80 if len(data) > 3 && data[1] == c && data[2] != c {
81 if isspace(data[2]) {
82 return 0
83 }
84 if ret = helperDoubleEmphasis(p, out, data[2:], c); ret == 0 {
85 return 0
86 }
87
88 return ret + 2
89 }
90
91 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
92 if c == '~' || isspace(data[3]) {
93 return 0
94 }
95 if ret = helperTripleEmphasis(p, out, data, 3, c); ret == 0 {
96 return 0
97 }
98
99 return ret + 3
100 }
101
102 return 0
103}
104
105func codeSpan(p *parser, out *bytes.Buffer, data []byte, offset int) int {
106 data = data[offset:]
107
108 nb := 0
109
110 // count the number of backticks in the delimiter
111 for nb < len(data) && data[nb] == '`' {
112 nb++
113 }
114
115 // find the next delimiter
116 i, end := 0, 0
117 for end = nb; end < len(data) && i < nb; end++ {
118 if data[end] == '`' {
119 i++
120 } else {
121 i = 0
122 }
123 }
124
125 // no matching delimiter?
126 if i < nb && end >= len(data) {
127 return 0
128 }
129
130 // trim outside whitespace
131 fBegin := nb
132 for fBegin < end && data[fBegin] == ' ' {
133 fBegin++
134 }
135
136 fEnd := end - nb
137 for fEnd > fBegin && data[fEnd-1] == ' ' {
138 fEnd--
139 }
140
141 // render the code span
142 if fBegin != fEnd {
143 p.r.CodeSpan(out, data[fBegin:fEnd])
144 }
145
146 return end
147
148}
149
150// newline preceded by two spaces becomes <br>
151// newline without two spaces works when EXTENSION_HARD_LINE_BREAK is enabled
152func lineBreak(p *parser, out *bytes.Buffer, data []byte, offset int) int {
153 // remove trailing spaces from out
154 outBytes := out.Bytes()
155 end := len(outBytes)
156 eol := end
157 for eol > 0 && outBytes[eol-1] == ' ' {
158 eol--
159 }
160 out.Truncate(eol)
161
162 // should there be a hard line break here?
163 if p.flags&EXTENSION_HARD_LINE_BREAK == 0 && end-eol < 2 {
164 return 0
165 }
166
167 p.r.LineBreak(out)
168 return 1
169}
170
// linkType identifies which bracketed construct the link parser is handling.
type linkType int

const (
	// linkNormal is a regular link: [text]
	linkNormal linkType = iota
	// linkImg is an image: ![alt]
	linkImg
	// linkDeferredFootnote is a deferred footnote reference: [^refId]
	linkDeferredFootnote

	// linkInlineFootnote
)
180
181// '[': parse a link or an image or a footnote
182func link(p *parser, out *bytes.Buffer, data []byte, offset int) int {
183 // no links allowed inside other links
184 if p.insideLink {
185 return 0
186 }
187
188 // [text] == regular link
189 // ![alt] == image
190 // ^[text] == inline footnote
191 // [^refId] == deferred footnote
192 var t linkType
193 if offset > 0 && data[offset-1] == '!' {
194 t = linkImg
195 } else if p.flags&EXTENSION_FOOTNOTES != 0 {
196 if len(data)-1 > offset && data[offset+1] == '^' {
197 t = linkDeferredFootnote
198 }
199 }
200
201 data = data[offset:]
202
203 var (
204 i = 1
205 noteId int
206 title, link []byte
207 textHasNl = false
208 )
209
210 if t == linkDeferredFootnote {
211 i++
212 }
213
214 // look for the matching closing bracket
215 for level := 1; level > 0 && i < len(data); i++ {
216 switch {
217 case data[i] == '\n':
218 textHasNl = true
219
220 case data[i-1] == '\\':
221 continue
222
223 case data[i] == '[':
224 level++
225
226 case data[i] == ']':
227 level--
228 if level <= 0 {
229 i-- // compensate for extra i++ in for loop
230 }
231 }
232 }
233
234 if i >= len(data) {
235 return 0
236 }
237
238 txtE := i
239 i++
240
241 // skip any amount of whitespace or newline
242 // (this is much more lax than original markdown syntax)
243 for i < len(data) && isspace(data[i]) {
244 i++
245 }
246
247 // inline style link
248 switch {
249 case i < len(data) && data[i] == '(':
250 // skip initial whitespace
251 i++
252
253 for i < len(data) && isspace(data[i]) {
254 i++
255 }
256
257 linkB := i
258
259 // look for link end: ' " )
260 findlinkend:
261 for i < len(data) {
262 switch {
263 case data[i] == '\\':
264 i += 2
265
266 case data[i] == ')' || data[i] == '\'' || data[i] == '"':
267 break findlinkend
268
269 default:
270 i++
271 }
272 }
273
274 if i >= len(data) {
275 return 0
276 }
277 linkE := i
278
279 // look for title end if present
280 titleB, titleE := 0, 0
281 if data[i] == '\'' || data[i] == '"' {
282 i++
283 titleB = i
284
285 findtitleend:
286 for i < len(data) {
287 switch {
288 case data[i] == '\\':
289 i += 2
290
291 case data[i] == ')':
292 break findtitleend
293
294 default:
295 i++
296 }
297 }
298
299 if i >= len(data) {
300 return 0
301 }
302
303 // skip whitespace after title
304 titleE = i - 1
305 for titleE > titleB && isspace(data[titleE]) {
306 titleE--
307 }
308
309 // check for closing quote presence
310 if data[titleE] != '\'' && data[titleE] != '"' {
311 titleB, titleE = 0, 0
312 linkE = i
313 }
314 }
315
316 // remove whitespace at the end of the link
317 for linkE > linkB && isspace(data[linkE-1]) {
318 linkE--
319 }
320
321 // remove optional angle brackets around the link
322 if data[linkB] == '<' {
323 linkB++
324 }
325 if data[linkE-1] == '>' {
326 linkE--
327 }
328
329 // build escaped link and title
330 if linkE > linkB {
331 link = data[linkB:linkE]
332 }
333
334 if titleE > titleB {
335 title = data[titleB:titleE]
336 }
337
338 i++
339
340 // reference style link
341 case i < len(data) && data[i] == '[':
342 var id []byte
343
344 // look for the id
345 i++
346 linkB := i
347 for i < len(data) && data[i] != ']' {
348 i++
349 }
350 if i >= len(data) {
351 return 0
352 }
353 linkE := i
354
355 // find the reference
356 if linkB == linkE {
357 if textHasNl {
358 var b bytes.Buffer
359
360 for j := 1; j < txtE; j++ {
361 switch {
362 case data[j] != '\n':
363 b.WriteByte(data[j])
364 case data[j-1] != ' ':
365 b.WriteByte(' ')
366 }
367 }
368
369 id = b.Bytes()
370 } else {
371 id = data[1:txtE]
372 }
373 } else {
374 id = data[linkB:linkE]
375 }
376
377 // find the reference with matching id (ids are case-insensitive)
378 key := string(bytes.ToLower(id))
379 lr, ok := p.refs[key]
380 if !ok {
381 return 0
382
383 }
384
385 // keep link and title from reference
386 link = lr.link
387 title = lr.title
388 i++
389
390 // shortcut reference style link or footnote
391 default:
392 var id []byte
393
394 // craft the id
395 if textHasNl {
396 var b bytes.Buffer
397
398 for j := 1; j < txtE; j++ {
399 switch {
400 case data[j] != '\n':
401 b.WriteByte(data[j])
402 case data[j-1] != ' ':
403 b.WriteByte(' ')
404 }
405 }
406
407 id = b.Bytes()
408 } else {
409 if t == linkDeferredFootnote {
410 id = data[2:txtE]
411 } else {
412 id = data[1:txtE]
413 }
414 }
415
416 // find the reference with matching id
417 key := string(bytes.ToLower(id))
418 lr, ok := p.refs[key]
419 if !ok {
420 return 0
421 }
422
423 // keep link and title from reference
424 link = lr.link
425 // if inline footnote, title == footnote contents
426 title = lr.title
427 noteId = lr.noteId
428
429 // rewind the whitespace
430 i = txtE + 1
431 }
432
433 // build content: img alt is escaped, link content is parsed
434 var content bytes.Buffer
435 if txtE > 1 {
436 if t == linkImg {
437 content.Write(data[1:txtE])
438 } else {
439 // links cannot contain other links, so turn off link parsing temporarily
440 insideLink := p.insideLink
441 p.insideLink = true
442 p.inline(&content, data[1:txtE])
443 p.insideLink = insideLink
444 }
445 }
446
447 var uLink []byte
448 if len(link) > 0 {
449 var uLinkBuf bytes.Buffer
450 unescapeText(&uLinkBuf, link)
451 uLink = uLinkBuf.Bytes()
452 }
453
454 // links need something to click on and somewhere to go
455 if len(uLink) == 0 || (t == linkNormal && content.Len() == 0) {
456 return 0
457 }
458
459 // call the relevant rendering function
460 switch t {
461 case linkNormal:
462 p.r.Link(out, uLink, title, content.Bytes())
463
464 case linkImg:
465 outSize := out.Len()
466 outBytes := out.Bytes()
467 if outSize > 0 && outBytes[outSize-1] == '!' {
468 out.Truncate(outSize - 1)
469 }
470
471 p.r.Image(out, uLink, title, content.Bytes())
472
473 case linkDeferredFootnote:
474 p.r.FootnoteRef(out, link, noteId)
475
476 default:
477 return 0
478 }
479
480 return i
481}
482
483// '<' when tags or autolinks are allowed
484func leftAngle(p *parser, out *bytes.Buffer, data []byte, offset int) int {
485 data = data[offset:]
486 altype := LINK_TYPE_NOT_AUTOLINK
487 end := tagLength(data, &altype)
488
489 if end > 2 {
490 if altype != LINK_TYPE_NOT_AUTOLINK {
491 var uLink bytes.Buffer
492 unescapeText(&uLink, data[1:end+1-2])
493 if uLink.Len() > 0 {
494 p.r.AutoLink(out, uLink.Bytes(), altype)
495 }
496 } else {
497 p.r.RawHtmlTag(out, data[:end])
498 }
499 }
500
501 return end
502}
503
504// '\\' backslash escape
505var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
506
507func escape(p *parser, out *bytes.Buffer, data []byte, offset int) int {
508 data = data[offset:]
509
510 if len(data) > 1 {
511 if bytes.IndexByte(escapeChars, data[1]) < 0 {
512 return 0
513 }
514
515 p.r.NormalText(out, data[1:2])
516 }
517
518 return 2
519}
520
// unescapeText copies src to ob with backslash escapes removed: each backslash
// is dropped and the byte following it is copied verbatim, whatever that byte
// is. A backslash that is the very last byte of src has nothing to escape and
// is copied as a literal backslash rather than being lost.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for len(src) > 0 {
		i := bytes.IndexByte(src, '\\')
		if i < 0 {
			// no more escapes: the rest is plain text
			ob.Write(src)
			return
		}

		ob.Write(src[:i])

		if i+1 == len(src) {
			ob.WriteByte('\\')
			return
		}

		ob.WriteByte(src[i+1])
		src = src[i+2:]
	}
}
541
542// '&' escaped when it doesn't belong to an entity
543// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
544func entity(p *parser, out *bytes.Buffer, data []byte, offset int) int {
545 data = data[offset:]
546
547 end := 1
548
549 if end < len(data) && data[end] == '#' {
550 end++
551 }
552
553 for end < len(data) && isalnum(data[end]) {
554 end++
555 }
556
557 if end < len(data) && data[end] == ';' {
558 end++ // real entity
559 } else {
560 return 0 // lone '&'
561 }
562
563 p.r.Entity(out, data[:end])
564
565 return end
566}
567
568func autoLink(p *parser, out *bytes.Buffer, data []byte, offset int) int {
569 // quick check to rule out most false hits on ':'
570 if p.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
571 return 0
572 }
573
574 // scan backward for a word boundary
575 rewind := 0
576 for offset-rewind > 0 && rewind <= 7 && !isspace(data[offset-rewind-1]) && !isspace(data[offset-rewind-1]) {
577 rewind++
578 }
579 if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
580 return 0
581 }
582
583 origData := data
584 data = data[offset-rewind:]
585
586 if !isSafeLink(data) {
587 return 0
588 }
589
590 linkEnd := 0
591 for linkEnd < len(data) && !isspace(data[linkEnd]) {
592 linkEnd++
593 }
594
595 // Skip punctuation at the end of the link
596 if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
597 linkEnd--
598 }
599
600 // See if the link finishes with a punctuation sign that can be closed.
601 var copen byte
602 switch data[linkEnd-1] {
603 case '"':
604 copen = '"'
605 case '\'':
606 copen = '\''
607 case ')':
608 copen = '('
609 case ']':
610 copen = '['
611 case '}':
612 copen = '{'
613 default:
614 copen = 0
615 }
616
617 if copen != 0 {
618 bufEnd := offset - rewind + linkEnd - 2
619
620 openDelim := 1
621
622 /* Try to close the final punctuation sign in this same line;
623 * if we managed to close it outside of the URL, that means that it's
624 * not part of the URL. If it closes inside the URL, that means it
625 * is part of the URL.
626 *
627 * Examples:
628 *
629 * foo http://www.pokemon.com/Pikachu_(Electric) bar
630 * => http://www.pokemon.com/Pikachu_(Electric)
631 *
632 * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
633 * => http://www.pokemon.com/Pikachu_(Electric)
634 *
635 * foo http://www.pokemon.com/Pikachu_(Electric)) bar
636 * => http://www.pokemon.com/Pikachu_(Electric))
637 *
638 * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
639 * => foo http://www.pokemon.com/Pikachu_(Electric)
640 */
641
642 for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
643 if origData[bufEnd] == data[linkEnd-1] {
644 openDelim++
645 }
646
647 if origData[bufEnd] == copen {
648 openDelim--
649 }
650
651 bufEnd--
652 }
653
654 if openDelim == 0 {
655 linkEnd--
656 }
657 }
658
659 // we were triggered on the ':', so we need to rewind the output a bit
660 if out.Len() >= rewind {
661 out.Truncate(len(out.Bytes()) - rewind)
662 }
663
664 var uLink bytes.Buffer
665 unescapeText(&uLink, data[:linkEnd])
666
667 if uLink.Len() > 0 {
668 p.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)
669 }
670
671 return linkEnd - rewind
672}
673
674var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
675
676func isSafeLink(link []byte) bool {
677 for _, prefix := range validUris {
678 // TODO: handle unicode here
679 // case-insensitive prefix test
680 if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
681 return true
682 }
683 }
684
685 return false
686}
687
688// return the length of the given tag, or 0 is it's not valid
689func tagLength(data []byte, autolink *int) int {
690 var i, j int
691
692 // a valid tag can't be shorter than 3 chars
693 if len(data) < 3 {
694 return 0
695 }
696
697 // begins with a '<' optionally followed by '/', followed by letter or number
698 if data[0] != '<' {
699 return 0
700 }
701 if data[1] == '/' {
702 i = 2
703 } else {
704 i = 1
705 }
706
707 if !isalnum(data[i]) {
708 return 0
709 }
710
711 // scheme test
712 *autolink = LINK_TYPE_NOT_AUTOLINK
713
714 // try to find the beginning of an URI
715 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
716 i++
717 }
718
719 if i > 1 && i < len(data) && data[i] == '@' {
720 if j = isMailtoAutoLink(data[i:]); j != 0 {
721 *autolink = LINK_TYPE_EMAIL
722 return i + j
723 }
724 }
725
726 if i > 2 && i < len(data) && data[i] == ':' {
727 *autolink = LINK_TYPE_NORMAL
728 i++
729 }
730
731 // complete autolink test: no whitespace or ' or "
732 switch {
733 case i >= len(data):
734 *autolink = LINK_TYPE_NOT_AUTOLINK
735 case *autolink != 0:
736 j = i
737
738 for i < len(data) {
739 if data[i] == '\\' {
740 i += 2
741 } else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
742 break
743 } else {
744 i++
745 }
746
747 }
748
749 if i >= len(data) {
750 return 0
751 }
752 if i > j && data[i] == '>' {
753 return i + 1
754 }
755
756 // one of the forbidden chars has been found
757 *autolink = LINK_TYPE_NOT_AUTOLINK
758 }
759
760 // look for something looking like a tag end
761 for i < len(data) && data[i] != '>' {
762 i++
763 }
764 if i >= len(data) {
765 return 0
766 }
767 return i + 1
768}
769
770// look for the address part of a mail autolink and '>'
771// this is less strict than the original markdown e-mail address matching
772func isMailtoAutoLink(data []byte) int {
773 nb := 0
774
775 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
776 for i := 0; i < len(data); i++ {
777 if isalnum(data[i]) {
778 continue
779 }
780
781 switch data[i] {
782 case '@':
783 nb++
784
785 case '-', '.', '_':
786 break
787
788 case '>':
789 if nb == 1 {
790 return i + 1
791 } else {
792 return 0
793 }
794 default:
795 return 0
796 }
797 }
798
799 return 0
800}
801
// isBackslashEscaped reports whether data[i] is preceded by an odd number of
// consecutive backslashes, i.e. whether it is escaped by the last of them.
func isBackslashEscaped(data []byte, i int) bool {
	n := 0
	for j := i - 1; j >= 0 && data[j] == '\\'; j-- {
		n++
	}
	return n%2 == 1
}

// helperFindEmphChar looks for the next emphasis char c in data, skipping
// other constructs, and returns its index. The search starts at index 1; a
// return of 0 means no candidate was found.
//
// Backslash-escaped markers, backticks and brackets are treated as literal
// text. Code spans and links (text, plus a following (...) or [...] target)
// are skipped as a whole, so a marker inside them is not reported. The
// exception is a construct that is never terminated: the index of the first
// marker seen inside it is returned instead, or 0 if there was none.
func helperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}

		// do not count escaped chars; this has to come before the marker
		// test so that an escaped marker does not close the emphasis
		if isBackslashEscaped(data, i) {
			i++
			continue
		}

		if data[i] == c {
			return i
		}

		if data[i] == '`' {
			// skip a code span
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
			continue
		}

		// data[i] == '[': skip a link
		tmpI := 0
		i++
		for i < len(data) && data[i] != ']' {
			if tmpI == 0 && data[i] == c {
				tmpI = i
			}
			i++
		}
		i++
		for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
			i++
		}
		if i >= len(data) {
			return tmpI
		}

		// the target runs up to the closer matching its opening char
		var closer byte
		switch data[i] {
		case '[':
			closer = ']'
		case '(':
			closer = ')'
		default: // not a link
			if tmpI > 0 {
				return tmpI
			}
			continue
		}

		i++
		for i < len(data) && data[i] != closer {
			if tmpI == 0 && data[i] == c {
				tmpI = i
			}
			i++
		}
		if i >= len(data) {
			return tmpI
		}
		i++
	}

	return 0
}
877
878func helperEmphasis(p *parser, out *bytes.Buffer, data []byte, c byte) int {
879 i := 0
880
881 // skip one symbol if coming from emph3
882 if len(data) > 1 && data[0] == c && data[1] == c {
883 i = 1
884 }
885
886 for i < len(data) {
887 length := helperFindEmphChar(data[i:], c)
888 if length == 0 {
889 return 0
890 }
891 i += length
892 if i >= len(data) {
893 return 0
894 }
895
896 if i+1 < len(data) && data[i+1] == c {
897 i++
898 continue
899 }
900
901 if data[i] == c && !isspace(data[i-1]) {
902
903 if p.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
904 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
905 continue
906 }
907 }
908
909 var work bytes.Buffer
910 p.inline(&work, data[:i])
911 p.r.Emphasis(out, work.Bytes())
912 return i + 1
913 }
914 }
915
916 return 0
917}
918
919func helperDoubleEmphasis(p *parser, out *bytes.Buffer, data []byte, c byte) int {
920 i := 0
921
922 for i < len(data) {
923 length := helperFindEmphChar(data[i:], c)
924 if length == 0 {
925 return 0
926 }
927 i += length
928
929 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
930 var work bytes.Buffer
931 p.inline(&work, data[:i])
932
933 if work.Len() > 0 {
934 // pick the right renderer
935 if c == '~' {
936 p.r.StrikeThrough(out, work.Bytes())
937 } else {
938 p.r.DoubleEmphasis(out, work.Bytes())
939 }
940 }
941 return i + 2
942 }
943 i++
944 }
945 return 0
946}
947
948func helperTripleEmphasis(p *parser, out *bytes.Buffer, data []byte, offset int, c byte) int {
949 i := 0
950 origData := data
951 data = data[offset:]
952
953 for i < len(data) {
954 length := helperFindEmphChar(data[i:], c)
955 if length == 0 {
956 return 0
957 }
958 i += length
959
960 // skip whitespace preceded symbols
961 if data[i] != c || isspace(data[i-1]) {
962 continue
963 }
964
965 switch {
966 case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
967 // triple symbol found
968 var work bytes.Buffer
969
970 p.inline(&work, data[:i])
971 if work.Len() > 0 {
972 p.r.TripleEmphasis(out, work.Bytes())
973 }
974 return i + 3
975 case (i+1 < len(data) && data[i+1] == c):
976 // double symbol found, hand over to emph1
977 length = helperEmphasis(p, out, origData[offset-2:], c)
978 if length == 0 {
979 return 0
980 } else {
981 return length - 2
982 }
983 default:
984 // single symbol found, hand over to emph2
985 length = helperDoubleEmphasis(p, out, origData[offset-1:], c)
986 if length == 0 {
987 return 0
988 } else {
989 return length - 1
990 }
991 }
992 }
993 return 0
994}