inline.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11// Functions to parse inline elements.
12//
13
14package blackfriday
15
16import (
17 "bytes"
18 "regexp"
19 "strconv"
20)
21
// urlRe is the regexp source for an absolute http(s)/ftp URL or a
// root-relative path. It is spliced twice into anchorRe below.
var urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`

// anchorRe matches, at the start of its input, a complete
// <a href="URL" [title="..."]>URL</a> element.
var anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)

// htmlEntityRe matches a lower-case named entity of two to five letters.
// TODO: improve this regexp to catch all possible entities:
var htmlEntityRe = regexp.MustCompile(`&[a-z]{2,5};`)
29
30// Functions to parse text within a block
31// Each function returns the number of chars taken care of
32// data is the complete block being rendered
33// offset is the number of valid chars before the current cursor
34
35func (p *parser) inline(data []byte) {
36 // this is called recursively: enforce a maximum depth
37 if p.nesting >= p.maxNesting {
38 return
39 }
40 p.nesting++
41
42 i, end := 0, 0
43 for i < len(data) {
44 // Stop at EOL
45 if data[i] == '\n' && i+1 == len(data) {
46 break
47 }
48 // Copy inactive chars into the output, but first check for one quirk:
49 // 'h', 'm' and 'f' all might trigger a check for autolink processing
50 // and end this run of inactive characters. However, there's one nasty
51 // case where breaking this run would be bad: in smartypants fraction
52 // detection, we expect things like "1/2th" to be in a single run. So
53 // we check here if an 'h' is followed by 't' (from 'http') and if it's
54 // not, we short circuit the 'h' into the run of inactive characters.
55 //
56 // Also, in a similar fashion maybeLineBreak breaks this run of chars,
57 // but smartDash processor relies on seeing context around the dashes.
58 // Fix this somehow.
59 for end < len(data) {
60 if data[end] == ' ' {
61 consumed, br := maybeLineBreak(p, data, end)
62 if consumed > 0 {
63 p.currBlock.appendChild(text(data[i:end]))
64 if br {
65 p.currBlock.appendChild(NewNode(Hardbreak))
66 }
67 i = end
68 i += consumed
69 end = i
70 } else {
71 end++
72 }
73 continue
74 }
75 if p.inlineCallback[data[end]] != nil {
76 if end+1 < len(data) && data[end] == 'h' && data[end+1] != 't' {
77 end++
78 } else {
79 break
80 }
81 } else {
82 end++
83 }
84 }
85
86 p.currBlock.appendChild(text(data[i:end]))
87
88 if end >= len(data) {
89 break
90 }
91 i = end
92
93 // call the trigger
94 handler := p.inlineCallback[data[end]]
95 if consumed := handler(p, data, i); consumed == 0 {
96 // no action from the callback; buffer the byte for later
97 end = i + 1
98 } else {
99 // skip past whatever the callback used
100 i += consumed
101 end = i
102 }
103 }
104
105 p.nesting--
106}
107
108// single and double emphasis parsing
109func emphasis(p *parser, data []byte, offset int) int {
110 data = data[offset:]
111 c := data[0]
112 ret := 0
113
114 if len(data) > 2 && data[1] != c {
115 // whitespace cannot follow an opening emphasis;
116 // strikethrough only takes two characters '~~'
117 if c == '~' || isspace(data[1]) {
118 return 0
119 }
120 if ret = helperEmphasis(p, data[1:], c); ret == 0 {
121 return 0
122 }
123
124 return ret + 1
125 }
126
127 if len(data) > 3 && data[1] == c && data[2] != c {
128 if isspace(data[2]) {
129 return 0
130 }
131 if ret = helperDoubleEmphasis(p, data[2:], c); ret == 0 {
132 return 0
133 }
134
135 return ret + 2
136 }
137
138 if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
139 if c == '~' || isspace(data[3]) {
140 return 0
141 }
142 if ret = helperTripleEmphasis(p, data, 3, c); ret == 0 {
143 return 0
144 }
145
146 return ret + 3
147 }
148
149 return 0
150}
151
152func codeSpan(p *parser, data []byte, offset int) int {
153 data = data[offset:]
154
155 nb := 0
156
157 // count the number of backticks in the delimiter
158 for nb < len(data) && data[nb] == '`' {
159 nb++
160 }
161
162 // find the next delimiter
163 i, end := 0, 0
164 for end = nb; end < len(data) && i < nb; end++ {
165 if data[end] == '`' {
166 i++
167 } else {
168 i = 0
169 }
170 }
171
172 // no matching delimiter?
173 if i < nb && end >= len(data) {
174 return 0
175 }
176
177 // trim outside whitespace
178 fBegin := nb
179 for fBegin < end && data[fBegin] == ' ' {
180 fBegin++
181 }
182
183 fEnd := end - nb
184 for fEnd > fBegin && data[fEnd-1] == ' ' {
185 fEnd--
186 }
187
188 // render the code span
189 if fBegin != fEnd {
190 code := NewNode(Code)
191 code.Literal = data[fBegin:fEnd]
192 p.currBlock.appendChild(code)
193 }
194
195 return end
196
197}
198
199// newline preceded by two spaces becomes <br>
200func maybeLineBreak(p *parser, data []byte, offset int) (int, bool) {
201 origOffset := offset
202 for offset < len(data) && data[offset] == ' ' {
203 offset++
204 }
205 if offset < len(data) && data[offset] == '\n' {
206 if offset-origOffset >= 2 {
207 return offset - origOffset + 1, true
208 }
209 return offset - origOffset, false
210 }
211 return 0, false
212}
213
214// newline without two spaces works when HardLineBreak is enabled
215func lineBreak(p *parser, data []byte, offset int) int {
216 if p.flags&HardLineBreak != 0 {
217 p.currBlock.appendChild(NewNode(Hardbreak))
218 return 1
219 }
220 return 0
221}
222
// linkType identifies which bracket construct link() is parsing.
type linkType int

const (
	// linkNormal is a regular link: [text].
	linkNormal linkType = iota
	// linkImg is an image: ![alt].
	linkImg
	// linkDeferredFootnote is a footnote reference: [^refId].
	linkDeferredFootnote
	// linkInlineFootnote is an inline footnote: ^[text].
	linkInlineFootnote
)
231
232func isReferenceStyleLink(data []byte, pos int, t linkType) bool {
233 if t == linkDeferredFootnote {
234 return false
235 }
236 return pos < len(data)-1 && data[pos] == '[' && data[pos+1] != '^'
237}
238
239func maybeImage(p *parser, data []byte, offset int) int {
240 if offset < len(data)-1 && data[offset+1] == '[' {
241 return link(p, data, offset)
242 }
243 return 0
244}
245
246func maybeInlineFootnote(p *parser, data []byte, offset int) int {
247 if offset < len(data)-1 && data[offset+1] == '[' {
248 return link(p, data, offset)
249 }
250 return 0
251}
252
// '[': parse a link or an image or a footnote
//
// link is called with data[offset] on the '[' itself, or on the '!' / '^'
// that precedes it (see maybeImage and maybeInlineFootnote). It recognizes
// inline links "[text](url "title")", reference links "[text][id]",
// shortcut references "[id]", images and both footnote forms. On success it
// appends the resulting node to p.currBlock and returns the number of bytes
// consumed starting at offset; it returns 0 when the text is not a link.
func link(p *parser, data []byte, offset int) int {
	// no links allowed inside regular links, footnote, and deferred footnotes
	if p.insideLink && (offset > 0 && data[offset-1] == '[' || len(data)-1 > offset && data[offset+1] == '^') {
		return 0
	}

	// classify the construct from its first one or two bytes
	var t linkType
	switch {
	// special case: ![^text] == deferred footnote (that follows something with
	// an exclamation point)
	case p.flags&Footnotes != 0 && len(data)-1 > offset && data[offset+1] == '^':
		t = linkDeferredFootnote
	// ![alt] == image
	case offset >= 0 && data[offset] == '!':
		t = linkImg
		offset++
	// ^[text] == inline footnote
	// [^refId] == deferred footnote
	case p.flags&Footnotes != 0:
		if offset >= 0 && data[offset] == '^' {
			t = linkInlineFootnote
			offset++
		} else if len(data)-1 > offset && data[offset+1] == '^' {
			t = linkDeferredFootnote
		}
	// [text] == regular link
	default:
		t = linkNormal
	}

	// from here on data starts at the opening '['
	data = data[offset:]

	var (
		i                       = 1 // index just past the opening '['
		noteID                  int
		title, link, altContent []byte
		textHasNl               = false
	)

	// skip the '^' of a deferred footnote
	if t == linkDeferredFootnote {
		i++
	}

	// look for the matching closing bracket
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			textHasNl = true

		case data[i-1] == '\\':
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	// ran off the end without balancing the brackets: not a link
	if i >= len(data) {
		return 0
	}

	// txtE is the index of the closing ']' of the link text
	txtE := i
	i++

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		linkB := i

		// look for link end: ' " )
	findlinkend:
		for i < len(data) {
			switch {
			case data[i] == '\\':
				i += 2

			case data[i] == ')' || data[i] == '\'' || data[i] == '"':
				break findlinkend

			default:
				i++
			}
		}

		if i >= len(data) {
			return 0
		}
		linkE := i

		// look for title end if present
		titleB, titleE := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			titleB = i

		findtitleend:
			for i < len(data) {
				switch {
				case data[i] == '\\':
					i += 2

				case data[i] == ')':
					break findtitleend

				default:
					i++
				}
			}

			if i >= len(data) {
				return 0
			}

			// skip whitespace after title
			titleE = i - 1
			for titleE > titleB && isspace(data[titleE]) {
				titleE--
			}

			// check for closing quote presence; without one, what looked
			// like a title is treated as part of the link destination
			if data[titleE] != '\'' && data[titleE] != '"' {
				titleB, titleE = 0, 0
				linkE = i
			}
		}

		// remove whitespace at the end of the link
		for linkE > linkB && isspace(data[linkE-1]) {
			linkE--
		}

		// remove optional angle brackets around the link
		if data[linkB] == '<' {
			linkB++
		}
		if data[linkE-1] == '>' {
			linkE--
		}

		// build escaped link and title
		if linkE > linkB {
			link = data[linkB:linkE]
		}

		if titleE > titleB {
			title = data[titleB:titleE]
		}

		// step past the closing ')'
		i++

	// reference style link
	case isReferenceStyleLink(data, i, t):
		var id []byte
		altContentConsidered := false

		// look for the id
		i++
		linkB := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		linkE := i

		// find the reference
		if linkB == linkE {
			// empty "[]": the link text itself is the id
			if textHasNl {
				var b bytes.Buffer

				// copy the text, turning each newline into a single space
				// unless it already follows one
				for j := 1; j < txtE; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txtE]
				altContentConsidered = true
			}
		} else {
			id = data[linkB:linkE]
		}

		// find the reference with matching id
		lr, ok := p.getRef(string(id))
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		if altContentConsidered {
			altContent = lr.text
		}
		// step past the closing ']' of the id
		i++

	// shortcut reference style link or reference or inline footnote
	default:
		var id []byte

		// craft the id
		if textHasNl {
			var b bytes.Buffer

			// copy the text, turning each newline into a single space
			// unless it already follows one
			for j := 1; j < txtE; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			if t == linkDeferredFootnote {
				id = data[2:txtE] // get rid of the ^
			} else {
				id = data[1:txtE]
			}
		}

		if t == linkInlineFootnote {
			// create a new reference
			noteID = len(p.notes) + 1

			// the fragment is the slugified id, capped at 16 bytes, or
			// "footnote-<n>" when the id is empty
			var fragment []byte
			if len(id) > 0 {
				if len(id) < 16 {
					fragment = make([]byte, len(id))
				} else {
					fragment = make([]byte, 16)
				}
				copy(fragment, slugify(id))
			} else {
				fragment = append([]byte("footnote-"), []byte(strconv.Itoa(noteID))...)
			}

			ref := &reference{
				noteID:   noteID,
				hasBlock: false,
				link:     fragment,
				title:    id,
			}

			p.notes = append(p.notes, ref)

			link = ref.link
			title = ref.title
		} else {
			// find the reference with matching id
			lr, ok := p.getRef(string(id))
			if !ok {
				return 0
			}

			if t == linkDeferredFootnote {
				lr.noteID = len(p.notes) + 1
				p.notes = append(p.notes, lr)
			}

			// keep link and title from reference
			link = lr.link
			// if inline footnote, title == footnote contents
			title = lr.title
			noteID = lr.noteID
		}

		// rewind the whitespace
		i = txtE + 1
	}

	var uLink []byte
	if t == linkNormal || t == linkImg {
		if len(link) > 0 {
			var uLinkBuf bytes.Buffer
			unescapeText(&uLinkBuf, link)
			uLink = uLinkBuf.Bytes()
		}

		// links need something to click on and somewhere to go
		if len(uLink) == 0 || (t == linkNormal && txtE <= 1) {
			return 0
		}
	}

	// call the relevant rendering function
	switch t {
	case linkNormal:
		linkNode := NewNode(Link)
		linkNode.Destination = normalizeURI(uLink)
		linkNode.Title = title
		p.currBlock.appendChild(linkNode)
		if len(altContent) > 0 {
			linkNode.appendChild(text(altContent))
		} else {
			// links cannot contain other links, so turn off link parsing
			// temporarily and recurse
			insideLink := p.insideLink
			p.insideLink = true
			tmpNode := p.currBlock
			p.currBlock = linkNode
			p.inline(data[1:txtE])
			p.currBlock = tmpNode
			p.insideLink = insideLink
		}

	case linkImg:
		linkNode := NewNode(Image)
		linkNode.Destination = uLink
		linkNode.Title = title
		p.currBlock.appendChild(linkNode)
		linkNode.appendChild(text(data[1:txtE]))
		// account for the leading '!' skipped via offset++
		i++

	case linkInlineFootnote, linkDeferredFootnote:
		linkNode := NewNode(Link)
		linkNode.Destination = link
		linkNode.Title = title
		linkNode.NoteID = noteID
		p.currBlock.appendChild(linkNode)
		if t == linkInlineFootnote {
			// account for the leading '^' skipped via offset++
			i++
		}

	default:
		return 0
	}

	return i
}
611
612func (p *parser) inlineHTMLComment(data []byte) int {
613 if len(data) < 5 {
614 return 0
615 }
616 if data[0] != '<' || data[1] != '!' || data[2] != '-' || data[3] != '-' {
617 return 0
618 }
619 i := 5
620 // scan for an end-of-comment marker, across lines if necessary
621 for i < len(data) && !(data[i-2] == '-' && data[i-1] == '-' && data[i] == '>') {
622 i++
623 }
624 // no end-of-comment marker
625 if i >= len(data) {
626 return 0
627 }
628 return i + 1
629}
630
// stripMailto returns link without a leading "mailto://" or "mailto:"
// scheme. At most one prefix is removed; other links are returned unchanged.
func stripMailto(link []byte) []byte {
	for _, scheme := range [...]string{"mailto://", "mailto:"} {
		if bytes.HasPrefix(link, []byte(scheme)) {
			return link[len(scheme):]
		}
	}
	return link
}
640
641// '<' when tags or autolinks are allowed
642func leftAngle(p *parser, data []byte, offset int) int {
643 data = data[offset:]
644 altype := LinkTypeNotAutolink
645 end := tagLength(data, &altype)
646 if size := p.inlineHTMLComment(data); size > 0 {
647 end = size
648 }
649 if end > 2 {
650 if altype != LinkTypeNotAutolink {
651 var uLink bytes.Buffer
652 unescapeText(&uLink, data[1:end+1-2])
653 if uLink.Len() > 0 {
654 link := uLink.Bytes()
655 node := NewNode(Link)
656 node.Destination = link
657 if altype == LinkTypeEmail {
658 node.Destination = append([]byte("mailto:"), link...)
659 }
660 p.currBlock.appendChild(node)
661 node.appendChild(text(stripMailto(link)))
662 }
663 } else {
664 htmlTag := NewNode(HTMLSpan)
665 htmlTag.Literal = data[:end]
666 p.currBlock.appendChild(htmlTag)
667 }
668 }
669
670 return end
671}
672
673// '\\' backslash escape
674var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>~")
675
676func escape(p *parser, data []byte, offset int) int {
677 data = data[offset:]
678
679 if len(data) > 1 {
680 if p.flags&BackslashLineBreak != 0 && data[1] == '\n' {
681 p.currBlock.appendChild(NewNode(Hardbreak))
682 return 2
683 }
684 if bytes.IndexByte(escapeChars, data[1]) < 0 {
685 return 0
686 }
687
688 p.currBlock.appendChild(text(data[1:2]))
689 }
690
691 return 2
692}
693
// unescapeText copies src to ob with backslash escapes resolved: each
// backslash is dropped and the byte after it is copied verbatim. A trailing
// backslash with nothing after it is dropped.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for len(src) > 0 {
		bs := bytes.IndexByte(src, '\\')
		if bs < 0 {
			// no more escapes: the rest is literal
			ob.Write(src)
			return
		}
		if bs > 0 {
			ob.Write(src[:bs])
		}
		if bs+1 >= len(src) {
			return
		}
		ob.WriteByte(src[bs+1])
		src = src[bs+2:]
	}
}
714
715// '&' escaped when it doesn't belong to an entity
716// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
717func entity(p *parser, data []byte, offset int) int {
718 data = data[offset:]
719
720 end := 1
721
722 if end < len(data) && data[end] == '#' {
723 end++
724 }
725
726 for end < len(data) && isalnum(data[end]) {
727 end++
728 }
729
730 if end < len(data) && data[end] == ';' {
731 end++ // real entity
732 } else {
733 return 0 // lone '&'
734 }
735
736 ent := data[:end]
737 // undo & escaping or it will be converted to &amp; by another
738 // escaper in the renderer
739 if bytes.Equal(ent, []byte("&")) {
740 ent = []byte{'&'}
741 }
742 p.currBlock.appendChild(text(ent))
743
744 return end
745}
746
747func linkEndsWithEntity(data []byte, linkEnd int) bool {
748 entityRanges := htmlEntityRe.FindAllIndex(data[:linkEnd], -1)
749 return entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd
750}
751
752func maybeAutoLink(p *parser, data []byte, offset int) int {
753 // quick check to rule out most false hits
754 if p.insideLink || len(data) < offset+6 { // 6 is the len() of the shortest prefix below
755 return 0
756 }
757 prefixes := []string{
758 "http://",
759 "https://",
760 "ftp://",
761 "file://",
762 "mailto:",
763 }
764 for _, prefix := range prefixes {
765 endOfHead := offset + 8 // 8 is the len() of the longest prefix
766 if endOfHead > len(data) {
767 endOfHead = len(data)
768 }
769 head := bytes.ToLower(data[offset:endOfHead])
770 if bytes.HasPrefix(head, []byte(prefix)) {
771 return autoLink(p, data, offset)
772 }
773 }
774 return 0
775}
776
// autoLink turns the bare URL starting at data[offset] into a Link node
// appended to p.currBlock. If the URL sits inside a literal
// <a href="...">...</a> element, the rest of that element is emitted as an
// HTMLSpan instead. It returns the number of bytes consumed, or 0 when the
// text is not an acceptable link.
func autoLink(p *parser, data []byte, offset int) int {
	// Now a more expensive check to see if we're not inside an anchor element
	// (walk back to the nearest '<', or to the start of data)
	anchorStart := offset
	offsetFromAnchor := 0
	for anchorStart > 0 && data[anchorStart] != '<' {
		anchorStart--
		offsetFromAnchor++
	}

	anchorStr := anchorRe.Find(data[anchorStart:])
	if anchorStr != nil {
		// emit the remainder of the anchor element, from offset on, verbatim
		anchorClose := NewNode(HTMLSpan)
		anchorClose.Literal = anchorStr[offsetFromAnchor:]
		p.currBlock.appendChild(anchorClose)
		return len(anchorStr) - offsetFromAnchor
	}

	// scan backward for a word boundary
	rewind := 0
	for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
		rewind++
	}
	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
		return 0
	}

	// data now starts at the beginning of the word containing offset;
	// origData keeps the full buffer for the delimiter scan below
	origData := data
	data = data[offset-rewind:]

	if !isSafeLink(data) {
		return 0
	}

	// the link runs up to the first whitespace or '<'
	linkEnd := 0
	for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
		linkEnd++
	}

	// Skip punctuation at the end of the link
	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
		linkEnd--
	}

	// But don't skip semicolon if it's a part of escaped entity:
	if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
		linkEnd--
	}

	// See if the link finishes with a punctuation sign that can be closed.
	var copen byte
	switch data[linkEnd-1] {
	case '"':
		copen = '"'
	case '\'':
		copen = '\''
	case ')':
		copen = '('
	case ']':
		copen = '['
	case '}':
		copen = '{'
	default:
		copen = 0
	}

	if copen != 0 {
		// bufEnd indexes origData at the byte just before the trailing
		// delimiter; the scan below walks backwards from there
		bufEnd := offset - rewind + linkEnd - 2

		openDelim := 1

		/* Try to close the final punctuation sign in this same line;
		 * if we managed to close it outside of the URL, that means that it's
		 * not part of the URL. If it closes inside the URL, that means it
		 * is part of the URL.
		 *
		 * Examples:
		 *
		 * foo http://www.pokemon.com/Pikachu_(Electric) bar
		 * => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 * foo (http://www.pokemon.com/Pikachu_(Electric)) bar
		 * => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 * foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 * => http://www.pokemon.com/Pikachu_(Electric))
		 *
		 * (foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 * => foo http://www.pokemon.com/Pikachu_(Electric)
		 */

		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
			if origData[bufEnd] == data[linkEnd-1] {
				openDelim++
			}

			if origData[bufEnd] == copen {
				openDelim--
			}

			bufEnd--
		}

		// the delimiter was balanced: drop it from the link
		if openDelim == 0 {
			linkEnd--
		}
	}

	var uLink bytes.Buffer
	unescapeText(&uLink, data[:linkEnd])

	if uLink.Len() > 0 {
		// the unescaped URL is both the destination and the visible text
		node := NewNode(Link)
		node.Destination = uLink.Bytes()
		p.currBlock.appendChild(node)
		node.appendChild(text(uLink.Bytes()))
	}

	return linkEnd
}
896
897func isEndOfLink(char byte) bool {
898 return isspace(char) || char == '<'
899}
900
901var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
902var validPaths = [][]byte{[]byte("/"), []byte("./"), []byte("../")}
903
904func isSafeLink(link []byte) bool {
905 for _, path := range validPaths {
906 if len(link) >= len(path) && bytes.Equal(link[:len(path)], path) {
907 if len(link) == len(path) {
908 return true
909 } else if isalnum(link[len(path)]) {
910 return true
911 }
912 }
913 }
914
915 for _, prefix := range validUris {
916 // TODO: handle unicode here
917 // case-insensitive prefix test
918 if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
919 return true
920 }
921 }
922
923 return false
924}
925
926// return the length of the given tag, or 0 is it's not valid
927func tagLength(data []byte, autolink *LinkType) int {
928 var i, j int
929
930 // a valid tag can't be shorter than 3 chars
931 if len(data) < 3 {
932 return 0
933 }
934
935 // begins with a '<' optionally followed by '/', followed by letter or number
936 if data[0] != '<' {
937 return 0
938 }
939 if data[1] == '/' {
940 i = 2
941 } else {
942 i = 1
943 }
944
945 if !isalnum(data[i]) {
946 return 0
947 }
948
949 // scheme test
950 *autolink = LinkTypeNotAutolink
951
952 // try to find the beginning of an URI
953 for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
954 i++
955 }
956
957 if i > 1 && i < len(data) && data[i] == '@' {
958 if j = isMailtoAutoLink(data[i:]); j != 0 {
959 *autolink = LinkTypeEmail
960 return i + j
961 }
962 }
963
964 if i > 2 && i < len(data) && data[i] == ':' {
965 *autolink = LinkTypeNormal
966 i++
967 }
968
969 // complete autolink test: no whitespace or ' or "
970 switch {
971 case i >= len(data):
972 *autolink = LinkTypeNotAutolink
973 case *autolink != 0:
974 j = i
975
976 for i < len(data) {
977 if data[i] == '\\' {
978 i += 2
979 } else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
980 break
981 } else {
982 i++
983 }
984
985 }
986
987 if i >= len(data) {
988 return 0
989 }
990 if i > j && data[i] == '>' {
991 return i + 1
992 }
993
994 // one of the forbidden chars has been found
995 *autolink = LinkTypeNotAutolink
996 }
997
998 // look for something looking like a tag end
999 for i < len(data) && data[i] != '>' {
1000 i++
1001 }
1002 if i >= len(data) {
1003 return 0
1004 }
1005 return i + 1
1006}
1007
1008// look for the address part of a mail autolink and '>'
1009// this is less strict than the original markdown e-mail address matching
1010func isMailtoAutoLink(data []byte) int {
1011 nb := 0
1012
1013 // address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
1014 for i := 0; i < len(data); i++ {
1015 if isalnum(data[i]) {
1016 continue
1017 }
1018
1019 switch data[i] {
1020 case '@':
1021 nb++
1022
1023 case '-', '.', '_':
1024 break
1025
1026 case '>':
1027 if nb == 1 {
1028 return i + 1
1029 }
1030 return 0
1031 default:
1032 return 0
1033 }
1034 }
1035
1036 return 0
1037}
1038
// helperFindEmphChar looks for the next unescaped occurrence of the
// emphasis char c in data, skipping over code spans and links. It returns
// the index of that occurrence, or 0 if none is found (callers treat 0 as
// "not found").
//
// If a code span or link is left unterminated, the first c seen inside it
// (if any) is returned instead.
func helperFindEmphChar(data []byte, c byte) int {
	i := 0

	for i < len(data) {
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		// do not count escaped chars
		if i != 0 && data[i-1] == '\\' {
			i++
			continue
		}
		if data[i] == c {
			return i
		}

		if data[i] == '`' {
			// skip a code span
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		} else if data[i] == '[' {
			// skip a link
			tmpI := 0
			i++
			for i < len(data) && data[i] != ']' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			i++
			for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			if data[i] != '[' && data[i] != '(' { // not a link
				if tmpI > 0 {
					return tmpI
				}
				continue
			}

			// The link target is closed by the delimiter that matches its
			// opener. Scanning for the opener itself would run past the end
			// of the target and into unrelated text.
			closer := byte(')')
			if data[i] == '[' {
				closer = ']'
			}
			i++
			for i < len(data) && data[i] != closer {
				if tmpI == 0 && data[i] == c {
					return i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		}
	}
	return 0
}
1112
1113func helperEmphasis(p *parser, data []byte, c byte) int {
1114 i := 0
1115
1116 // skip one symbol if coming from emph3
1117 if len(data) > 1 && data[0] == c && data[1] == c {
1118 i = 1
1119 }
1120
1121 for i < len(data) {
1122 length := helperFindEmphChar(data[i:], c)
1123 if length == 0 {
1124 return 0
1125 }
1126 i += length
1127 if i >= len(data) {
1128 return 0
1129 }
1130
1131 if i+1 < len(data) && data[i+1] == c {
1132 i++
1133 continue
1134 }
1135
1136 if data[i] == c && !isspace(data[i-1]) {
1137
1138 if p.flags&NoIntraEmphasis != 0 {
1139 if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
1140 continue
1141 }
1142 }
1143
1144 emph := NewNode(Emph)
1145 p.currBlock.appendChild(emph)
1146 tmp := p.currBlock
1147 p.currBlock = emph
1148 p.inline(data[:i])
1149 p.currBlock = tmp
1150 return i + 1
1151 }
1152 }
1153
1154 return 0
1155}
1156
1157func helperDoubleEmphasis(p *parser, data []byte, c byte) int {
1158 i := 0
1159
1160 for i < len(data) {
1161 length := helperFindEmphChar(data[i:], c)
1162 if length == 0 {
1163 return 0
1164 }
1165 i += length
1166
1167 if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
1168 nodeType := Strong
1169 if c == '~' {
1170 nodeType = Del
1171 }
1172 node := NewNode(nodeType)
1173 p.currBlock.appendChild(node)
1174 tmp := p.currBlock
1175 p.currBlock = node
1176 p.inline(data[:i])
1177 p.currBlock = tmp
1178 return i + 2
1179 }
1180 i++
1181 }
1182 return 0
1183}
1184
1185func helperTripleEmphasis(p *parser, data []byte, offset int, c byte) int {
1186 i := 0
1187 origData := data
1188 data = data[offset:]
1189
1190 for i < len(data) {
1191 length := helperFindEmphChar(data[i:], c)
1192 if length == 0 {
1193 return 0
1194 }
1195 i += length
1196
1197 // skip whitespace preceded symbols
1198 if data[i] != c || isspace(data[i-1]) {
1199 continue
1200 }
1201
1202 switch {
1203 case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
1204 // triple symbol found
1205 strong := NewNode(Strong)
1206 em := NewNode(Emph)
1207 strong.appendChild(em)
1208 p.currBlock.appendChild(strong)
1209 tmp := p.currBlock
1210 p.currBlock = em
1211 p.inline(data[:i])
1212 p.currBlock = tmp
1213 return i + 3
1214 case (i+1 < len(data) && data[i+1] == c):
1215 // double symbol found, hand over to emph1
1216 length = helperEmphasis(p, origData[offset-2:], c)
1217 if length == 0 {
1218 return 0
1219 }
1220 return length - 2
1221 default:
1222 // single symbol found, hand over to emph2
1223 length = helperDoubleEmphasis(p, origData[offset-1:], c)
1224 if length == 0 {
1225 return 0
1226 }
1227 return length - 1
1228 }
1229 }
1230 return 0
1231}
1232
1233func text(s []byte) *Node {
1234 node := NewNode(Text)
1235 node.Literal = s
1236 return node
1237}
1238
// normalizeURI is a placeholder that currently returns s unchanged.
func normalizeURI(s []byte) []byte {
	// TODO: implement
	return s
}