all repos — grayfriday @ 689f6cb79b7436f387720213cb6a9810649762b6

blackfriday fork with a few changes

inline.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Distributed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11// Functions to parse inline elements.
 12//
 13
 14package blackfriday
 15
 16import (
 17	"bytes"
 18)
 19
 20// Functions to parse text within a block
 21// Each function returns the number of chars taken care of
 22// data is the complete block being rendered
 23// offset is the number of valid chars before the current cursor
 24
 25func (parser *Parser) parseInline(out *bytes.Buffer, data []byte) {
 26	// this is called recursively: enforce a maximum depth
 27	if parser.nesting >= parser.maxNesting {
 28		return
 29	}
 30	parser.nesting++
 31
 32	i, end := 0, 0
 33	for i < len(data) {
 34		// copy inactive chars into the output
 35		for end < len(data) && parser.inline[data[end]] == nil {
 36			end++
 37		}
 38
 39		parser.r.NormalText(out, data[i:end])
 40
 41		if end >= len(data) {
 42			break
 43		}
 44		i = end
 45
 46		// call the trigger
 47		handler := parser.inline[data[end]]
 48		if consumed := handler(parser, out, data, i); consumed == 0 {
 49			// no action from the callback; buffer the byte for later
 50			end = i + 1
 51		} else {
 52			// skip past whatever the callback used
 53			i += consumed
 54			end = i
 55		}
 56	}
 57
 58	parser.nesting--
 59}
 60
 61// single and double emphasis parsing
 62func inlineEmphasis(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
 63	data = data[offset:]
 64	c := data[0]
 65	ret := 0
 66
 67	if len(data) > 2 && data[1] != c {
 68		// whitespace cannot follow an opening emphasis;
 69		// strikethrough only takes two characters '~~'
 70		if c == '~' || isspace(data[1]) {
 71			return 0
 72		}
 73		if ret = inlineHelperEmph1(parser, out, data[1:], c); ret == 0 {
 74			return 0
 75		}
 76
 77		return ret + 1
 78	}
 79
 80	if len(data) > 3 && data[1] == c && data[2] != c {
 81		if isspace(data[2]) {
 82			return 0
 83		}
 84		if ret = inlineHelperEmph2(parser, out, data[2:], c); ret == 0 {
 85			return 0
 86		}
 87
 88		return ret + 2
 89	}
 90
 91	if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
 92		if c == '~' || isspace(data[3]) {
 93			return 0
 94		}
 95		if ret = inlineHelperEmph3(parser, out, data, 3, c); ret == 0 {
 96			return 0
 97		}
 98
 99		return ret + 3
100	}
101
102	return 0
103}
104
105func inlineCodeSpan(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
106	data = data[offset:]
107
108	nb := 0
109
110	// count the number of backticks in the delimiter
111	for nb < len(data) && data[nb] == '`' {
112		nb++
113	}
114
115	// find the next delimiter
116	i, end := 0, 0
117	for end = nb; end < len(data) && i < nb; end++ {
118		if data[end] == '`' {
119			i++
120		} else {
121			i = 0
122		}
123	}
124
125	// no matching delimiter?
126	if i < nb && end >= len(data) {
127		return 0
128	}
129
130	// trim outside whitespace
131	fBegin := nb
132	for fBegin < end && data[fBegin] == ' ' {
133		fBegin++
134	}
135
136	fEnd := end - nb
137	for fEnd > fBegin && data[fEnd-1] == ' ' {
138		fEnd--
139	}
140
141	// render the code span
142	if fBegin != fEnd {
143		parser.r.CodeSpan(out, data[fBegin:fEnd])
144	}
145
146	return end
147
148}
149
150// newline preceded by two spaces becomes <br>
151// newline without two spaces works when EXTENSION_HARD_LINE_BREAK is enabled
152func inlineLineBreak(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
153	// remove trailing spaces from out
154	outBytes := out.Bytes()
155	end := len(outBytes)
156	eol := end
157	for eol > 0 && outBytes[eol-1] == ' ' {
158		eol--
159	}
160	out.Truncate(eol)
161
162	// should there be a hard line break here?
163	if parser.flags&EXTENSION_HARD_LINE_BREAK == 0 && end-eol < 2 {
164		return 0
165	}
166
167	parser.r.LineBreak(out)
168	return 1
169}
170
// '[': parse a link or an image
//
// inlineLink is triggered on the '[' at data[offset]. The construct is an
// image when the byte before the '[' is '!'. After the bracketed text three
// forms are recognized:
//
//   - inline:    [text](link "title")
//   - reference: [text][id]  (an empty id means the text itself is the id)
//   - shortcut:  [text]      (the text is the id)
//
// Reference ids are lower-cased and looked up in parser.refs. When the id is
// taken from text that contains newlines, each newline becomes a single
// space unless the preceding byte is already a space, in which case it is
// dropped.
//
// It returns the number of bytes consumed counting from the '[', or 0 when
// nothing was rendered: already inside a link, no matching ']', unterminated
// link/title/id, unknown reference, empty destination, or (for non-images)
// empty content.
func inlineLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	// no links allowed inside other links
	if parser.insideLink {
		return 0
	}

	isImg := offset > 0 && data[offset-1] == '!'

	data = data[offset:]

	// data[0] is the '[' itself, so scanning starts at 1
	i := 1
	var title, link []byte
	textHasNl := false

	// look for the matching closing bracket
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			textHasNl = true

		case data[i-1] == '\\':
			// escaped byte: does not affect bracket nesting
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	if i >= len(data) {
		return 0
	}

	// txtE indexes the closing ']'; the link text is data[1:txtE]
	txtE := i
	i++

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		linkB := i

		// look for link end: ' " )
		for i < len(data) {
			if data[i] == '\\' {
				// skip the backslash and the byte it escapes
				i += 2
			} else {
				if data[i] == ')' || data[i] == '\'' || data[i] == '"' {
					break
				}
				i++
			}
		}

		if i >= len(data) {
			return 0
		}
		linkE := i

		// look for title end if present
		titleB, titleE := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			titleB = i

			// the title runs up to the first unescaped ')'
			for i < len(data) {
				if data[i] == '\\' {
					i += 2
				} else {
					if data[i] == ')' {
						break
					}
					i++
				}
			}

			if i >= len(data) {
				return 0
			}

			// skip whitespace after title
			titleE = i - 1
			for titleE > titleB && isspace(data[titleE]) {
				titleE--
			}

			// check for closing quote presence
			if data[titleE] != '\'' && data[titleE] != '"' {
				// no closing quote: there is no title, and everything up to
				// the ')' belongs to the link
				titleB, titleE = 0, 0
				linkE = i
			}
		}

		// remove whitespace at the end of the link
		for linkE > linkB && isspace(data[linkE-1]) {
			linkE--
		}

		// remove optional angle brackets around the link
		if data[linkB] == '<' {
			linkB++
		}
		if data[linkE-1] == '>' {
			linkE--
		}

		// build escaped link and title
		if linkE > linkB {
			link = data[linkB:linkE]
		}

		if titleE > titleB {
			title = data[titleB:titleE]
		}

		// step past the closing ')'
		i++

	// reference style link
	case i < len(data) && data[i] == '[':
		var id []byte

		// look for the id
		i++
		linkB := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		linkE := i

		// find the reference
		if linkB == linkE {
			// empty "[]": the link text doubles as the id
			if textHasNl {
				var b bytes.Buffer

				// copy the text, turning each newline into one space unless
				// the previous byte is already a space
				for j := 1; j < txtE; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txtE]
			}
		} else {
			id = data[linkB:linkE]
		}

		// find the reference with matching id (ids are case-insensitive)
		key := string(bytes.ToLower(id))
		lr, ok := parser.refs[key]
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		// step past the closing ']' of the id
		i++

	// shortcut reference style link
	default:
		var id []byte

		// craft the id
		if textHasNl {
			var b bytes.Buffer

			// same newline normalization as for an empty reference id
			for j := 1; j < txtE; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			id = data[1:txtE]
		}

		// find the reference with matching id
		key := string(bytes.ToLower(id))
		lr, ok := parser.refs[key]
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title

		// rewind the whitespace
		i = txtE + 1
	}

	// build content: img alt is escaped, link content is parsed
	var content bytes.Buffer
	if txtE > 1 {
		if isImg {
			content.Write(data[1:txtE])
		} else {
			// links cannot contain other links, so turn off link parsing temporarily
			insideLink := parser.insideLink
			parser.insideLink = true
			parser.parseInline(&content, data[1:txtE])
			parser.insideLink = insideLink
		}
	}

	// resolve backslash escapes in the destination
	var uLink []byte
	if len(link) > 0 {
		var uLinkBuf bytes.Buffer
		unescapeText(&uLinkBuf, link)
		uLink = uLinkBuf.Bytes()
	}

	// links need something to click on and somewhere to go
	if len(uLink) == 0 || (!isImg && content.Len() == 0) {
		return 0
	}

	// call the relevant rendering function
	if isImg {
		// the '!' that introduced the image may already have been written
		// to out; if it is the last byte there, remove it
		outSize := out.Len()
		outBytes := out.Bytes()
		if outSize > 0 && outBytes[outSize-1] == '!' {
			out.Truncate(outSize - 1)
		}

		parser.r.Image(out, uLink, title, content.Bytes())
	} else {
		parser.r.Link(out, uLink, title, content.Bytes())
	}

	return i
}
433
434// '<' when tags or autolinks are allowed
435func inlineLAngle(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
436	data = data[offset:]
437	altype := LINK_TYPE_NOT_AUTOLINK
438	end := tagLength(data, &altype)
439
440	if end > 2 {
441		if altype != LINK_TYPE_NOT_AUTOLINK {
442			var uLink bytes.Buffer
443			unescapeText(&uLink, data[1:end+1-2])
444			if uLink.Len() > 0 {
445				parser.r.AutoLink(out, uLink.Bytes(), altype)
446			}
447		} else {
448			parser.r.RawHtmlTag(out, data[:end])
449		}
450	}
451
452	return end
453}
454
455// '\\' backslash escape
456var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
457
458func inlineEscape(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
459	data = data[offset:]
460
461	if len(data) > 1 {
462		if bytes.IndexByte(escapeChars, data[1]) < 0 {
463			return 0
464		}
465
466		parser.r.NormalText(out, data[1:2])
467	}
468
469	return 2
470}
471
// unescapeText copies src to ob with backslash escapes resolved: every
// backslash is dropped and the byte after it is copied verbatim, whatever it
// is. A backslash that is the very last byte of src is dropped.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for pos := 0; pos < len(src); {
		// locate the next backslash, or the end of src if there is none
		stop := bytes.IndexByte(src[pos:], '\\')
		if stop < 0 {
			stop = len(src)
		} else {
			stop += pos
		}

		// copy the literal run in front of it
		if stop > pos {
			ob.Write(src[pos:stop])
		}

		// no escaped byte left to copy
		if stop+1 >= len(src) {
			break
		}

		ob.WriteByte(src[stop+1])
		pos = stop + 2
	}
}
492
493// '&' escaped when it doesn't belong to an entity
494// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
495func inlineEntity(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
496	data = data[offset:]
497
498	end := 1
499
500	if end < len(data) && data[end] == '#' {
501		end++
502	}
503
504	for end < len(data) && isalnum(data[end]) {
505		end++
506	}
507
508	if end < len(data) && data[end] == ';' {
509		end++ // real entity
510	} else {
511		return 0 // lone '&'
512	}
513
514	parser.r.Entity(out, data[:end])
515
516	return end
517}
518
519func inlineAutoLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
520	// quick check to rule out most false hits on ':'
521	if parser.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
522		return 0
523	}
524
525	// scan backward for a word boundary
526	rewind := 0
527	for offset-rewind > 0 && rewind <= 7 && !isspace(data[offset-rewind-1]) && !isspace(data[offset-rewind-1]) {
528		rewind++
529	}
530	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
531		return 0
532	}
533
534	origData := data
535	data = data[offset-rewind:]
536
537	if !isSafeLink(data) {
538		return 0
539	}
540
541	linkEnd := 0
542	for linkEnd < len(data) && !isspace(data[linkEnd]) {
543		linkEnd++
544	}
545
546	// Skip punctuation at the end of the link
547	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
548		linkEnd--
549	}
550
551	// See if the link finishes with a punctuation sign that can be closed.
552	var copen byte
553	switch data[linkEnd-1] {
554	case '"':
555		copen = '"'
556	case '\'':
557		copen = '\''
558	case ')':
559		copen = '('
560	case ']':
561		copen = '['
562	case '}':
563		copen = '{'
564	default:
565		copen = 0
566	}
567
568	if copen != 0 {
569		bufEnd := offset - rewind + linkEnd - 2
570
571		openDelim := 1
572
573		/* Try to close the final punctuation sign in this same line;
574		 * if we managed to close it outside of the URL, that means that it's
575		 * not part of the URL. If it closes inside the URL, that means it
576		 * is part of the URL.
577		 *
578		 * Examples:
579		 *
580		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
581		 *              => http://www.pokemon.com/Pikachu_(Electric)
582		 *
583		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
584		 *              => http://www.pokemon.com/Pikachu_(Electric)
585		 *
586		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
587		 *              => http://www.pokemon.com/Pikachu_(Electric))
588		 *
589		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
590		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
591		 */
592
593		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
594			if origData[bufEnd] == data[linkEnd-1] {
595				openDelim++
596			}
597
598			if origData[bufEnd] == copen {
599				openDelim--
600			}
601
602			bufEnd--
603		}
604
605		if openDelim == 0 {
606			linkEnd--
607		}
608	}
609
610	// we were triggered on the ':', so we need to rewind the output a bit
611	if out.Len() >= rewind {
612		out.Truncate(len(out.Bytes()) - rewind)
613	}
614
615	var uLink bytes.Buffer
616	unescapeText(&uLink, data[:linkEnd])
617
618	if uLink.Len() > 0 {
619		parser.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)
620	}
621
622	return linkEnd - rewind
623}
624
625var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
626
627func isSafeLink(link []byte) bool {
628	for _, prefix := range validUris {
629		// TODO: handle unicode here
630		// case-insensitive prefix test
631		if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
632			return true
633		}
634	}
635
636	return false
637}
638
639// return the length of the given tag, or 0 is it's not valid
640func tagLength(data []byte, autolink *int) int {
641	var i, j int
642
643	// a valid tag can't be shorter than 3 chars
644	if len(data) < 3 {
645		return 0
646	}
647
648	// begins with a '<' optionally followed by '/', followed by letter or number
649	if data[0] != '<' {
650		return 0
651	}
652	if data[1] == '/' {
653		i = 2
654	} else {
655		i = 1
656	}
657
658	if !isalnum(data[i]) {
659		return 0
660	}
661
662	// scheme test
663	*autolink = LINK_TYPE_NOT_AUTOLINK
664
665	// try to find the beginning of an URI
666	for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
667		i++
668	}
669
670	if i > 1 && data[i] == '@' {
671		if j = isMailtoAutoLink(data[i:]); j != 0 {
672			*autolink = LINK_TYPE_EMAIL
673			return i + j
674		}
675	}
676
677	if i > 2 && data[i] == ':' {
678		*autolink = LINK_TYPE_NORMAL
679		i++
680	}
681
682	// complete autolink test: no whitespace or ' or "
683	switch {
684	case i >= len(data):
685		*autolink = LINK_TYPE_NOT_AUTOLINK
686	case *autolink != 0:
687		j = i
688
689		for i < len(data) {
690			if data[i] == '\\' {
691				i += 2
692			} else {
693				if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
694					break
695				} else {
696					i++
697				}
698			}
699
700		}
701
702		if i >= len(data) {
703			return 0
704		}
705		if i > j && data[i] == '>' {
706			return i + 1
707		}
708
709		// one of the forbidden chars has been found
710		*autolink = LINK_TYPE_NOT_AUTOLINK
711	}
712
713	// look for something looking like a tag end
714	for i < len(data) && data[i] != '>' {
715		i++
716	}
717	if i >= len(data) {
718		return 0
719	}
720	return i + 1
721}
722
723// look for the address part of a mail autolink and '>'
724// this is less strict than the original markdown e-mail address matching
725func isMailtoAutoLink(data []byte) int {
726	nb := 0
727
728	// address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
729	for i := 0; i < len(data); i++ {
730		if isalnum(data[i]) {
731			continue
732		}
733
734		switch data[i] {
735		case '@':
736			nb++
737
738		case '-', '.', '_':
739			break
740
741		case '>':
742			if nb == 1 {
743				return i + 1
744			} else {
745				return 0
746			}
747		default:
748			return 0
749		}
750	}
751
752	return 0
753}
754
// inlineHelperFindEmphChar looks for the next emphasis char c in data,
// skipping over code spans and links on the way.
//
// The search starts at index 1; data[0] is never examined. It returns the
// index of the first c found outside a skipped construct. When the data ends
// in the middle of a code span or link, or a bracketed text turns out not to
// be a link, the index of the first c seen inside that construct is returned
// instead (0 if there was none). It returns 0 when no c is found.
func inlineHelperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		// advance to the next byte that is either c or opens a construct
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		if data[i] == c {
			return i
		}

		// do not count escaped chars: an escaped '`' or '[' opens nothing
		// (i is always >= 1 here, so data[i-1] is in range)
		if data[i-1] == '\\' {
			i++
			continue
		}

		if data[i] == '`' {
			// skip a code span, remembering the first c inside it
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				// unterminated span: fall back to the c seen inside, if any
				return tmpI
			}
			i++
			continue
		}

		// data[i] == '[': skip a link, remembering the first c inside it
		tmpI := 0
		i++
		for i < len(data) && data[i] != ']' {
			if tmpI == 0 && data[i] == c {
				tmpI = i
			}
			i++
		}
		i++
		for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
			i++
		}
		if i >= len(data) {
			return tmpI
		}

		// The link target is delimited by "(...)" or "[...]". Scan for the
		// matching closing delimiter; searching for the opening byte again
		// would run past the end of the target.
		var closer byte
		switch data[i] {
		case '[':
			closer = ']'
		case '(':
			closer = ')'
		default: // not a link
			if tmpI > 0 {
				return tmpI
			}
			continue
		}

		i++
		for i < len(data) && data[i] != closer {
			if tmpI == 0 && data[i] == c {
				tmpI = i
			}
			i++
		}
		if i >= len(data) {
			return tmpI
		}
		i++
	}
	return 0
}
832
833func inlineHelperEmph1(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
834	i := 0
835
836	// skip one symbol if coming from emph3
837	if len(data) > 1 && data[0] == c && data[1] == c {
838		i = 1
839	}
840
841	for i < len(data) {
842		length := inlineHelperFindEmphChar(data[i:], c)
843		if length == 0 {
844			return 0
845		}
846		i += length
847		if i >= len(data) {
848			return 0
849		}
850
851		if i+1 < len(data) && data[i+1] == c {
852			i++
853			continue
854		}
855
856		if data[i] == c && !isspace(data[i-1]) {
857
858			if parser.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
859				if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
860					continue
861				}
862			}
863
864			var work bytes.Buffer
865			parser.parseInline(&work, data[:i])
866			parser.r.Emphasis(out, work.Bytes())
867			return i + 1
868		}
869	}
870
871	return 0
872}
873
874func inlineHelperEmph2(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
875	i := 0
876
877	for i < len(data) {
878		length := inlineHelperFindEmphChar(data[i:], c)
879		if length == 0 {
880			return 0
881		}
882		i += length
883
884		if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
885			var work bytes.Buffer
886			parser.parseInline(&work, data[:i])
887
888			if work.Len() > 0 {
889				// pick the right renderer
890				if c == '~' {
891					parser.r.StrikeThrough(out, work.Bytes())
892				} else {
893					parser.r.DoubleEmphasis(out, work.Bytes())
894				}
895			}
896			return i + 2
897		}
898		i++
899	}
900	return 0
901}
902
903func inlineHelperEmph3(parser *Parser, out *bytes.Buffer, data []byte, offset int, c byte) int {
904	i := 0
905	origData := data
906	data = data[offset:]
907
908	for i < len(data) {
909		length := inlineHelperFindEmphChar(data[i:], c)
910		if length == 0 {
911			return 0
912		}
913		i += length
914
915		// skip whitespace preceded symbols
916		if data[i] != c || isspace(data[i-1]) {
917			continue
918		}
919
920		switch {
921		case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
922			// triple symbol found
923			var work bytes.Buffer
924
925			parser.parseInline(&work, data[:i])
926			if work.Len() > 0 {
927				parser.r.TripleEmphasis(out, work.Bytes())
928			}
929			return i + 3
930		case (i+1 < len(data) && data[i+1] == c):
931			// double symbol found, hand over to emph1
932			length = inlineHelperEmph1(parser, out, origData[offset-2:], c)
933			if length == 0 {
934				return 0
935			} else {
936				return length - 2
937			}
938		default:
939			// single symbol found, hand over to emph2
940			length = inlineHelperEmph2(parser, out, origData[offset-1:], c)
941			if length == 0 {
942				return 0
943			} else {
944				return length - 1
945			}
946		}
947	}
948	return 0
949}