grayfriday @ 60cb261accdc72d2cea155d8e67013d346d0b340

blackfriday fork with a few changes

inline.go

//
// Blackfriday Markdown Processor
// Available at http://github.com/russross/blackfriday
//
// Copyright © 2011 Russ Ross <russ@russross.com>.
// Distributed under the Simplified BSD License.
// See README.md for details.
//

//
// Functions to parse inline elements.
//

package blackfriday

import (
	"bytes"
)

// Functions to parse text within a block
// Each function returns the number of chars taken care of
// data is the complete block being rendered
// offset is the number of valid chars before the current cursor

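// inline walks data one byte at a time: runs of bytes with no registered
// trigger are flushed through NormalText, while bytes that do have an
// inlineCallback handler are dispatched to it. A handler returns the number
// of bytes it consumed, or 0 to have the trigger byte treated as plain text.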
func (parser *Parser) inline(out *bytes.Buffer, data []byte) {
	// this is called recursively: enforce a maximum depth
	if parser.nesting >= parser.maxNesting {
		return
	}
	parser.nesting++

	i, end := 0, 0
	for i < len(data) {
		// copy inactive chars into the output
		for end < len(data) && parser.inlineCallback[data[end]] == nil {
			end++
		}

		parser.r.NormalText(out, data[i:end])

		if end >= len(data) {
			break
		}
		i = end

		// call the trigger
		handler := parser.inlineCallback[data[end]]
		if consumed := handler(parser, out, data, i); consumed == 0 {
			// no action from the callback; buffer the byte for later
			end = i + 1
		} else {
			// skip past whatever the callback used
			i += consumed
			end = i
		}
	}

	parser.nesting--
}

// single and double emphasis parsing
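// The opening run of the delimiter character decides which helper runs:
//   *foo* or _foo_      -> helperEmphasis (single)
//   **foo** or ~~foo~~  -> helperDoubleEmphasis (double / strikethrough)
//   ***foo***           -> helperTripleEmphasis (triple)
// Each helper reports how many bytes it consumed past the opening run.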
func emphasis(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	data = data[offset:]
	c := data[0]
	ret := 0

	if len(data) > 2 && data[1] != c {
		// whitespace cannot follow an opening emphasis;
		// strikethrough only takes two characters '~~'
		if c == '~' || isspace(data[1]) {
			return 0
		}
		if ret = helperEmphasis(parser, out, data[1:], c); ret == 0 {
			return 0
		}

		return ret + 1
	}

	if len(data) > 3 && data[1] == c && data[2] != c {
		if isspace(data[2]) {
			return 0
		}
		if ret = helperDoubleEmphasis(parser, out, data[2:], c); ret == 0 {
			return 0
		}

		return ret + 2
	}

	if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
		if c == '~' || isspace(data[3]) {
			return 0
		}
		if ret = helperTripleEmphasis(parser, out, data, 3, c); ret == 0 {
			return 0
		}

		return ret + 3
	}

	return 0
}

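// '`': parse a code span. The opening run of backticks must be matched by a
// closing run of the same length; leading and trailing spaces inside the
// span are trimmed before the contents are passed to CodeSpan.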
func codeSpan(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	data = data[offset:]

	nb := 0

	// count the number of backticks in the delimiter
	for nb < len(data) && data[nb] == '`' {
		nb++
	}

	// find the next delimiter
	i, end := 0, 0
	for end = nb; end < len(data) && i < nb; end++ {
		if data[end] == '`' {
			i++
		} else {
			i = 0
		}
	}

	// no matching delimiter?
	if i < nb && end >= len(data) {
		return 0
	}

	// trim outside whitespace
	fBegin := nb
	for fBegin < end && data[fBegin] == ' ' {
		fBegin++
	}

	fEnd := end - nb
	for fEnd > fBegin && data[fEnd-1] == ' ' {
		fEnd--
	}

	// render the code span
	if fBegin != fEnd {
		parser.r.CodeSpan(out, data[fBegin:fEnd])
	}

	return end
}

// newline preceded by two spaces becomes <br>
// newline without two spaces works when EXTENSION_HARD_LINE_BREAK is enabled
func lineBreak(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	// remove trailing spaces from out
	outBytes := out.Bytes()
	end := len(outBytes)
	eol := end
	for eol > 0 && outBytes[eol-1] == ' ' {
		eol--
	}
	out.Truncate(eol)

	// should there be a hard line break here?
	if parser.flags&EXTENSION_HARD_LINE_BREAK == 0 && end-eol < 2 {
		return 0
	}

	parser.r.LineBreak(out)
	return 1
}

// '[': parse a link or an image
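// The text between the brackets is located first, then the byte that follows
// the closing ']' decides the form: '(' starts an inline link with an
// optional quoted title, '[' starts a reference-style link resolved through
// parser.refs, and anything else is treated as a shortcut reference.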
func link(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	// no links allowed inside other links
	if parser.insideLink {
		return 0
	}

	isImg := offset > 0 && data[offset-1] == '!'

	data = data[offset:]

	i := 1
	var title, link []byte
	textHasNl := false

	// look for the matching closing bracket
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			textHasNl = true

		case data[i-1] == '\\':
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	if i >= len(data) {
		return 0
	}

	txtE := i
	i++

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		linkB := i

		// look for link end: ' " )
	findlinkend:
		for i < len(data) {
			switch {
			case data[i] == '\\':
				i += 2

			case data[i] == ')' || data[i] == '\'' || data[i] == '"':
				break findlinkend

			default:
				i++
			}
		}

		if i >= len(data) {
			return 0
		}
		linkE := i

		// look for title end if present
		titleB, titleE := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			titleB = i

		findtitleend:
			for i < len(data) {
				switch {
				case data[i] == '\\':
					i += 2

				case data[i] == ')':
					break findtitleend

				default:
					i++
				}
			}

			if i >= len(data) {
				return 0
			}

			// skip whitespace after title
			titleE = i - 1
			for titleE > titleB && isspace(data[titleE]) {
				titleE--
			}

			// check for closing quote presence
			if data[titleE] != '\'' && data[titleE] != '"' {
				titleB, titleE = 0, 0
				linkE = i
			}
		}

		// remove whitespace at the end of the link
		for linkE > linkB && isspace(data[linkE-1]) {
			linkE--
		}

		// remove optional angle brackets around the link
		if data[linkB] == '<' {
			linkB++
		}
		if data[linkE-1] == '>' {
			linkE--
		}

		// build escaped link and title
		if linkE > linkB {
			link = data[linkB:linkE]
		}

		if titleE > titleB {
			title = data[titleB:titleE]
		}

		i++

	// reference style link
	case i < len(data) && data[i] == '[':
		var id []byte

		// look for the id
		i++
		linkB := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		linkE := i

		// find the reference
		if linkB == linkE {
			if textHasNl {
				var b bytes.Buffer

				for j := 1; j < txtE; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txtE]
			}
		} else {
			id = data[linkB:linkE]
		}

		// find the reference with matching id (ids are case-insensitive)
		key := string(bytes.ToLower(id))
		lr, ok := parser.refs[key]
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		i++

	// shortcut reference style link
	default:
		var id []byte

		// craft the id
		if textHasNl {
			var b bytes.Buffer

			for j := 1; j < txtE; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			id = data[1:txtE]
		}

		// find the reference with matching id
		key := string(bytes.ToLower(id))
		lr, ok := parser.refs[key]
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title

		// rewind the whitespace
		i = txtE + 1
	}

	// build content: img alt is escaped, link content is parsed
	var content bytes.Buffer
	if txtE > 1 {
		if isImg {
			content.Write(data[1:txtE])
		} else {
			// links cannot contain other links, so turn off link parsing temporarily
			insideLink := parser.insideLink
			parser.insideLink = true
			parser.inline(&content, data[1:txtE])
			parser.insideLink = insideLink
		}
	}

	var uLink []byte
	if len(link) > 0 {
		var uLinkBuf bytes.Buffer
		unescapeText(&uLinkBuf, link)
		uLink = uLinkBuf.Bytes()
	}

	// links need something to click on and somewhere to go
	if len(uLink) == 0 || (!isImg && content.Len() == 0) {
		return 0
	}

	// call the relevant rendering function
	if isImg {
		outSize := out.Len()
		outBytes := out.Bytes()
		if outSize > 0 && outBytes[outSize-1] == '!' {
			out.Truncate(outSize - 1)
		}

		parser.r.Image(out, uLink, title, content.Bytes())
	} else {
		parser.r.Link(out, uLink, title, content.Bytes())
	}

	return i
}

// '<' when tags or autolinks are allowed
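// tagLength decides what follows the '<': if it reports an autolink type,
// the text between the angle brackets is unescaped and emitted through
// AutoLink; otherwise the whole construct is passed through as a raw HTML tag.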
func leftAngle(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	data = data[offset:]
	altype := LINK_TYPE_NOT_AUTOLINK
	end := tagLength(data, &altype)

	if end > 2 {
		if altype != LINK_TYPE_NOT_AUTOLINK {
			var uLink bytes.Buffer
			unescapeText(&uLink, data[1:end+1-2])
			if uLink.Len() > 0 {
				parser.r.AutoLink(out, uLink.Bytes(), altype)
			}
		} else {
			parser.r.RawHtmlTag(out, data[:end])
		}
	}

	return end
}

// '\\' backslash escape
var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")

func escape(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	data = data[offset:]

	if len(data) > 1 {
		if bytes.IndexByte(escapeChars, data[1]) < 0 {
			return 0
		}

		parser.r.NormalText(out, data[1:2])
	}

	return 2
}

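// unescapeText copies src into ob, dropping the backslash from every
// backslash escape so that "\(" is written out as "(".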
func unescapeText(ob *bytes.Buffer, src []byte) {
	i := 0
	for i < len(src) {
		org := i
		for i < len(src) && src[i] != '\\' {
			i++
		}

		if i > org {
			ob.Write(src[org:i])
		}

		if i+1 >= len(src) {
			break
		}

		ob.WriteByte(src[i+1])
		i += 2
	}
}

// '&' escaped when it doesn't belong to an entity
// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
func entity(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	data = data[offset:]

	end := 1

	if end < len(data) && data[end] == '#' {
		end++
	}

	for end < len(data) && isalnum(data[end]) {
		end++
	}

	if end < len(data) && data[end] == ';' {
		end++ // real entity
	} else {
		return 0 // lone '&'
	}

	parser.r.Entity(out, data[:end])

	return end
}

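// ':' as part of a bare URL such as "http://...". The trigger byte is the
// ':' of "://", so part of the scheme has already been written to out; the
// function rewinds both the input offset and the output buffer to recover it.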
func autoLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
	// quick check to rule out most false hits on ':'
	if parser.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
		return 0
	}

	// scan backward for a word boundary
	rewind := 0
	for offset-rewind > 0 && rewind <= 7 && !isspace(data[offset-rewind-1]) {
		rewind++
	}
	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
		return 0
	}

	origData := data
	data = data[offset-rewind:]

	if !isSafeLink(data) {
		return 0
	}

	linkEnd := 0
	for linkEnd < len(data) && !isspace(data[linkEnd]) {
		linkEnd++
	}

	// Skip punctuation at the end of the link
	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
		linkEnd--
	}

	// See if the link finishes with a punctuation sign that can be closed.
	var copen byte
	switch data[linkEnd-1] {
	case '"':
		copen = '"'
	case '\'':
		copen = '\''
	case ')':
		copen = '('
	case ']':
		copen = '['
	case '}':
		copen = '{'
	default:
		copen = 0
	}

	if copen != 0 {
		bufEnd := offset - rewind + linkEnd - 2

		openDelim := 1

		/* Try to close the final punctuation sign in this same line;
		 * if we managed to close it outside of the URL, that means that it's
		 * not part of the URL. If it closes inside the URL, that means it
		 * is part of the URL.
		 *
		 * Examples:
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric))
		 *
		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
		 */

		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
			if origData[bufEnd] == data[linkEnd-1] {
				openDelim++
			}

			if origData[bufEnd] == copen {
				openDelim--
			}

			bufEnd--
		}

		if openDelim == 0 {
			linkEnd--
		}
	}

	// we were triggered on the ':', so we need to rewind the output a bit
	if out.Len() >= rewind {
		out.Truncate(len(out.Bytes()) - rewind)
	}

	var uLink bytes.Buffer
	unescapeText(&uLink, data[:linkEnd])

	if uLink.Len() > 0 {
		parser.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)
	}

	return linkEnd - rewind
}

var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}

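// isSafeLink reports whether link starts with one of the schemes in
// validUris (compared case-insensitively) and is followed by at least one
// alphanumeric character.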
func isSafeLink(link []byte) bool {
	for _, prefix := range validUris {
		// TODO: handle unicode here
		// case-insensitive prefix test
		if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
			return true
		}
	}

	return false
}

// return the length of the given tag, or 0 if it's not valid
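// Accepted forms include plain HTML tags such as "<em>" and "</em>",
// URL autolinks such as "<http://example.com>", and e-mail autolinks such
// as "<foo@example.com>"; *autolink is set to the detected link type.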
func tagLength(data []byte, autolink *int) int {
	var i, j int

	// a valid tag can't be shorter than 3 chars
	if len(data) < 3 {
		return 0
	}

	// begins with a '<' optionally followed by '/', followed by letter or number
	if data[0] != '<' {
		return 0
	}
	if data[1] == '/' {
		i = 2
	} else {
		i = 1
	}

	if !isalnum(data[i]) {
		return 0
	}

	// scheme test
	*autolink = LINK_TYPE_NOT_AUTOLINK

	// try to find the beginning of a URI
	for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
		i++
	}

	if i > 1 && i < len(data) && data[i] == '@' {
		if j = isMailtoAutoLink(data[i:]); j != 0 {
			*autolink = LINK_TYPE_EMAIL
			return i + j
		}
	}

	if i > 2 && i < len(data) && data[i] == ':' {
		*autolink = LINK_TYPE_NORMAL
		i++
	}

	// complete autolink test: no whitespace or ' or "
	switch {
	case i >= len(data):
		*autolink = LINK_TYPE_NOT_AUTOLINK
	case *autolink != 0:
		j = i

		for i < len(data) {
			if data[i] == '\\' {
				i += 2
			} else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
				break
			} else {
				i++
			}
		}

		if i >= len(data) {
			return 0
		}
		if i > j && data[i] == '>' {
			return i + 1
		}

		// one of the forbidden chars has been found
		*autolink = LINK_TYPE_NOT_AUTOLINK
	}

	// look for something looking like a tag end
	for i < len(data) && data[i] != '>' {
		i++
	}
	if i >= len(data) {
		return 0
	}
	return i + 1
}

// look for the address part of a mail autolink and '>'
// this is less strict than the original markdown e-mail address matching
func isMailtoAutoLink(data []byte) int {
	nb := 0

	// address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
	for i := 0; i < len(data); i++ {
		if isalnum(data[i]) {
			continue
		}

		switch data[i] {
		case '@':
			nb++

		case '-', '.', '_':
			break

		case '>':
			if nb == 1 {
				return i + 1
			} else {
				return 0
			}
		default:
			return 0
		}
	}

	return 0
}

// look for the next emph char, skipping other constructs
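// helperFindEmphChar returns the index of the next occurrence of c in data,
// skipping over code spans and bracketed link constructs along the way, or 0
// if no suitable occurrence is found.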
func helperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		if data[i] == c {
			return i
		}

		// do not count escaped chars
		if i != 0 && data[i-1] == '\\' {
			i++
			continue
		}

		if data[i] == '`' {
			// skip a code span
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		} else if data[i] == '[' {
			// skip a link
			tmpI := 0
			i++
			for i < len(data) && data[i] != ']' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			i++
			for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			if data[i] != '[' && data[i] != '(' { // not a link
				if tmpI > 0 {
					return tmpI
				} else {
					continue
				}
			}
			cc := data[i]
			i++
			for i < len(data) && data[i] != cc {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		}
	}
	return 0
}

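// helperEmphasis parses the body of a single-delimiter emphasis span such as
// *foo* or _foo_; data starts just after the opening delimiter, and the
// return value counts the bytes consumed up to and including the closing one.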
func helperEmphasis(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
	i := 0

	// skip one symbol if coming from emph3
	if len(data) > 1 && data[0] == c && data[1] == c {
		i = 1
	}

	for i < len(data) {
		length := helperFindEmphChar(data[i:], c)
		if length == 0 {
			return 0
		}
		i += length
		if i >= len(data) {
			return 0
		}

		if i+1 < len(data) && data[i+1] == c {
			i++
			continue
		}

		if data[i] == c && !isspace(data[i-1]) {
			if parser.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
				if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
					continue
				}
			}

			var work bytes.Buffer
			parser.inline(&work, data[:i])
			parser.r.Emphasis(out, work.Bytes())
			return i + 1
		}
	}

	return 0
}

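// helperDoubleEmphasis parses the body of **strong** and ~~strikethrough~~
// spans; data starts after the two opening delimiters, and the return value
// includes the two closing ones.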
func helperDoubleEmphasis(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
	i := 0

	for i < len(data) {
		length := helperFindEmphChar(data[i:], c)
		if length == 0 {
			return 0
		}
		i += length

		if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
			var work bytes.Buffer
			parser.inline(&work, data[:i])

			if work.Len() > 0 {
				// pick the right renderer
				if c == '~' {
					parser.r.StrikeThrough(out, work.Bytes())
				} else {
					parser.r.DoubleEmphasis(out, work.Bytes())
				}
			}
			return i + 2
		}
		i++
	}
	return 0
}

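// helperTripleEmphasis handles a run of three delimiters: a matching triple
// closer becomes TripleEmphasis, while a double or single closer falls back
// to the single- or double-emphasis helpers over a wider slice of the input.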
func helperTripleEmphasis(parser *Parser, out *bytes.Buffer, data []byte, offset int, c byte) int {
	i := 0
	origData := data
	data = data[offset:]

	for i < len(data) {
		length := helperFindEmphChar(data[i:], c)
		if length == 0 {
			return 0
		}
		i += length

		// skip symbols preceded by whitespace
		if data[i] != c || isspace(data[i-1]) {
			continue
		}

		switch {
		case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
			// triple symbol found
			var work bytes.Buffer

			parser.inline(&work, data[:i])
			if work.Len() > 0 {
				parser.r.TripleEmphasis(out, work.Bytes())
			}
			return i + 3
		case (i+1 < len(data) && data[i+1] == c):
			// double symbol found, hand over to emph1
			length = helperEmphasis(parser, out, origData[offset-2:], c)
			if length == 0 {
				return 0
			} else {
				return length - 2
			}
		default:
			// single symbol found, hand over to emph2
			length = helperDoubleEmphasis(parser, out, origData[offset-1:], c)
			if length == 0 {
				return 0
			} else {
				return length - 1
			}
		}
	}
	return 0
}