all repos — grayfriday @ 2aca6670787400c2c9c065606b5f9db2eb6ff525

blackfriday fork with a few changes

inline.go (view raw)

  1//
  2// Blackfriday Markdown Processor
  3// Available at http://github.com/russross/blackfriday
  4//
  5// Copyright © 2011 Russ Ross <russ@russross.com>.
  6// Distributed under the Simplified BSD License.
  7// See README.md for details.
  8//
  9
 10//
 11// Functions to parse inline elements.
 12//
 13
 14package blackfriday
 15
 16import (
 17	"bytes"
 18)
 19
 20// Functions to parse text within a block
 21// Each function returns the number of chars taken care of
 22// data is the complete block being rendered
 23// offset is the number of valid chars before the current cursor
 24
 25func (parser *Parser) parseInline(out *bytes.Buffer, data []byte) {
 26	// this is called recursively: enforce a maximum depth
 27	if parser.nesting >= parser.maxNesting {
 28		return
 29	}
 30	parser.nesting++
 31
 32	i, end := 0, 0
 33	for i < len(data) {
 34		// copy inactive chars into the output
 35		for end < len(data) && parser.inline[data[end]] == nil {
 36			end++
 37		}
 38
 39		parser.r.NormalText(out, data[i:end])
 40
 41		if end >= len(data) {
 42			break
 43		}
 44		i = end
 45
 46		// call the trigger
 47		handler := parser.inline[data[end]]
 48		if consumed := handler(parser, out, data, i); consumed == 0 {
 49			// no action from the callback; buffer the byte for later
 50			end = i + 1
 51		} else {
 52			// skip past whatever the callback used
 53			i += consumed
 54			end = i
 55		}
 56	}
 57
 58	parser.nesting--
 59}
 60
 61// single and double emphasis parsing
 62func inlineEmphasis(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
 63	data = data[offset:]
 64	c := data[0]
 65	ret := 0
 66
 67	if len(data) > 2 && data[1] != c {
 68		// whitespace cannot follow an opening emphasis;
 69		// strikethrough only takes two characters '~~'
 70		if c == '~' || isspace(data[1]) {
 71			return 0
 72		}
 73		if ret = inlineHelperEmph1(parser, out, data[1:], c); ret == 0 {
 74			return 0
 75		}
 76
 77		return ret + 1
 78	}
 79
 80	if len(data) > 3 && data[1] == c && data[2] != c {
 81		if isspace(data[2]) {
 82			return 0
 83		}
 84		if ret = inlineHelperEmph2(parser, out, data[2:], c); ret == 0 {
 85			return 0
 86		}
 87
 88		return ret + 2
 89	}
 90
 91	if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
 92		if c == '~' || isspace(data[3]) {
 93			return 0
 94		}
 95		if ret = inlineHelperEmph3(parser, out, data, 3, c); ret == 0 {
 96			return 0
 97		}
 98
 99		return ret + 3
100	}
101
102	return 0
103}
104
105func inlineCodeSpan(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
106	data = data[offset:]
107
108	nb := 0
109
110	// count the number of backticks in the delimiter
111	for nb < len(data) && data[nb] == '`' {
112		nb++
113	}
114
115	// find the next delimiter
116	i, end := 0, 0
117	for end = nb; end < len(data) && i < nb; end++ {
118		if data[end] == '`' {
119			i++
120		} else {
121			i = 0
122		}
123	}
124
125	// no matching delimiter?
126	if i < nb && end >= len(data) {
127		return 0
128	}
129
130	// trim outside whitespace
131	fBegin := nb
132	for fBegin < end && (data[fBegin] == ' ' || data[fBegin] == '\t') {
133		fBegin++
134	}
135
136	fEnd := end - nb
137	for fEnd > fBegin && (data[fEnd-1] == ' ' || data[fEnd-1] == '\t') {
138		fEnd--
139	}
140
141	// render the code span
142	parser.r.CodeSpan(out, data[fBegin:fEnd])
143
144	return end
145
146}
147
148// newline preceded by two spaces becomes <br>
149// newline without two spaces works when EXTENSION_HARD_LINE_BREAK is enabled
150func inlineLineBreak(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
151	// remove trailing spaces from out
152	outBytes := out.Bytes()
153	end := len(outBytes)
154	eol := end
155	for eol > 0 && (outBytes[eol-1] == ' ' || outBytes[eol-1] == '\t') {
156		eol--
157	}
158	out.Truncate(eol)
159
160	// should there be a hard line break here?
161	if parser.flags&EXTENSION_HARD_LINE_BREAK == 0 && end-eol < 2 {
162		return 0
163	}
164
165	parser.r.LineBreak(out)
166	return 1
167}
168
169// '[': parse a link or an image
170func inlineLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
171	// no links allowed inside other links
172	if parser.insideLink {
173		return 0
174	}
175
176	isImg := offset > 0 && data[offset-1] == '!'
177
178	data = data[offset:]
179
180	i := 1
181	var title, link []byte
182	textHasNl := false
183
184	// look for the matching closing bracket
185	for level := 1; level > 0 && i < len(data); i++ {
186		switch {
187		case data[i] == '\n':
188			textHasNl = true
189
190		case data[i-1] == '\\':
191			continue
192
193		case data[i] == '[':
194			level++
195
196		case data[i] == ']':
197			level--
198			if level <= 0 {
199				i-- // compensate for extra i++ in for loop
200			}
201		}
202	}
203
204	if i >= len(data) {
205		return 0
206	}
207
208	txtE := i
209	i++
210
211	// skip any amount of whitespace or newline
212	// (this is much more lax than original markdown syntax)
213	for i < len(data) && isspace(data[i]) {
214		i++
215	}
216
217	// inline style link
218	switch {
219	case i < len(data) && data[i] == '(':
220		// skip initial whitespace
221		i++
222
223		for i < len(data) && isspace(data[i]) {
224			i++
225		}
226
227		linkB := i
228
229		// look for link end: ' " )
230		for i < len(data) {
231			if data[i] == '\\' {
232				i += 2
233			} else {
234				if data[i] == ')' || data[i] == '\'' || data[i] == '"' {
235					break
236				}
237				i++
238			}
239		}
240
241		if i >= len(data) {
242			return 0
243		}
244		linkE := i
245
246		// look for title end if present
247		titleB, titleE := 0, 0
248		if data[i] == '\'' || data[i] == '"' {
249			i++
250			titleB = i
251
252			for i < len(data) {
253				if data[i] == '\\' {
254					i += 2
255				} else {
256					if data[i] == ')' {
257						break
258					}
259					i++
260				}
261			}
262
263			if i >= len(data) {
264				return 0
265			}
266
267			// skip whitespace after title
268			titleE = i - 1
269			for titleE > titleB && isspace(data[titleE]) {
270				titleE--
271			}
272
273			// check for closing quote presence
274			if data[titleE] != '\'' && data[titleE] != '"' {
275				titleB, titleE = 0, 0
276				linkE = i
277			}
278		}
279
280		// remove whitespace at the end of the link
281		for linkE > linkB && isspace(data[linkE-1]) {
282			linkE--
283		}
284
285		// remove optional angle brackets around the link
286		if data[linkB] == '<' {
287			linkB++
288		}
289		if data[linkE-1] == '>' {
290			linkE--
291		}
292
293		// build escaped link and title
294		if linkE > linkB {
295			link = data[linkB:linkE]
296		}
297
298		if titleE > titleB {
299			title = data[titleB:titleE]
300		}
301
302		i++
303
304	// reference style link
305	case i < len(data) && data[i] == '[':
306		var id []byte
307
308		// look for the id
309		i++
310		linkB := i
311		for i < len(data) && data[i] != ']' {
312			i++
313		}
314		if i >= len(data) {
315			return 0
316		}
317		linkE := i
318
319		// find the reference
320		if linkB == linkE {
321			if textHasNl {
322				var b bytes.Buffer
323
324				for j := 1; j < txtE; j++ {
325					switch {
326					case data[j] != '\n':
327						b.WriteByte(data[j])
328					case data[j-1] != ' ':
329						b.WriteByte(' ')
330					}
331				}
332
333				id = b.Bytes()
334			} else {
335				id = data[1:txtE]
336			}
337		} else {
338			id = data[linkB:linkE]
339		}
340
341		// find the reference with matching id (ids are case-insensitive)
342		key := string(bytes.ToLower(id))
343		lr, ok := parser.refs[key]
344		if !ok {
345			return 0
346		}
347
348		// keep link and title from reference
349		link = lr.link
350		title = lr.title
351		i++
352
353	// shortcut reference style link
354	default:
355		var id []byte
356
357		// craft the id
358		if textHasNl {
359			var b bytes.Buffer
360
361			for j := 1; j < txtE; j++ {
362				switch {
363				case data[j] != '\n':
364					b.WriteByte(data[j])
365				case data[j-1] != ' ':
366					b.WriteByte(' ')
367				}
368			}
369
370			id = b.Bytes()
371		} else {
372			id = data[1:txtE]
373		}
374
375		// find the reference with matching id
376		key := string(bytes.ToLower(id))
377		lr, ok := parser.refs[key]
378		if !ok {
379			return 0
380		}
381
382		// keep link and title from reference
383		link = lr.link
384		title = lr.title
385
386		// rewind the whitespace
387		i = txtE + 1
388	}
389
390	// build content: img alt is escaped, link content is parsed
391	var content bytes.Buffer
392	if txtE > 1 {
393		if isImg {
394			content.Write(data[1:txtE])
395		} else {
396			// links cannot contain other links, so turn off link parsing temporarily
397			insideLink := parser.insideLink
398			parser.insideLink = true
399			parser.parseInline(&content, data[1:txtE])
400			parser.insideLink = insideLink
401		}
402	}
403
404	var uLink []byte
405	if len(link) > 0 {
406		var uLinkBuf bytes.Buffer
407		unescapeText(&uLinkBuf, link)
408		uLink = uLinkBuf.Bytes()
409	}
410
411	// links need something to click on and somewhere to go
412	if len(uLink) == 0 || content.Len() == 0 {
413		return 0
414	}
415
416	// call the relevant rendering function
417	if isImg {
418		outSize := out.Len()
419		outBytes := out.Bytes()
420		if outSize > 0 && outBytes[outSize-1] == '!' {
421			out.Truncate(outSize - 1)
422		}
423
424		parser.r.Image(out, uLink, title, content.Bytes())
425	} else {
426		parser.r.Link(out, uLink, title, content.Bytes())
427	}
428
429	return i
430}
431
432// '<' when tags or autolinks are allowed
433func inlineLAngle(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
434	data = data[offset:]
435	altype := LINK_TYPE_NOT_AUTOLINK
436	end := tagLength(data, &altype)
437
438	if end > 2 {
439		if altype != LINK_TYPE_NOT_AUTOLINK {
440			var uLink bytes.Buffer
441			unescapeText(&uLink, data[1:end+1-2])
442			parser.r.AutoLink(out, uLink.Bytes(), altype)
443		} else {
444			parser.r.RawHtmlTag(out, data[:end])
445		}
446	}
447
448	return end
449}
450
451// '\\' backslash escape
452var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
453
454func inlineEscape(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
455	data = data[offset:]
456
457	if len(data) > 1 {
458		if bytes.IndexByte(escapeChars, data[1]) < 0 {
459			return 0
460		}
461
462		parser.r.NormalText(out, data[1:2])
463	}
464
465	return 2
466}
467
// unescapeText copies src into ob with backslash escapes resolved: each
// backslash is dropped and the byte after it is copied verbatim. A backslash
// that is the final byte of src is dropped.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for pos := 0; pos < len(src); {
		rel := bytes.IndexByte(src[pos:], '\\')
		if rel < 0 {
			// no more escapes: copy the remainder and finish
			ob.Write(src[pos:])
			return
		}

		slash := pos + rel
		if slash > pos {
			ob.Write(src[pos:slash])
		}

		// trailing backslash with nothing to escape
		if slash+1 >= len(src) {
			return
		}

		ob.WriteByte(src[slash+1])
		pos = slash + 2
	}
}
488
489// '&' escaped when it doesn't belong to an entity
490// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
491func inlineEntity(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
492	data = data[offset:]
493
494	end := 1
495
496	if end < len(data) && data[end] == '#' {
497		end++
498	}
499
500	for end < len(data) && isalnum(data[end]) {
501		end++
502	}
503
504	if end < len(data) && data[end] == ';' {
505		end++ // real entity
506	} else {
507		return 0 // lone '&'
508	}
509
510	parser.r.Entity(out, data[:end])
511
512	return end
513}
514
515func inlineAutoLink(parser *Parser, out *bytes.Buffer, data []byte, offset int) int {
516	// quick check to rule out most false hits on ':'
517	if parser.insideLink || len(data) < offset+3 || data[offset+1] != '/' || data[offset+2] != '/' {
518		return 0
519	}
520
521	// scan backward for a word boundary
522	rewind := 0
523	for offset-rewind > 0 && rewind <= 7 && !isspace(data[offset-rewind-1]) && !isspace(data[offset-rewind-1]) {
524		rewind++
525	}
526	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
527		return 0
528	}
529
530	origData := data
531	data = data[offset-rewind:]
532
533	if !isSafeLink(data) {
534		return 0
535	}
536
537	linkEnd := 0
538	for linkEnd < len(data) && !isspace(data[linkEnd]) {
539		linkEnd++
540	}
541
542	// Skip punctuation at the end of the link
543	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',' || data[linkEnd-1] == ';') && data[linkEnd-2] != '\\' {
544		linkEnd--
545	}
546
547	// See if the link finishes with a punctuation sign that can be closed.
548	var copen byte
549	switch data[linkEnd-1] {
550	case '"':
551		copen = '"'
552	case '\'':
553		copen = '\''
554	case ')':
555		copen = '('
556	case ']':
557		copen = '['
558	case '}':
559		copen = '{'
560	default:
561		copen = 0
562	}
563
564	if copen != 0 {
565		bufEnd := offset - rewind + linkEnd - 2
566
567		openDelim := 1
568
569		/* Try to close the final punctuation sign in this same line;
570		 * if we managed to close it outside of the URL, that means that it's
571		 * not part of the URL. If it closes inside the URL, that means it
572		 * is part of the URL.
573		 *
574		 * Examples:
575		 *
576		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
577		 *              => http://www.pokemon.com/Pikachu_(Electric)
578		 *
579		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
580		 *              => http://www.pokemon.com/Pikachu_(Electric)
581		 *
582		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
583		 *              => http://www.pokemon.com/Pikachu_(Electric))
584		 *
585		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
586		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
587		 */
588
589		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
590			if origData[bufEnd] == data[linkEnd-1] {
591				openDelim++
592			}
593
594			if origData[bufEnd] == copen {
595				openDelim--
596			}
597
598			bufEnd--
599		}
600
601		if openDelim == 0 {
602			linkEnd--
603		}
604	}
605
606	// we were triggered on the ':', so we need to rewind the output a bit
607	if out.Len() >= rewind {
608		out.Truncate(len(out.Bytes()) - rewind)
609	}
610
611	var uLink bytes.Buffer
612	unescapeText(&uLink, data[:linkEnd])
613
614	parser.r.AutoLink(out, uLink.Bytes(), LINK_TYPE_NORMAL)
615
616	return linkEnd - rewind
617}
618
619var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
620
621func isSafeLink(link []byte) bool {
622	for _, prefix := range validUris {
623		// TODO: handle unicode here
624		// case-insensitive prefix test
625		if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
626			return true
627		}
628	}
629
630	return false
631}
632
633// return the length of the given tag, or 0 is it's not valid
634func tagLength(data []byte, autolink *int) int {
635	var i, j int
636
637	// a valid tag can't be shorter than 3 chars
638	if len(data) < 3 {
639		return 0
640	}
641
642	// begins with a '<' optionally followed by '/', followed by letter or number
643	if data[0] != '<' {
644		return 0
645	}
646	if data[1] == '/' {
647		i = 2
648	} else {
649		i = 1
650	}
651
652	if !isalnum(data[i]) {
653		return 0
654	}
655
656	// scheme test
657	*autolink = LINK_TYPE_NOT_AUTOLINK
658
659	// try to find the beginning of an URI
660	for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
661		i++
662	}
663
664	if i > 1 && data[i] == '@' {
665		if j = isMailtoAutoLink(data[i:]); j != 0 {
666			*autolink = LINK_TYPE_EMAIL
667			return i + j
668		}
669	}
670
671	if i > 2 && data[i] == ':' {
672		*autolink = LINK_TYPE_NORMAL
673		i++
674	}
675
676	// complete autolink test: no whitespace or ' or "
677	switch {
678	case i >= len(data):
679		*autolink = LINK_TYPE_NOT_AUTOLINK
680	case *autolink != 0:
681		j = i
682
683		for i < len(data) {
684			if data[i] == '\\' {
685				i += 2
686			} else {
687				if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
688					break
689				} else {
690					i++
691				}
692			}
693
694		}
695
696		if i >= len(data) {
697			return 0
698		}
699		if i > j && data[i] == '>' {
700			return i + 1
701		}
702
703		// one of the forbidden chars has been found
704		*autolink = LINK_TYPE_NOT_AUTOLINK
705	}
706
707	// look for something looking like a tag end
708	for i < len(data) && data[i] != '>' {
709		i++
710	}
711	if i >= len(data) {
712		return 0
713	}
714	return i + 1
715}
716
717// look for the address part of a mail autolink and '>'
718// this is less strict than the original markdown e-mail address matching
719func isMailtoAutoLink(data []byte) int {
720	nb := 0
721
722	// address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
723	for i := 0; i < len(data); i++ {
724		if isalnum(data[i]) {
725			continue
726		}
727
728		switch data[i] {
729		case '@':
730			nb++
731
732		case '-', '.', '_':
733			break
734
735		case '>':
736			if nb == 1 {
737				return i + 1
738			} else {
739				return 0
740			}
741		default:
742			return 0
743		}
744	}
745
746	return 0
747}
748
// inlineHelperFindEmphChar returns the index of the next occurrence of the
// emphasis character c in data, searching from index 1 and skipping over code
// spans and links so that a c inside them does not close the emphasis.
//
// It returns 0 when no candidate is found. When a code span or link is left
// unterminated, or a bracketed text turns out not to be a link, the first c
// seen inside it is returned instead (0 if there was none).
func inlineHelperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		// run to the next byte of interest
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		if data[i] == c {
			return i
		}

		// an escaped '`' or '[' does not open a code span or link
		// (i >= 1 here, so data[i-1] is always in range)
		if data[i-1] == '\\' {
			i++
			continue
		}

		if data[i] == '`' {
			// skip a code span, remembering the first c inside it
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
			continue
		}

		// data[i] == '[': skip a link, remembering the first c inside it
		tmpI := 0
		i++
		for i < len(data) && data[i] != ']' {
			if tmpI == 0 && data[i] == c {
				tmpI = i
			}
			i++
		}
		i++
		for i < len(data) && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n') {
			i++
		}
		if i >= len(data) {
			return tmpI
		}

		// pick the delimiter that closes the link target; searching for the
		// opening delimiter again would never skip the target
		var cc byte
		switch data[i] {
		case '[':
			cc = ']'
		case '(':
			cc = ')'
		default:
			// not a link
			if tmpI > 0 {
				return tmpI
			}
			continue
		}

		i++
		for i < len(data) && data[i] != cc {
			if tmpI == 0 && data[i] == c {
				tmpI = i
			}
			i++
		}
		if i >= len(data) {
			return tmpI
		}
		i++
	}
	return 0
}
826
827func inlineHelperEmph1(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
828	i := 0
829
830	// skip one symbol if coming from emph3
831	if len(data) > 1 && data[0] == c && data[1] == c {
832		i = 1
833	}
834
835	for i < len(data) {
836		length := inlineHelperFindEmphChar(data[i:], c)
837		if length == 0 {
838			return 0
839		}
840		i += length
841		if i >= len(data) {
842			return 0
843		}
844
845		if i+1 < len(data) && data[i+1] == c {
846			i++
847			continue
848		}
849
850		if data[i] == c && !isspace(data[i-1]) {
851
852			if parser.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
853				if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
854					continue
855				}
856			}
857
858			var work bytes.Buffer
859			parser.parseInline(&work, data[:i])
860			parser.r.Emphasis(out, work.Bytes())
861			return i + 1
862		}
863	}
864
865	return 0
866}
867
868func inlineHelperEmph2(parser *Parser, out *bytes.Buffer, data []byte, c byte) int {
869	i := 0
870
871	for i < len(data) {
872		length := inlineHelperFindEmphChar(data[i:], c)
873		if length == 0 {
874			return 0
875		}
876		i += length
877
878		if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
879			var work bytes.Buffer
880			parser.parseInline(&work, data[:i])
881
882			// pick the right renderer
883			if c == '~' {
884				parser.r.StrikeThrough(out, work.Bytes())
885			} else {
886				parser.r.DoubleEmphasis(out, work.Bytes())
887			}
888			return i + 2
889		}
890		i++
891	}
892	return 0
893}
894
895func inlineHelperEmph3(parser *Parser, out *bytes.Buffer, data []byte, offset int, c byte) int {
896	i := 0
897	origData := data
898	data = data[offset:]
899
900	for i < len(data) {
901		length := inlineHelperFindEmphChar(data[i:], c)
902		if length == 0 {
903			return 0
904		}
905		i += length
906
907		// skip whitespace preceded symbols
908		if data[i] != c || isspace(data[i-1]) {
909			continue
910		}
911
912		switch {
913		case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
914			// triple symbol found
915			var work bytes.Buffer
916
917			parser.parseInline(&work, data[:i])
918			parser.r.TripleEmphasis(out, work.Bytes())
919			return i + 3
920		case (i+1 < len(data) && data[i+1] == c):
921			// double symbol found, hand over to emph1
922			length = inlineHelperEmph1(parser, out, origData[offset-2:], c)
923			if length == 0 {
924				return 0
925			} else {
926				return length - 2
927			}
928		default:
929			// single symbol found, hand over to emph2
930			length = inlineHelperEmph2(parser, out, origData[offset-1:], c)
931			if length == 0 {
932				return 0
933			} else {
934				return length - 1
935			}
936		}
937	}
938	return 0
939}