all repos — grayfriday @ ee3fe992034b5e391daf418765e97b37fe2b06be

blackfriday fork with a few changes

inline.go (view raw)

  1//
  2// Black Friday Markdown Processor
  3// Originally based on http://github.com/tanoku/upskirt
  4// by Russ Ross <russ@russross.com>
  5//
  6
  7//
  8// Functions to parse inline elements.
  9//
 10
 11package blackfriday
 12
 13import (
 14	"bytes"
 15)
 16
 17// Functions to parse text within a block
 18// Each function returns the number of chars taken care of
 19// data is the complete block being rendered
 20// offset is the number of valid chars before the current cursor
 21
 22func parseInline(out *bytes.Buffer, rndr *render, data []byte) {
 23	if rndr.nesting >= rndr.maxNesting {
 24		return
 25	}
 26	rndr.nesting++
 27
 28	i, end := 0, 0
 29	for i < len(data) {
 30		// copy inactive chars into the output
 31		for end < len(data) && rndr.inline[data[end]] == nil {
 32			end++
 33		}
 34
 35		if rndr.mk.normalText != nil {
 36			rndr.mk.normalText(out, data[i:end], rndr.mk.opaque)
 37		} else {
 38			out.Write(data[i:end])
 39		}
 40
 41		if end >= len(data) {
 42			break
 43		}
 44		i = end
 45
 46		// call the trigger
 47		parser := rndr.inline[data[end]]
 48		end = parser(out, rndr, data, i)
 49
 50		if end == 0 { // no action from the callback
 51			end = i + 1
 52		} else {
 53			i += end
 54			end = i
 55		}
 56	}
 57
 58	rndr.nesting--
 59}
 60
 61// single and double emphasis parsing
 62func inlineEmphasis(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
 63	data = data[offset:]
 64	c := data[0]
 65	ret := 0
 66
 67	if len(data) > 2 && data[1] != c {
 68		// whitespace cannot follow an opening emphasis;
 69		// strikethrough only takes two characters '~~'
 70		if c == '~' || isspace(data[1]) {
 71			return 0
 72		}
 73		if ret = inlineHelperEmph1(out, rndr, data[1:], c); ret == 0 {
 74			return 0
 75		}
 76
 77		return ret + 1
 78	}
 79
 80	if len(data) > 3 && data[1] == c && data[2] != c {
 81		if isspace(data[2]) {
 82			return 0
 83		}
 84		if ret = inlineHelperEmph2(out, rndr, data[2:], c); ret == 0 {
 85			return 0
 86		}
 87
 88		return ret + 2
 89	}
 90
 91	if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
 92		if c == '~' || isspace(data[3]) {
 93			return 0
 94		}
 95		if ret = inlineHelperEmph3(out, rndr, data, 3, c); ret == 0 {
 96			return 0
 97		}
 98
 99		return ret + 3
100	}
101
102	return 0
103}
104
105func inlineCodespan(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
106	data = data[offset:]
107
108	nb := 0
109
110	// count the number of backticks in the delimiter
111	for nb < len(data) && data[nb] == '`' {
112		nb++
113	}
114
115	// find the next delimiter
116	i, end := 0, 0
117	for end = nb; end < len(data) && i < nb; end++ {
118		if data[end] == '`' {
119			i++
120		} else {
121			i = 0
122		}
123	}
124
125	if i < nb && end >= len(data) {
126		return 0 // no matching delimiter
127	}
128
129	// trim outside whitespace
130	f_begin := nb
131	for f_begin < end && (data[f_begin] == ' ' || data[f_begin] == '\t') {
132		f_begin++
133	}
134
135	f_end := end - nb
136	for f_end > nb && (data[f_end-1] == ' ' || data[f_end-1] == '\t') {
137		f_end--
138	}
139
140	// real code span
141	if rndr.mk.codespan == nil {
142		return 0
143	}
144	if f_begin < f_end {
145		if rndr.mk.codespan(out, data[f_begin:f_end], rndr.mk.opaque) == 0 {
146			end = 0
147		}
148	} else {
149		if rndr.mk.codespan(out, nil, rndr.mk.opaque) == 0 {
150			end = 0
151		}
152	}
153
154	return end
155
156}
157
158// '\n' preceded by two spaces
159func inlineLinebreak(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
160	if offset < 2 || data[offset-1] != ' ' || data[offset-2] != ' ' {
161		return 0
162	}
163
164	// remove trailing spaces from out and render
165	outBytes := out.Bytes()
166	end := len(outBytes)
167	for end > 0 && outBytes[end-1] == ' ' {
168		end--
169	}
170	out.Truncate(end)
171
172	if rndr.mk.linebreak == nil {
173		return 0
174	}
175	if rndr.mk.linebreak(out, rndr.mk.opaque) > 0 {
176		return 1
177	} else {
178		return 0
179	}
180
181	return 0
182}
183
// inlineLink handles a '[' found at data[offset]: it parses a link or, when
// the '[' is directly preceded by '!', an image.
//
// Three forms are recognised after the closing ']' (optionally separated from
// it by whitespace/newlines):
//
//	[text](url "title")   inline link
//	[text][id]            reference link; an empty id falls back to the text
//	[text]                shortcut reference link, the text is the id
//
// Reference ids are looked up in rndr.refs after lower-casing. Returns the
// number of bytes consumed starting at offset, or 0 when the input is not a
// well-formed link, the reference is unknown, the needed renderer callback is
// missing, or the callback itself returns 0.
func inlineLink(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
	// an image is a link whose '[' is immediately preceded by '!'
	isImg := offset > 0 && data[offset-1] == '!'

	data = data[offset:]

	// data[0] is the '[' itself, so scanning starts at 1
	i := 1
	var title, link []byte
	text_has_nl := false

	// check whether the correct renderer exists
	if (isImg && rndr.mk.image == nil) || (!isImg && rndr.mk.link == nil) {
		return 0
	}

	// look for the matching closing bracket, tracking nesting depth;
	// a bracket preceded by a backslash is not counted
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			text_has_nl = true

		case data[i-1] == '\\':
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	// ran off the end without finding the matching ']'
	if i >= len(data) {
		return 0
	}

	// txt_e is the index of the closing ']'; the link text is data[1:txt_e]
	txt_e := i
	i++

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		link_b := i

		// look for link end: ' " )
		// a backslash skips the byte that follows it
		for i < len(data) {
			if data[i] == '\\' {
				i += 2
			} else {
				if data[i] == ')' || data[i] == '\'' || data[i] == '"' {
					break
				}
				i++
			}
		}

		if i >= len(data) {
			return 0
		}
		link_e := i

		// look for title end if present
		title_b, title_e := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			title_b = i

			// the title is taken to run up to the first unescaped ')'
			for i < len(data) {
				if data[i] == '\\' {
					i += 2
				} else {
					if data[i] == ')' {
						break
					}
					i++
				}
			}

			if i >= len(data) {
				return 0
			}

			// skip whitespace after title
			title_e = i - 1
			for title_e > title_b && isspace(data[title_e]) {
				title_e--
			}

			// check for closing quote presence; without one the quote was
			// part of the link, so the link is extended up to the ')'
			if data[title_e] != '\'' && data[title_e] != '"' {
				title_b, title_e = 0, 0
				link_e = i
			}
		}

		// remove whitespace at the end of the link
		for link_e > link_b && isspace(data[link_e-1]) {
			link_e--
		}

		// remove optional angle brackets around the link
		if data[link_b] == '<' {
			link_b++
		}
		if data[link_e-1] == '>' {
			link_e--
		}

		// build escaped link and title
		if link_e > link_b {
			link = data[link_b:link_e]
		}

		if title_e > title_b {
			title = data[title_b:title_e]
		}

		// step past the closing ')'
		i++

	// reference style link
	case i < len(data) && data[i] == '[':
		var id []byte

		// look for the id
		i++
		link_b := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		link_e := i

		// find the reference
		if link_b == link_e {
			// empty id ("[text][]"): the link text itself is the id
			if text_has_nl {
				b := bytes.NewBuffer(nil)

				// copy the text, turning each newline into a single space
				// unless the byte before it is already a space
				for j := 1; j < txt_e; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txt_e]
			}
		} else {
			id = data[link_b:link_e]
		}

		// find the reference with matching id (ids are case-insensitive)
		key := string(bytes.ToLower(id))
		lr, ok := rndr.refs[key]
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		// step past the closing ']' of the id
		i++

	// shortcut reference style link
	default:
		var id []byte

		// craft the id from the link text, normalising newlines the same way
		// as for an empty-id reference link
		if text_has_nl {
			b := bytes.NewBuffer(nil)

			for j := 1; j < txt_e; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			id = data[1:txt_e]
		}

		// find the reference with matching id
		key := string(bytes.ToLower(id))
		lr, ok := rndr.refs[key]
		if !ok {
			return 0
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title

		// rewind the whitespace: only "[text]" itself is consumed
		i = txt_e + 1
	}

	// build content: image alt text is copied verbatim, link content is
	// parsed for nested inline elements
	content := bytes.NewBuffer(nil)
	if txt_e > 1 {
		if isImg {
			content.Write(data[1:txt_e])
		} else {
			parseInline(content, rndr, data[1:txt_e])
		}
	}

	// resolve backslash escapes in the destination
	var u_link []byte
	if len(link) > 0 {
		u_link_buf := bytes.NewBuffer(nil)
		unescapeText(u_link_buf, link)
		u_link = u_link_buf.Bytes()
	}

	// call the relevant rendering function
	ret := 0
	if isImg {
		// the '!' was already emitted as normal text; take it back
		outSize := out.Len()
		outBytes := out.Bytes()
		if outSize > 0 && outBytes[outSize-1] == '!' {
			out.Truncate(outSize - 1)
		}

		ret = rndr.mk.image(out, u_link, title, content.Bytes(), rndr.mk.opaque)
	} else {
		ret = rndr.mk.link(out, u_link, title, content.Bytes(), rndr.mk.opaque)
	}

	if ret > 0 {
		return i
	}
	return 0
}
441
442// '<' when tags or autolinks are allowed
443func inlineLangle(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
444	data = data[offset:]
445	altype := LINK_TYPE_NOT_AUTOLINK
446	end := tagLength(data, &altype)
447	ret := 0
448
449	if end > 2 {
450		switch {
451		case rndr.mk.autolink != nil && altype != LINK_TYPE_NOT_AUTOLINK:
452			u_link := bytes.NewBuffer(nil)
453			unescapeText(u_link, data[1:end+1-2])
454			ret = rndr.mk.autolink(out, u_link.Bytes(), altype, rndr.mk.opaque)
455		case rndr.mk.rawHtmlTag != nil:
456			ret = rndr.mk.rawHtmlTag(out, data[:end], rndr.mk.opaque)
457		}
458	}
459
460	if ret == 0 {
461		return 0
462	}
463	return end
464}
465
466// '\\' backslash escape
467var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>")
468
469func inlineEscape(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
470	data = data[offset:]
471
472	if len(data) > 1 {
473		if bytes.IndexByte(escapeChars, data[1]) < 0 {
474			return 0
475		}
476
477		if rndr.mk.normalText != nil {
478			rndr.mk.normalText(out, data[1:2], rndr.mk.opaque)
479		} else {
480			out.WriteByte(data[1])
481		}
482	}
483
484	return 2
485}
486
// unescapeText copies src into ob with backslash escapes resolved: every
// backslash is dropped and the byte after it is copied literally. A backslash
// that is the final byte of src is dropped with nothing emitted for it.
func unescapeText(ob *bytes.Buffer, src []byte) {
	for pos := 0; pos < len(src); {
		// locate the next backslash, or the end of input when none remain
		next := len(src)
		if rel := bytes.IndexByte(src[pos:], '\\'); rel >= 0 {
			next = pos + rel
		}

		// copy the literal run before it
		if next > pos {
			ob.Write(src[pos:next])
		}

		// no byte follows the backslash (or no backslash at all): done
		if next+1 >= len(src) {
			return
		}

		ob.WriteByte(src[next+1])
		pos = next + 2
	}
}
507
508// '&' escaped when it doesn't belong to an entity
509// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
510func inlineEntity(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
511	data = data[offset:]
512
513	end := 1
514
515	if end < len(data) && data[end] == '#' {
516		end++
517	}
518
519	for end < len(data) && isalnum(data[end]) {
520		end++
521	}
522
523	if end < len(data) && data[end] == ';' {
524		end++ // real entity
525	} else {
526		return 0 // lone '&'
527	}
528
529	if rndr.mk.entity != nil {
530		rndr.mk.entity(out, data[:end], rndr.mk.opaque)
531	} else {
532		out.Write(data[:end])
533	}
534
535	return end
536}
537
538func inlineAutolink(out *bytes.Buffer, rndr *render, data []byte, offset int) int {
539	orig_data := data
540	data = data[offset:]
541
542	if offset > 0 {
543		if !isspace(orig_data[offset-1]) && !ispunct(orig_data[offset-1]) {
544			return 0
545		}
546	}
547
548	if !isSafeLink(data) {
549		return 0
550	}
551
552	link_end := 0
553	for link_end < len(data) && !isspace(data[link_end]) {
554		link_end++
555	}
556
557	// Skip punctuation at the end of the link
558	if (data[link_end-1] == '.' || data[link_end-1] == ',' || data[link_end-1] == ';') && data[link_end-2] != '\\' {
559		link_end--
560	}
561
562	// See if the link finishes with a punctuation sign that can be closed.
563	var copen byte
564	switch data[link_end-1] {
565	case '"':
566		copen = '"'
567	case '\'':
568		copen = '\''
569	case ')':
570		copen = '('
571	case ']':
572		copen = '['
573	case '}':
574		copen = '{'
575	default:
576		copen = 0
577	}
578
579	if copen != 0 {
580		buf_end := offset + link_end - 2
581
582		open_delim := 1
583
584		/* Try to close the final punctuation sign in this same line;
585		 * if we managed to close it outside of the URL, that means that it's
586		 * not part of the URL. If it closes inside the URL, that means it
587		 * is part of the URL.
588		 *
589		 * Examples:
590		 *
591		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
592		 *              => http://www.pokemon.com/Pikachu_(Electric)
593		 *
594		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
595		 *              => http://www.pokemon.com/Pikachu_(Electric)
596		 *
597		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
598		 *              => http://www.pokemon.com/Pikachu_(Electric))
599		 *
600		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
601		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
602		 */
603
604		for buf_end >= 0 && orig_data[buf_end] != '\n' && open_delim != 0 {
605			if orig_data[buf_end] == data[link_end-1] {
606				open_delim++
607			}
608
609			if orig_data[buf_end] == copen {
610				open_delim--
611			}
612
613			buf_end--
614		}
615
616		if open_delim == 0 {
617			link_end--
618		}
619	}
620
621	if rndr.mk.autolink != nil {
622		u_link := bytes.NewBuffer(nil)
623		unescapeText(u_link, data[:link_end])
624
625		rndr.mk.autolink(out, u_link.Bytes(), LINK_TYPE_NORMAL, rndr.mk.opaque)
626	}
627
628	return link_end
629}
630
631var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
632
633func isSafeLink(link []byte) bool {
634	for _, prefix := range validUris {
635		// TODO: handle unicode here
636		// case-insensitive prefix test
637		if len(link) > len(prefix) && !less(link[:len(prefix)], prefix) && !less(prefix, link[:len(prefix)]) && isalnum(link[len(prefix)]) {
638			return true
639		}
640	}
641
642	return false
643}
644
645// return the length of the given tag, or 0 is it's not valid
646func tagLength(data []byte, autolink *int) int {
647	var i, j int
648
649	// a valid tag can't be shorter than 3 chars
650	if len(data) < 3 {
651		return 0
652	}
653
654	// begins with a '<' optionally followed by '/', followed by letter or number
655	if data[0] != '<' {
656		return 0
657	}
658	if data[1] == '/' {
659		i = 2
660	} else {
661		i = 1
662	}
663
664	if !isalnum(data[i]) {
665		return 0
666	}
667
668	// scheme test
669	*autolink = LINK_TYPE_NOT_AUTOLINK
670
671	// try to find the beggining of an URI
672	for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
673		i++
674	}
675
676	if i > 1 && data[i] == '@' {
677		if j = isMailtoAutolink(data[i:]); j != 0 {
678			*autolink = LINK_TYPE_EMAIL
679			return i + j
680		}
681	}
682
683	if i > 2 && data[i] == ':' {
684		*autolink = LINK_TYPE_NORMAL
685		i++
686	}
687
688	// complete autolink test: no whitespace or ' or "
689	switch {
690	case i >= len(data):
691		*autolink = LINK_TYPE_NOT_AUTOLINK
692	case *autolink != 0:
693		j = i
694
695		for i < len(data) {
696			if data[i] == '\\' {
697				i += 2
698			} else {
699				if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
700					break
701				} else {
702					i++
703				}
704			}
705
706		}
707
708		if i >= len(data) {
709			return 0
710		}
711		if i > j && data[i] == '>' {
712			return i + 1
713		}
714
715		// one of the forbidden chars has been found
716		*autolink = LINK_TYPE_NOT_AUTOLINK
717	}
718
719	// look for something looking like a tag end
720	for i < len(data) && data[i] != '>' {
721		i++
722	}
723	if i >= len(data) {
724		return 0
725	}
726	return i + 1
727}
728
729// look for the address part of a mail autolink and '>'
730// this is less strict than the original markdown e-mail address matching
731func isMailtoAutolink(data []byte) int {
732	nb := 0
733
734	// address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
735	for i := 0; i < len(data); i++ {
736		if isalnum(data[i]) {
737			continue
738		}
739
740		switch data[i] {
741		case '@':
742			nb++
743
744		case '-', '.', '_':
745			break
746
747		case '>':
748			if nb == 1 {
749				return i + 1
750			} else {
751				return 0
752			}
753		default:
754			return 0
755		}
756	}
757
758	return 0
759}
760
// inlineHelperFindEmphChar looks for the next occurrence of the emphasis
// delimiter c in data, starting at index 1, while skipping over code spans
// and links so that delimiters inside them are not preferred.
//
// It returns the index of the delimiter, or 0 when none is found. A
// delimiter, backtick or '[' that is preceded by a backslash is treated as
// literal text and skipped. When a code span or link turns out to be
// unterminated, the first delimiter seen inside it (if any) is returned
// instead.
func inlineHelperFindEmphChar(data []byte, c byte) int {
	i := 1

	for i < len(data) {
		// advance to the next byte of interest
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}

		// do not count escaped chars; this must be checked before the
		// delimiter test so that an escaped delimiter (e.g. `\*`) does not
		// close the emphasis. i is always >= 1 here.
		if data[i-1] == '\\' {
			i++
			continue
		}

		if data[i] == c {
			return i
		}

		switch data[i] {
		case '`':
			// skip a code span, remembering the first delimiter inside it
			tmp_i := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmp_i == 0 && data[i] == c {
					tmp_i = i
				}
				i++
			}
			if i >= len(data) {
				// unterminated: fall back to the delimiter seen inside
				return tmp_i
			}
			i++

		case '[':
			// skip a link, remembering the first delimiter inside it
			tmp_i := 0
			i++
			for i < len(data) && data[i] != ']' {
				if tmp_i == 0 && data[i] == c {
					tmp_i = i
				}
				i++
			}
			i++
			for i < len(data) && (data[i] == ' ' || data[i] == '\t' || data[i] == '\n') {
				i++
			}
			if i >= len(data) {
				return tmp_i
			}
			if data[i] != '[' && data[i] != '(' { // not a link
				if tmp_i > 0 {
					return tmp_i
				}
				continue
			}

			// NOTE(review): the scan below stops at the next occurrence of
			// the opening byte itself ('(' or '['), not at its closing
			// counterpart; kept as in the original.
			cc := data[i]
			i++
			for i < len(data) && data[i] != cc {
				if tmp_i == 0 && data[i] == c {
					tmp_i = i
				}
				i++
			}
			if i >= len(data) {
				return tmp_i
			}
			i++
		}
	}
	return 0
}
838
839func inlineHelperEmph1(out *bytes.Buffer, rndr *render, data []byte, c byte) int {
840	i := 0
841
842	if rndr.mk.emphasis == nil {
843		return 0
844	}
845
846	// skip one symbol if coming from emph3
847	if len(data) > 1 && data[0] == c && data[1] == c {
848		i = 1
849	}
850
851	for i < len(data) {
852		length := inlineHelperFindEmphChar(data[i:], c)
853		if length == 0 {
854			return 0
855		}
856		i += length
857		if i >= len(data) {
858			return 0
859		}
860
861		if i+1 < len(data) && data[i+1] == c {
862			i++
863			continue
864		}
865
866		if data[i] == c && !isspace(data[i-1]) {
867
868			if rndr.flags&EXTENSION_NO_INTRA_EMPHASIS != 0 {
869				if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
870					continue
871				}
872			}
873
874			work := bytes.NewBuffer(nil)
875			parseInline(work, rndr, data[:i])
876			r := rndr.mk.emphasis(out, work.Bytes(), rndr.mk.opaque)
877			if r > 0 {
878				return i + 1
879			} else {
880				return 0
881			}
882		}
883	}
884
885	return 0
886}
887
888func inlineHelperEmph2(out *bytes.Buffer, rndr *render, data []byte, c byte) int {
889	render_method := rndr.mk.doubleEmphasis
890	if c == '~' {
891		render_method = rndr.mk.strikethrough
892	}
893
894	if render_method == nil {
895		return 0
896	}
897
898	i := 0
899
900	for i < len(data) {
901		length := inlineHelperFindEmphChar(data[i:], c)
902		if length == 0 {
903			return 0
904		}
905		i += length
906
907		if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
908			work := bytes.NewBuffer(nil)
909			parseInline(work, rndr, data[:i])
910			r := render_method(out, work.Bytes(), rndr.mk.opaque)
911			if r > 0 {
912				return i + 2
913			} else {
914				return 0
915			}
916		}
917		i++
918	}
919	return 0
920}
921
922func inlineHelperEmph3(out *bytes.Buffer, rndr *render, data []byte, offset int, c byte) int {
923	i := 0
924	orig_data := data
925	data = data[offset:]
926
927	for i < len(data) {
928		length := inlineHelperFindEmphChar(data[i:], c)
929		if length == 0 {
930			return 0
931		}
932		i += length
933
934		// skip whitespace preceded symbols
935		if data[i] != c || isspace(data[i-1]) {
936			continue
937		}
938
939		switch {
940		case (i+2 < len(data) && data[i+1] == c && data[i+2] == c && rndr.mk.tripleEmphasis != nil):
941			// triple symbol found
942			work := bytes.NewBuffer(nil)
943
944			parseInline(work, rndr, data[:i])
945			r := rndr.mk.tripleEmphasis(out, work.Bytes(), rndr.mk.opaque)
946			if r > 0 {
947				return i + 3
948			} else {
949				return 0
950			}
951		case (i+1 < len(data) && data[i+1] == c):
952			// double symbol found, hand over to emph1
953			length = inlineHelperEmph1(out, rndr, orig_data[offset-2:], c)
954			if length == 0 {
955				return 0
956			} else {
957				return length - 2
958			}
959		default:
960			// single symbol found, hand over to emph2
961			length = inlineHelperEmph2(out, rndr, orig_data[offset-1:], c)
962			if length == 0 {
963				return 0
964			} else {
965				return length - 1
966			}
967		}
968	}
969	return 0
970}