inline.go
//
// Blackfriday Markdown Processor
// Available at http://github.com/russross/blackfriday
//
// Copyright © 2011 Russ Ross <russ@russross.com>.
// Distributed under the Simplified BSD License.
// See README.md for details.
//

//
// Functions to parse inline elements.
//

package blackfriday

import (
	"bytes"
	"regexp"
	"strconv"
	"strings"
)

var (
	urlRe    = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`
	anchorRe = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>` + urlRe + `<\/a>)`)

	// https://www.w3.org/TR/html5/syntax.html#character-references
	// highest unicode code point in 17 planes (17 * 2^16): 1,114,112d =
	// 7 dec digits or 6 hex digits
	// named entity references can be 2-31 characters with stuff like &lt;
	// at one end and &CounterClockwiseContourIntegral; at the other. There
	// are also sometimes numbers at the end, although this isn't inherent
	// in the specification; there are never numbers anywhere else in
	// current character references, though; see &frac34; and &blk12;, etc.
	// https://www.w3.org/TR/html5/syntax.html#named-character-references
	//
	// entity := "&" (named group | number ref) ";"
	// named group := [a-zA-Z]{2,31}[0-9]{0,2}
	// number ref := "#" (dec ref | hex ref)
	// dec ref := [0-9]{1,7}
	// hex ref := ("x" | "X") [0-9a-fA-F]{1,6}
	htmlEntityRe = regexp.MustCompile(`&([a-zA-Z]{2,31}[0-9]{0,2}|#([0-9]{1,7}|[xX][0-9a-fA-F]{1,6}));`)
)

// Functions to parse text within a block
// Each function returns the number of chars taken care of
// data is the complete block being rendered
// offset is the number of valid chars before the current cursor

func (p *Markdown) inline(currBlock *Node, data []byte) {
	// handlers might call us recursively: enforce a maximum depth
	if p.nesting >= p.maxNesting || len(data) == 0 {
		return
	}
	p.nesting++
	beg, end := 0, 0
	for end < len(data) {
		handler := p.inlineCallback[data[end]]
		if handler != nil {
			if consumed, node := handler(p, data, end); consumed == 0 {
				// No action from the callback.
				end++
			} else {
				// Copy inactive chars into the output.
				currBlock.AppendChild(text(data[beg:end]))
				if node != nil {
					currBlock.AppendChild(node)
				}
				// Skip past whatever the callback used.
				beg = end + consumed
				end = beg
			}
		} else {
			end++
		}
	}
	if beg < len(data) {
		if data[end-1] == '\n' {
			end--
		}
		currBlock.AppendChild(text(data[beg:end]))
	}
	p.nesting--
}

// single and double emphasis parsing
func emphasis(p *Markdown, data []byte, offset int) (int, *Node) {
	data = data[offset:]
	c := data[0]

	if len(data) > 2 && data[1] != c {
		// whitespace cannot follow an opening emphasis;
		// strikethrough only takes two characters '~~'
		if c == '~' || isspace(data[1]) {
			return 0, nil
		}
		ret, node := helperEmphasis(p, data[1:], c)
		if ret == 0 {
			return 0, nil
		}

		return ret + 1, node
	}

	if len(data) > 3 && data[1] == c && data[2] != c {
		if isspace(data[2]) {
			return 0, nil
		}
		ret, node := helperDoubleEmphasis(p, data[2:], c)
		if ret == 0 {
			return 0, nil
		}

		return ret + 2, node
	}

	if len(data) > 4 && data[1] == c && data[2] == c && data[3] != c {
		if c == '~' || isspace(data[3]) {
			return 0, nil
		}
		ret, node := helperTripleEmphasis(p, data, 3, c)
		if ret == 0 {
			return 0, nil
		}

		return ret + 3, node
	}

	return 0, nil
}

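// '`': parse a span of code delimited by one or more backticks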
func codeSpan(p *Markdown, data []byte, offset int) (int, *Node) {
	data = data[offset:]

	nb := 0

	// count the number of backticks in the delimiter
	for nb < len(data) && data[nb] == '`' {
		nb++
	}

	// find the next delimiter
	i, end := 0, 0
	for end = nb; end < len(data) && i < nb; end++ {
		if data[end] == '`' {
			i++
		} else {
			i = 0
		}
	}

	// no matching delimiter?
	if i < nb && end >= len(data) {
		return 0, nil
	}

	// trim outside whitespace
	fBegin := nb
	for fBegin < end && data[fBegin] == ' ' {
		fBegin++
	}

	fEnd := end - nb
	for fEnd > fBegin && data[fEnd-1] == ' ' {
		fEnd--
	}

	// render the code span
	if fBegin != fEnd {
		code := NewNode(Code)
		code.Literal = data[fBegin:fEnd]
		return end, code
	}

	return end, nil
}

// newline preceded by two spaces becomes <br>
func maybeLineBreak(p *Markdown, data []byte, offset int) (int, *Node) {
	origOffset := offset
	for offset < len(data) && data[offset] == ' ' {
		offset++
	}

	if offset < len(data) && data[offset] == '\n' {
		if offset-origOffset >= 2 {
			return offset - origOffset + 1, NewNode(Hardbreak)
		}
		return offset - origOffset, nil
	}
	return 0, nil
}

// newline without two spaces works when HardLineBreak is enabled
func lineBreak(p *Markdown, data []byte, offset int) (int, *Node) {
	if p.extensions&HardLineBreak != 0 {
		return 1, NewNode(Hardbreak)
	}
	return 0, nil
}

type linkType int

const (
	linkNormal linkType = iota
	linkImg
	linkDeferredFootnote
	linkInlineFootnote
)

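// isReferenceStyleLink reports whether the text at pos looks like the start of
// a reference-style link id ("[...]"); footnote references are excluded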
func isReferenceStyleLink(data []byte, pos int, t linkType) bool {
	if t == linkDeferredFootnote {
		return false
	}
	return pos < len(data)-1 && data[pos] == '[' && data[pos+1] != '^'
}

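// '!': parse an image when followed by '['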
func maybeImage(p *Markdown, data []byte, offset int) (int, *Node) {
	if offset < len(data)-1 && data[offset+1] == '[' {
		return link(p, data, offset)
	}
	return 0, nil
}

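// '^': parse an inline footnote when followed by '['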
func maybeInlineFootnote(p *Markdown, data []byte, offset int) (int, *Node) {
	if offset < len(data)-1 && data[offset+1] == '[' {
		return link(p, data, offset)
	}
	return 0, nil
}

// '[': parse a link or an image or a footnote
func link(p *Markdown, data []byte, offset int) (int, *Node) {
	// no links allowed inside regular links, footnote, and deferred footnotes
	if p.insideLink && (offset > 0 && data[offset-1] == '[' || len(data)-1 > offset && data[offset+1] == '^') {
		return 0, nil
	}

	var t linkType
	switch {
	// special case: ![^text] == deferred footnote (that follows something with
	// an exclamation point)
	case p.extensions&Footnotes != 0 && len(data)-1 > offset && data[offset+1] == '^':
		t = linkDeferredFootnote
	// ![alt] == image
	case offset >= 0 && data[offset] == '!':
		t = linkImg
		offset++
	// ^[text] == inline footnote
	// [^refId] == deferred footnote
	case p.extensions&Footnotes != 0:
		if offset >= 0 && data[offset] == '^' {
			t = linkInlineFootnote
			offset++
		} else if len(data)-1 > offset && data[offset+1] == '^' {
			t = linkDeferredFootnote
		}
	// [text] == regular link
	default:
		t = linkNormal
	}

	data = data[offset:]

	var (
		i                       = 1
		noteID                  int
		title, link, altContent []byte
		widthHeight             []byte
		textHasNl               = false
	)

	if t == linkDeferredFootnote {
		i++
	}

	// look for the matching closing bracket
	for level := 1; level > 0 && i < len(data); i++ {
		switch {
		case data[i] == '\n':
			textHasNl = true

		case isBackslashEscaped(data, i):
			continue

		case data[i] == '[':
			level++

		case data[i] == ']':
			level--
			if level <= 0 {
				i-- // compensate for extra i++ in for loop
			}
		}
	}

	if i >= len(data) {
		return 0, nil
	}

	txtE := i
	i++
	var footnoteNode *Node

	// skip any amount of whitespace or newline
	// (this is much more lax than original markdown syntax)
	for i < len(data) && isspace(data[i]) {
		i++
	}

	// inline style link
	switch {
	case i < len(data) && data[i] == '(':
		// skip initial whitespace
		i++

		for i < len(data) && isspace(data[i]) {
			i++
		}

		linkB := i

		// look for link end: ' " )
	findlinkend:
		for i < len(data) {
			switch {
			case data[i] == '\\':
				i += 2

			case data[i] == ')' || data[i] == '\'' || data[i] == '"':
				break findlinkend

			default:
				i++
			}
		}

		if i >= len(data) {
			return 0, nil
		}
		linkE := i

		// look for title end if present
		titleB, titleE := 0, 0
		if data[i] == '\'' || data[i] == '"' {
			i++
			titleB = i

		findtitleend:
			for i < len(data) {
				switch {
				case data[i] == '\\':
					i += 2

				case data[i] == ')':
					break findtitleend

				default:
					i++
				}
			}

			if i >= len(data) {
				return 0, nil
			}

			// skip whitespace after title
			titleE = i - 1
			for titleE > titleB && isspace(data[titleE]) {
				titleE--
			}

			// check for closing quote presence
			if data[titleE] != '\'' && data[titleE] != '"' {
				titleB, titleE = 0, 0
				linkE = i
			}
		}

		// remove whitespace at the end of the link
		for linkE > linkB && isspace(data[linkE-1]) {
			linkE--
		}

		// remove optional angle brackets around the link
		if data[linkB] == '<' {
			linkB++
		}
		if data[linkE-1] == '>' {
			linkE--
		}

		// build escaped link and title
		if linkE > linkB {
			link = data[linkB:linkE]
		}

		if titleE > titleB {
			title = data[titleB:titleE]
		}

		i++

		var whE, whB int
		if i < len(data) && data[i] == '{' {
			i++
			whB = i

		findwidthheight:
			for i < len(data) {
				switch {
				case data[i] == '}':
					break findwidthheight
				default:
					i++
				}
			}

			if i >= len(data) {
				return 0, nil
			}

			whE = i

			if whE > whB {
				widthHeight = data[whB:whE]
			}
			i++
		}
	// reference style link
	case isReferenceStyleLink(data, i, t):
		var id []byte
		altContentConsidered := false

		// look for the id
		i++
		linkB := i
		for i < len(data) && data[i] != ']' {
			i++
		}
		if i >= len(data) {
			return 0, nil
		}
		linkE := i

		// find the reference
		if linkB == linkE {
			if textHasNl {
				var b bytes.Buffer

				for j := 1; j < txtE; j++ {
					switch {
					case data[j] != '\n':
						b.WriteByte(data[j])
					case data[j-1] != ' ':
						b.WriteByte(' ')
					}
				}

				id = b.Bytes()
			} else {
				id = data[1:txtE]
				altContentConsidered = true
			}
		} else {
			id = data[linkB:linkE]
		}

		// find the reference with matching id
		lr, ok := p.getRef(string(id))
		if !ok {
			return 0, nil
		}

		// keep link and title from reference
		link = lr.link
		title = lr.title
		if altContentConsidered {
			altContent = lr.text
		}
		i++

	// shortcut reference style link or reference or inline footnote
	default:
		var id []byte

		// craft the id
		if textHasNl {
			var b bytes.Buffer

			for j := 1; j < txtE; j++ {
				switch {
				case data[j] != '\n':
					b.WriteByte(data[j])
				case data[j-1] != ' ':
					b.WriteByte(' ')
				}
			}

			id = b.Bytes()
		} else {
			if t == linkDeferredFootnote {
				id = data[2:txtE] // get rid of the ^
			} else {
				id = data[1:txtE]
			}
		}

		footnoteNode = NewNode(Item)
		if t == linkInlineFootnote {
			// create a new reference
			noteID = len(p.notes) + 1

			var fragment []byte
			if len(id) > 0 {
				if len(id) < 16 {
					fragment = make([]byte, len(id))
				} else {
					fragment = make([]byte, 16)
				}
				copy(fragment, slugify(id))
			} else {
				fragment = append([]byte("footnote-"), []byte(strconv.Itoa(noteID))...)
			}

			ref := &reference{
				noteID:   noteID,
				hasBlock: false,
				link:     fragment,
				title:    id,
				footnote: footnoteNode,
			}

			p.notes = append(p.notes, ref)

			link = ref.link
			title = ref.title
		} else {
			// find the reference with matching id
			lr, ok := p.getRef(string(id))
			if !ok {
				return 0, nil
			}

			if t == linkDeferredFootnote {
				lr.noteID = len(p.notes) + 1
				lr.footnote = footnoteNode
				p.notes = append(p.notes, lr)
			}

			// keep link and title from reference
			link = lr.link
			// if inline footnote, title == footnote contents
			title = lr.title
			noteID = lr.noteID
		}

		// rewind the whitespace
		i = txtE + 1
	}

	var uLink []byte
	if t == linkNormal || t == linkImg {
		if len(link) > 0 {
			var uLinkBuf bytes.Buffer
			unescapeText(&uLinkBuf, link)
			uLink = uLinkBuf.Bytes()
		}

		// links need something to click on and somewhere to go
		if len(uLink) == 0 || (t == linkNormal && txtE <= 1) {
			return 0, nil
		}
	}

	// call the relevant rendering function
	var linkNode *Node
	switch t {
	case linkNormal:
		linkNode = NewNode(Link)
		linkNode.Destination = normalizeURI(uLink)
		linkNode.Title = title
		if len(altContent) > 0 {
			linkNode.AppendChild(text(altContent))
		} else {
			// links cannot contain other links, so turn off link parsing
			// temporarily and recurse
			insideLink := p.insideLink
			p.insideLink = true
			p.inline(linkNode, data[1:txtE])
			p.insideLink = insideLink
		}

	case linkImg:
		linkNode = NewNode(Image)
		if len(widthHeight) > 0 {
			wh := strings.Split(string(widthHeight), "x")
			// guard against a malformed "{...}" block with no "x" separator,
			// which would otherwise panic on wh[1]
			if len(wh) == 2 {
				w, _ := strconv.Atoi(wh[0])
				h, _ := strconv.Atoi(wh[1])
				linkNode.Width, linkNode.Height = w, h
			}
		}
		linkNode.Destination = uLink
		linkNode.Title = title
		linkNode.AppendChild(text(data[1:txtE]))
		i++

	case linkInlineFootnote, linkDeferredFootnote:
		linkNode = NewNode(Link)
		linkNode.Destination = link
		linkNode.Title = title
		linkNode.NoteID = noteID
		linkNode.Footnote = footnoteNode
		if t == linkInlineFootnote {
			i++
		}

	default:
		return 0, nil
	}

	return i, linkNode
}

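// inlineHTMLComment returns the length of an HTML comment ("<!-- ... -->") at
// the start of data, or 0 if data does not begin with a complete comment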
func (p *Markdown) inlineHTMLComment(data []byte) int {
	if len(data) < 5 {
		return 0
	}
	if data[0] != '<' || data[1] != '!' || data[2] != '-' || data[3] != '-' {
		return 0
	}
	i := 5
	// scan for an end-of-comment marker, across lines if necessary
	for i < len(data) && !(data[i-2] == '-' && data[i-1] == '-' && data[i] == '>') {
		i++
	}
	// no end-of-comment marker
	if i >= len(data) {
		return 0
	}
	return i + 1
}

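// stripMailto removes a leading "mailto:" or "mailto://" scheme from a link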
func stripMailto(link []byte) []byte {
	if bytes.HasPrefix(link, []byte("mailto://")) {
		return link[9:]
	} else if bytes.HasPrefix(link, []byte("mailto:")) {
		return link[7:]
	} else {
		return link
	}
}

// autolinkType specifies a kind of autolink that gets detected.
type autolinkType int

// These are the possible flag values for the autolink renderer.
const (
	notAutolink autolinkType = iota
	normalAutolink
	emailAutolink
)

// '<' when tags or autolinks are allowed
func leftAngle(p *Markdown, data []byte, offset int) (int, *Node) {
	data = data[offset:]
	altype, end := tagLength(data)
	if size := p.inlineHTMLComment(data); size > 0 {
		end = size
	}
	if end > 2 {
		if altype != notAutolink {
			var uLink bytes.Buffer
			unescapeText(&uLink, data[1:end+1-2])
			if uLink.Len() > 0 {
				link := uLink.Bytes()
				node := NewNode(Link)
				node.Destination = link
				if altype == emailAutolink {
					node.Destination = append([]byte("mailto:"), link...)
				}
				node.AppendChild(text(stripMailto(link)))
				return end, node
			}
		} else {
			htmlTag := NewNode(HTMLSpan)
			htmlTag.Literal = data[:end]
			return end, htmlTag
		}
	}

	return end, nil
}

// '\\' backslash escape
var escapeChars = []byte("\\`*_{}[]()#+-.!:|&<>~")

func escape(p *Markdown, data []byte, offset int) (int, *Node) {
	data = data[offset:]

	if len(data) > 1 {
		if p.extensions&BackslashLineBreak != 0 && data[1] == '\n' {
			return 2, NewNode(Hardbreak)
		}
		if bytes.IndexByte(escapeChars, data[1]) < 0 {
			return 0, nil
		}

		return 2, text(data[1:2])
	}

	return 2, nil
}

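// unescapeText copies src into ob, dropping the backslash in front of each
// backslash-escaped character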
func unescapeText(ob *bytes.Buffer, src []byte) {
	i := 0
	for i < len(src) {
		org := i
		for i < len(src) && src[i] != '\\' {
			i++
		}

		if i > org {
			ob.Write(src[org:i])
		}

		if i+1 >= len(src) {
			break
		}

		ob.WriteByte(src[i+1])
		i += 2
	}
}

// '&' escaped when it doesn't belong to an entity
// valid entities are assumed to be anything matching &#?[A-Za-z0-9]+;
func entity(p *Markdown, data []byte, offset int) (int, *Node) {
	data = data[offset:]

	end := 1

	if end < len(data) && data[end] == '#' {
		end++
	}

	for end < len(data) && isalnum(data[end]) {
		end++
	}

	if end < len(data) && data[end] == ';' {
		end++ // real entity
	} else {
		return 0, nil // lone '&'
	}

	ent := data[:end]
	// undo &amp; escaping or it will be converted to &amp;amp; by another
	// escaper in the renderer
	if bytes.Equal(ent, []byte("&amp;")) {
		ent = []byte{'&'}
	}

	return end, text(ent)
}

func linkEndsWithEntity(data []byte, linkEnd int) bool {
	entityRanges := htmlEntityRe.FindAllIndex(data[:linkEnd], -1)
	return entityRanges != nil && entityRanges[len(entityRanges)-1][1] == linkEnd
}

// hasPrefixCaseInsensitive is a custom implementation of
// strings.HasPrefix(strings.ToLower(s), prefix)
// we rolled our own because ToLower pulls in a huge machinery of lowercasing
// anything from Unicode and that's very slow. Since this func will only be
// used on ASCII protocol prefixes, we can take shortcuts.
func hasPrefixCaseInsensitive(s, prefix []byte) bool {
	if len(s) < len(prefix) {
		return false
	}
	delta := byte('a' - 'A')
	for i, b := range prefix {
		if b != s[i] && b != s[i]+delta {
			return false
		}
	}
	return true
}

var protocolPrefixes = [][]byte{
	[]byte("http://"),
	[]byte("https://"),
	[]byte("ftp://"),
	[]byte("file://"),
	[]byte("mailto:"),
}

const shortestPrefix = 6 // len("ftp://"), the shortest of the above

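// maybeAutoLink is a cheap pre-check: autoLink is only called when the text at
// offset starts with one of the recognized protocol prefixes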
func maybeAutoLink(p *Markdown, data []byte, offset int) (int, *Node) {
	// quick check to rule out most false hits
	if p.insideLink || len(data) < offset+shortestPrefix {
		return 0, nil
	}
	for _, prefix := range protocolPrefixes {
		endOfHead := offset + 8 // 8 is the len() of the longest prefix
		if endOfHead > len(data) {
			endOfHead = len(data)
		}
		if hasPrefixCaseInsensitive(data[offset:endOfHead], prefix) {
			return autoLink(p, data, offset)
		}
	}
	return 0, nil
}

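// autoLink turns a bare URL in the text into a Link node, trimming trailing
// punctuation and unbalanced closing delimiters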
func autoLink(p *Markdown, data []byte, offset int) (int, *Node) {
	// Now a more expensive check to see if we're not inside an anchor element
	anchorStart := offset
	offsetFromAnchor := 0
	for anchorStart > 0 && data[anchorStart] != '<' {
		anchorStart--
		offsetFromAnchor++
	}

	anchorStr := anchorRe.Find(data[anchorStart:])
	if anchorStr != nil {
		anchorClose := NewNode(HTMLSpan)
		anchorClose.Literal = anchorStr[offsetFromAnchor:]
		return len(anchorStr) - offsetFromAnchor, anchorClose
	}

	// scan backward for a word boundary
	rewind := 0
	for offset-rewind > 0 && rewind <= 7 && isletter(data[offset-rewind-1]) {
		rewind++
	}
	if rewind > 6 { // longest supported protocol is "mailto" which has 6 letters
		return 0, nil
	}

	origData := data
	data = data[offset-rewind:]

	if !isSafeLink(data) {
		return 0, nil
	}

	linkEnd := 0
	for linkEnd < len(data) && !isEndOfLink(data[linkEnd]) {
		linkEnd++
	}

	// Skip punctuation at the end of the link
	if (data[linkEnd-1] == '.' || data[linkEnd-1] == ',') && data[linkEnd-2] != '\\' {
		linkEnd--
	}

	// But don't skip semicolon if it's a part of escaped entity:
	if data[linkEnd-1] == ';' && data[linkEnd-2] != '\\' && !linkEndsWithEntity(data, linkEnd) {
		linkEnd--
	}

	// See if the link finishes with a punctuation sign that can be closed.
	var copen byte
	switch data[linkEnd-1] {
	case '"':
		copen = '"'
	case '\'':
		copen = '\''
	case ')':
		copen = '('
	case ']':
		copen = '['
	case '}':
		copen = '{'
	default:
		copen = 0
	}

	if copen != 0 {
		bufEnd := offset - rewind + linkEnd - 2

		openDelim := 1

		/* Try to close the final punctuation sign in this same line;
		 * if we managed to close it outside of the URL, that means that it's
		 * not part of the URL. If it closes inside the URL, that means it
		 * is part of the URL.
		 *
		 * Examples:
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo (http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric)
		 *
		 *      foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => http://www.pokemon.com/Pikachu_(Electric))
		 *
		 *      (foo http://www.pokemon.com/Pikachu_(Electric)) bar
		 *              => foo http://www.pokemon.com/Pikachu_(Electric)
		 */

		for bufEnd >= 0 && origData[bufEnd] != '\n' && openDelim != 0 {
			if origData[bufEnd] == data[linkEnd-1] {
				openDelim++
			}

			if origData[bufEnd] == copen {
				openDelim--
			}

			bufEnd--
		}

		if openDelim == 0 {
			linkEnd--
		}
	}

	var uLink bytes.Buffer
	unescapeText(&uLink, data[:linkEnd])

	if uLink.Len() > 0 {
		node := NewNode(Link)
		node.Destination = uLink.Bytes()
		node.AppendChild(text(uLink.Bytes()))
		return linkEnd, node
	}

	return linkEnd, nil
}

func isEndOfLink(char byte) bool {
	return isspace(char) || char == '<'
}

var validUris = [][]byte{[]byte("http://"), []byte("https://"), []byte("ftp://"), []byte("mailto://")}
var validPaths = [][]byte{[]byte("/"), []byte("./"), []byte("../")}

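// isSafeLink reports whether link starts with a known-safe URI scheme or a
// relative/absolute path prefix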
func isSafeLink(link []byte) bool {
	for _, path := range validPaths {
		if len(link) >= len(path) && bytes.Equal(link[:len(path)], path) {
			if len(link) == len(path) {
				return true
			} else if isalnum(link[len(path)]) {
				return true
			}
		}
	}

	for _, prefix := range validUris {
		// TODO: handle unicode here
		// case-insensitive prefix test
		if len(link) > len(prefix) && bytes.Equal(bytes.ToLower(link[:len(prefix)]), prefix) && isalnum(link[len(prefix)]) {
			return true
		}
	}

	return false
}

// return the length of the given tag, or 0 if it's not valid
func tagLength(data []byte) (autolink autolinkType, end int) {
	var i, j int

	// a valid tag can't be shorter than 3 chars
	if len(data) < 3 {
		return notAutolink, 0
	}

	// begins with a '<' optionally followed by '/', followed by letter or number
	if data[0] != '<' {
		return notAutolink, 0
	}
	if data[1] == '/' {
		i = 2
	} else {
		i = 1
	}

	if !isalnum(data[i]) {
		return notAutolink, 0
	}

	// scheme test
	autolink = notAutolink

	// try to find the beginning of a URI
	for i < len(data) && (isalnum(data[i]) || data[i] == '.' || data[i] == '+' || data[i] == '-') {
		i++
	}

	if i > 1 && i < len(data) && data[i] == '@' {
		if j = isMailtoAutoLink(data[i:]); j != 0 {
			return emailAutolink, i + j
		}
	}

	if i > 2 && i < len(data) && data[i] == ':' {
		autolink = normalAutolink
		i++
	}

	// complete autolink test: no whitespace or ' or "
	switch {
	case i >= len(data):
		autolink = notAutolink
	case autolink != notAutolink:
		j = i

		for i < len(data) {
			if data[i] == '\\' {
				i += 2
			} else if data[i] == '>' || data[i] == '\'' || data[i] == '"' || isspace(data[i]) {
				break
			} else {
				i++
			}

		}

		if i >= len(data) {
			return autolink, 0
		}
		if i > j && data[i] == '>' {
			return autolink, i + 1
		}

		// one of the forbidden chars has been found
		autolink = notAutolink
	}
	i += bytes.IndexByte(data[i:], '>')
	if i < 0 {
		return autolink, 0
	}
	return autolink, i + 1
}

// look for the address part of a mail autolink and '>'
// this is less strict than the original markdown e-mail address matching
func isMailtoAutoLink(data []byte) int {
	nb := 0

	// address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@'
	for i := 0; i < len(data); i++ {
		if isalnum(data[i]) {
			continue
		}

		switch data[i] {
		case '@':
			nb++

		case '-', '.', '_':
			break

		case '>':
			if nb == 1 {
				return i + 1
			}
			return 0
		default:
			return 0
		}
	}

	return 0
}

// look for the next emph char, skipping other constructs
func helperFindEmphChar(data []byte, c byte) int {
	i := 0

	for i < len(data) {
		for i < len(data) && data[i] != c && data[i] != '`' && data[i] != '[' {
			i++
		}
		if i >= len(data) {
			return 0
		}
		// do not count escaped chars
		if i != 0 && data[i-1] == '\\' {
			i++
			continue
		}
		if data[i] == c {
			return i
		}

		if data[i] == '`' {
			// skip a code span
			tmpI := 0
			i++
			for i < len(data) && data[i] != '`' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		} else if data[i] == '[' {
			// skip a link
			tmpI := 0
			i++
			for i < len(data) && data[i] != ']' {
				if tmpI == 0 && data[i] == c {
					tmpI = i
				}
				i++
			}
			i++
			for i < len(data) && (data[i] == ' ' || data[i] == '\n') {
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			if data[i] != '[' && data[i] != '(' { // not a link
				if tmpI > 0 {
					return tmpI
				}
				continue
			}
			cc := data[i]
			i++
			for i < len(data) && data[i] != cc {
				if tmpI == 0 && data[i] == c {
					return i
				}
				i++
			}
			if i >= len(data) {
				return tmpI
			}
			i++
		}
	}
	return 0
}

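// helperEmphasis parses single-delimiter emphasis (e.g. *foo* or _foo_) once
// the opening delimiter c has been consumed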
func helperEmphasis(p *Markdown, data []byte, c byte) (int, *Node) {
	i := 0

	// skip one symbol if coming from emph3
	if len(data) > 1 && data[0] == c && data[1] == c {
		i = 1
	}

	for i < len(data) {
		length := helperFindEmphChar(data[i:], c)
		if length == 0 {
			return 0, nil
		}
		i += length
		if i >= len(data) {
			return 0, nil
		}

		if i+1 < len(data) && data[i+1] == c {
			i++
			continue
		}

		if data[i] == c && !isspace(data[i-1]) {

			if p.extensions&NoIntraEmphasis != 0 {
				if !(i+1 == len(data) || isspace(data[i+1]) || ispunct(data[i+1])) {
					continue
				}
			}

			emph := NewNode(Emph)
			p.inline(emph, data[:i])
			return i + 1, emph
		}
	}

	return 0, nil
}

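// helperDoubleEmphasis parses double-delimiter emphasis (**strong**, __strong__
// or ~~strikethrough~~) once the two opening delimiters have been consumed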
func helperDoubleEmphasis(p *Markdown, data []byte, c byte) (int, *Node) {
	i := 0

	for i < len(data) {
		length := helperFindEmphChar(data[i:], c)
		if length == 0 {
			return 0, nil
		}
		i += length

		if i+1 < len(data) && data[i] == c && data[i+1] == c && i > 0 && !isspace(data[i-1]) {
			nodeType := Strong
			if c == '~' {
				nodeType = Del
			}
			node := NewNode(nodeType)
			p.inline(node, data[:i])
			return i + 2, node
		}
		i++
	}
	return 0, nil
}

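// helperTripleEmphasis parses triple-delimiter emphasis (e.g. ***both***);
// data still contains the opening delimiters and offset points just past them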
func helperTripleEmphasis(p *Markdown, data []byte, offset int, c byte) (int, *Node) {
	i := 0
	origData := data
	data = data[offset:]

	for i < len(data) {
		length := helperFindEmphChar(data[i:], c)
		if length == 0 {
			return 0, nil
		}
		i += length

		// skip symbols preceded by whitespace
		if data[i] != c || isspace(data[i-1]) {
			continue
		}

		switch {
		case i+2 < len(data) && data[i+1] == c && data[i+2] == c:
			// triple symbol found
			strong := NewNode(Strong)
			em := NewNode(Emph)
			strong.AppendChild(em)
			p.inline(em, data[:i])
			return i + 3, strong
		case (i+1 < len(data) && data[i+1] == c):
			// double symbol found, hand over to emph1
			length, node := helperEmphasis(p, origData[offset-2:], c)
			if length == 0 {
				return 0, nil
			}
			return length - 2, node
		default:
			// single symbol found, hand over to emph2
			length, node := helperDoubleEmphasis(p, origData[offset-1:], c)
			if length == 0 {
				return 0, nil
			}
			return length - 1, node
		}
	}
	return 0, nil
}

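// text creates a Text node holding the given literal bytes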
func text(s []byte) *Node {
	node := NewNode(Text)
	node.Literal = s
	return node
}

func normalizeURI(s []byte) []byte {
	return s // TODO: implement
}