html.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// HTML rendering backend
13//
14//
15
16package blackfriday
17
18import (
19 "bytes"
20 "fmt"
21 "regexp"
22 "strconv"
23 "strings"
24)
25
// Html renderer configuration options.
//
// Every option occupies its own bit, so several options can be ORed together
// and passed as the flags argument of HtmlRenderer.
const (
	// HTML_SKIP_HTML skips preformatted HTML blocks.
	HTML_SKIP_HTML = 1 << iota
	// HTML_SKIP_STYLE skips embedded <style> elements.
	HTML_SKIP_STYLE
	// HTML_SKIP_IMAGES skips embedded images.
	HTML_SKIP_IMAGES
	// HTML_SKIP_LINKS skips all links.
	HTML_SKIP_LINKS
	// HTML_SANITIZE_OUTPUT strips output of everything that's not known to be safe.
	HTML_SANITIZE_OUTPUT
	// HTML_SAFELINK only links to trusted protocols.
	HTML_SAFELINK
	// HTML_NOFOLLOW_LINKS only links with rel="nofollow".
	HTML_NOFOLLOW_LINKS
	// HTML_TOC generates a table of contents.
	HTML_TOC
	// HTML_OMIT_CONTENTS skips the main contents (for a standalone table of contents).
	HTML_OMIT_CONTENTS
	// HTML_COMPLETE_PAGE generates a complete HTML page.
	HTML_COMPLETE_PAGE
	// HTML_GITHUB_BLOCKCODE uses github fenced code rendering rules.
	HTML_GITHUB_BLOCKCODE
	// HTML_USE_XHTML generates XHTML output instead of HTML.
	HTML_USE_XHTML
	// HTML_USE_SMARTYPANTS enables smart punctuation substitutions.
	HTML_USE_SMARTYPANTS
	// HTML_SMARTYPANTS_FRACTIONS enables smart fractions (with HTML_USE_SMARTYPANTS).
	HTML_SMARTYPANTS_FRACTIONS
	// HTML_SMARTYPANTS_LATEX_DASHES enables LaTeX-style dashes (with HTML_USE_SMARTYPANTS).
	HTML_SMARTYPANTS_LATEX_DASHES
)
44
var (
	// tags lists the element names that the sanitizer's tag whitelist lets
	// through (optionally with an align attribute, see tagWhitelist).
	tags = []string{
		"b",
		"blockquote",
		"code",
		"del",
		"dd",
		"dl",
		"dt",
		"em",
		"h1",
		"h2",
		"h3",
		"h4",
		"h5",
		"h6",
		"i",
		"kbd",
		"li",
		"ol",
		"p",
		"pre",
		"s",
		"sup",
		"sub",
		"strong",
		"strike",
		"ul",
		"table",
		"tr",
		"td",
		"th",
		"thead",
		"tbody",
	}

	// alignments lists the values accepted for the align attribute.
	alignments = []string{
		"left",
		"right",
		"center",
	}

	// urlRe is the pattern fragment for URLs accepted in whitelisted <a> and
	// <img> tags: http(s)/ftp URLs or paths starting with a slash.
	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`

	// tagWhitelist matches a complete opening or closing tag from tags
	// (optionally carrying an align attribute), or a <br>/<hr> tag.
	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)(\salign="(` + strings.Join(alignments, "|") + `)")?>|<(br|hr)\s?\/?>)$`)

	// anchorClean matches a complete <a href="..."> tag with an optional
	// title attribute, or a closing </a>.
	anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)

	// imgClean matches a complete <img> tag with src and optional width,
	// height, alt and title attributes, in that order.
	imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)

	// htmlEntity matches an HTML character reference:
	//   - a named reference: a letter followed by 1-31 letters or digits
	//     (covers e.g. &lt;, &hellip;, &frac12;, &Eacute;),
	//   - a decimal reference such as &#39; (up to 7 digits), or
	//   - a hexadecimal reference such as &#x27; (up to 6 hex digits).
	// Everything the previous pattern `&[a-z]{2,5};` matched is still matched.
	htmlEntity = regexp.MustCompile(`&(?:[A-Za-z][A-Za-z0-9]{1,31}|#[0-9]{1,7}|#[xX][0-9A-Fa-f]{1,6});`)
)
95
96// Html is a type that implements the Renderer interface for HTML output.
97//
98// Do not create this directly, instead use the HtmlRenderer function.
99type Html struct {
100 flags int // HTML_* options
101 closeTag string // how to end singleton tags: either " />\n" or ">\n"
102 title string // document title
103 css string // optional css file url (used with HTML_COMPLETE_PAGE)
104
105 // table of contents data
106 tocMarker int
107 headerCount int
108 currentLevel int
109 toc *bytes.Buffer
110
111 smartypants *smartypantsRenderer
112}
113
// Terminators for singleton tags such as <br> and <hr>; HtmlRenderer picks
// one of them depending on HTML_USE_XHTML.
const (
	// xhtmlClose ends a singleton tag in XHTML style.
	xhtmlClose = " />\n"
	// htmlClose ends a singleton tag in plain HTML style.
	htmlClose = ">\n"
)
118
119// HtmlRenderer creates and configures an Html object, which
120// satisfies the Renderer interface.
121//
122// flags is a set of HTML_* options ORed together.
123// title is the title of the document, and css is a URL for the document's
124// stylesheet.
125// title and css are only used when HTML_COMPLETE_PAGE is selected.
126func HtmlRenderer(flags int, title string, css string) Renderer {
127 // configure the rendering engine
128 closeTag := htmlClose
129 if flags&HTML_USE_XHTML != 0 {
130 closeTag = xhtmlClose
131 }
132
133 return &Html{
134 flags: flags,
135 closeTag: closeTag,
136 title: title,
137 css: css,
138
139 headerCount: 0,
140 currentLevel: 0,
141 toc: new(bytes.Buffer),
142
143 smartypants: smartypants(flags),
144 }
145}
146
// escapeSingleChar returns the HTML entity that must replace char inside
// text or attribute values, and true, when char is one of `"`, `&`, `<` or
// `>`. For every other byte it returns "" and false.
//
// Using if statements is a bit faster than a switch statement. As the
// compiler improves, this should become unnecessary; it is only worthwhile
// because attrEscape is the single largest CPU user in normal use.
// Also tried using map, but that gave a ~3x slowdown.
func escapeSingleChar(char byte) (string, bool) {
	if char == '"' {
		return "&quot;", true
	}
	if char == '&' {
		return "&amp;", true
	}
	if char == '<' {
		return "&lt;", true
	}
	if char == '>' {
		return "&gt;", true
	}
	return "", false
}
166
167func attrEscape(out *bytes.Buffer, src []byte) {
168 org := 0
169 for i, ch := range src {
170 if entity, ok := escapeSingleChar(ch); ok {
171 if i > org {
172 // copy all the normal characters since the last escape
173 out.Write(src[org:i])
174 }
175 org = i + 1
176 out.WriteString(entity)
177 }
178 }
179 if org < len(src) {
180 out.Write(src[org:])
181 }
182}
183
184func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) {
185 end := 0
186 for _, rang := range skipRanges {
187 attrEscape(out, src[end:rang[0]])
188 out.Write(src[rang[0]:rang[1]])
189 end = rang[1]
190 }
191 attrEscape(out, src[end:])
192}
193
194func (options *Html) GetFlags() int {
195 return options.flags
196}
197
198func (options *Html) Header(out *bytes.Buffer, text func() bool, level int, id string) {
199 marker := out.Len()
200 doubleSpace(out)
201
202 if id != "" {
203 out.WriteString(fmt.Sprintf("<h%d id=\"%s\">", level, id))
204 } else if options.flags&HTML_TOC != 0 {
205 // headerCount is incremented in htmlTocHeader
206 out.WriteString(fmt.Sprintf("<h%d id=\"toc_%d\">", level, options.headerCount))
207 } else {
208 out.WriteString(fmt.Sprintf("<h%d>", level))
209 }
210
211 tocMarker := out.Len()
212 if !text() {
213 out.Truncate(marker)
214 return
215 }
216
217 // are we building a table of contents?
218 if options.flags&HTML_TOC != 0 {
219 options.TocHeader(out.Bytes()[tocMarker:], level)
220 }
221
222 out.WriteString(fmt.Sprintf("</h%d>\n", level))
223}
224
225func (options *Html) BlockHtml(out *bytes.Buffer, text []byte) {
226 if options.flags&HTML_SKIP_HTML != 0 {
227 return
228 }
229
230 doubleSpace(out)
231 out.Write(text)
232 out.WriteByte('\n')
233}
234
235func (options *Html) HRule(out *bytes.Buffer) {
236 doubleSpace(out)
237 out.WriteString("<hr")
238 out.WriteString(options.closeTag)
239}
240
241func (options *Html) BlockCode(out *bytes.Buffer, text []byte, lang string) {
242 if options.flags&HTML_GITHUB_BLOCKCODE != 0 {
243 options.BlockCodeGithub(out, text, lang)
244 } else {
245 options.BlockCodeNormal(out, text, lang)
246 }
247}
248
249func (options *Html) BlockCodeNormal(out *bytes.Buffer, text []byte, lang string) {
250 doubleSpace(out)
251
252 // parse out the language names/classes
253 count := 0
254 for _, elt := range strings.Fields(lang) {
255 if elt[0] == '.' {
256 elt = elt[1:]
257 }
258 if len(elt) == 0 {
259 continue
260 }
261 if count == 0 {
262 out.WriteString("<pre><code class=\"")
263 } else {
264 out.WriteByte(' ')
265 }
266 attrEscape(out, []byte(elt))
267 count++
268 }
269
270 if count == 0 {
271 out.WriteString("<pre><code>")
272 } else {
273 out.WriteString("\">")
274 }
275
276 attrEscape(out, text)
277 out.WriteString("</code></pre>\n")
278}
279
280// GitHub style code block:
281//
282// <pre lang="LANG"><code>
283// ...
284// </code></pre>
285//
286// Unlike other parsers, we store the language identifier in the <pre>,
287// and don't let the user generate custom classes.
288//
289// The language identifier in the <pre> block gets postprocessed and all
290// the code inside gets syntax highlighted with Pygments. This is much safer
291// than letting the user specify a CSS class for highlighting.
292//
293// Note that we only generate HTML for the first specifier.
294// E.g.
295// ~~~~ {.python .numbered} => <pre lang="python"><code>
296func (options *Html) BlockCodeGithub(out *bytes.Buffer, text []byte, lang string) {
297 doubleSpace(out)
298
299 // parse out the language name
300 count := 0
301 for _, elt := range strings.Fields(lang) {
302 if elt[0] == '.' {
303 elt = elt[1:]
304 }
305 if len(elt) == 0 {
306 continue
307 }
308 out.WriteString("<pre lang=\"")
309 attrEscape(out, []byte(elt))
310 out.WriteString("\"><code>")
311 count++
312 break
313 }
314
315 if count == 0 {
316 out.WriteString("<pre><code>")
317 }
318
319 attrEscape(out, text)
320 out.WriteString("</code></pre>\n")
321}
322
323func (options *Html) BlockQuote(out *bytes.Buffer, text []byte) {
324 doubleSpace(out)
325 out.WriteString("<blockquote>\n")
326 out.Write(text)
327 out.WriteString("</blockquote>\n")
328}
329
330func (options *Html) Table(out *bytes.Buffer, header []byte, body []byte, columnData []int) {
331 doubleSpace(out)
332 out.WriteString("<table>\n<thead>\n")
333 out.Write(header)
334 out.WriteString("</thead>\n\n<tbody>\n")
335 out.Write(body)
336 out.WriteString("</tbody>\n</table>\n")
337}
338
339func (options *Html) TableRow(out *bytes.Buffer, text []byte) {
340 doubleSpace(out)
341 out.WriteString("<tr>\n")
342 out.Write(text)
343 out.WriteString("\n</tr>\n")
344}
345
346func (options *Html) TableHeaderCell(out *bytes.Buffer, text []byte, align int) {
347 doubleSpace(out)
348 switch align {
349 case TABLE_ALIGNMENT_LEFT:
350 out.WriteString("<th align=\"left\">")
351 case TABLE_ALIGNMENT_RIGHT:
352 out.WriteString("<th align=\"right\">")
353 case TABLE_ALIGNMENT_CENTER:
354 out.WriteString("<th align=\"center\">")
355 default:
356 out.WriteString("<th>")
357 }
358
359 out.Write(text)
360 out.WriteString("</th>")
361}
362
363func (options *Html) TableCell(out *bytes.Buffer, text []byte, align int) {
364 doubleSpace(out)
365 switch align {
366 case TABLE_ALIGNMENT_LEFT:
367 out.WriteString("<td align=\"left\">")
368 case TABLE_ALIGNMENT_RIGHT:
369 out.WriteString("<td align=\"right\">")
370 case TABLE_ALIGNMENT_CENTER:
371 out.WriteString("<td align=\"center\">")
372 default:
373 out.WriteString("<td>")
374 }
375
376 out.Write(text)
377 out.WriteString("</td>")
378}
379
380func (options *Html) Footnotes(out *bytes.Buffer, text func() bool) {
381 out.WriteString("<div class=\"footnotes\">\n")
382 options.HRule(out)
383 options.List(out, text, LIST_TYPE_ORDERED)
384 out.WriteString("</div>\n")
385}
386
387func (options *Html) FootnoteItem(out *bytes.Buffer, name, text []byte, flags int) {
388 if flags&LIST_ITEM_CONTAINS_BLOCK != 0 || flags&LIST_ITEM_BEGINNING_OF_LIST != 0 {
389 doubleSpace(out)
390 }
391 out.WriteString(`<li id="fn:`)
392 out.Write(slugify(name))
393 out.WriteString(`">`)
394 out.Write(text)
395 out.WriteString("</li>\n")
396}
397
398func (options *Html) List(out *bytes.Buffer, text func() bool, flags int) {
399 marker := out.Len()
400 doubleSpace(out)
401
402 if flags&LIST_TYPE_ORDERED != 0 {
403 out.WriteString("<ol>")
404 } else {
405 out.WriteString("<ul>")
406 }
407 if !text() {
408 out.Truncate(marker)
409 return
410 }
411 if flags&LIST_TYPE_ORDERED != 0 {
412 out.WriteString("</ol>\n")
413 } else {
414 out.WriteString("</ul>\n")
415 }
416}
417
418func (options *Html) ListItem(out *bytes.Buffer, text []byte, flags int) {
419 if flags&LIST_ITEM_CONTAINS_BLOCK != 0 || flags&LIST_ITEM_BEGINNING_OF_LIST != 0 {
420 doubleSpace(out)
421 }
422 out.WriteString("<li>")
423 out.Write(text)
424 out.WriteString("</li>\n")
425}
426
427func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) {
428 marker := out.Len()
429 doubleSpace(out)
430
431 out.WriteString("<p>")
432 if !text() {
433 out.Truncate(marker)
434 return
435 }
436 out.WriteString("</p>\n")
437}
438
439func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
440 skipRanges := htmlEntity.FindAllIndex(link, -1)
441 if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL {
442 // mark it but don't link it if it is not a safe link: no smartypants
443 out.WriteString("<tt>")
444 entityEscapeWithSkip(out, link, skipRanges)
445 out.WriteString("</tt>")
446 return
447 }
448
449 out.WriteString("<a href=\"")
450 if kind == LINK_TYPE_EMAIL {
451 out.WriteString("mailto:")
452 }
453 entityEscapeWithSkip(out, link, skipRanges)
454 out.WriteString("\">")
455
456 // Pretty print: if we get an email address as
457 // an actual URI, e.g. `mailto:foo@bar.com`, we don't
458 // want to print the `mailto:` prefix
459 switch {
460 case bytes.HasPrefix(link, []byte("mailto://")):
461 attrEscape(out, link[len("mailto://"):])
462 case bytes.HasPrefix(link, []byte("mailto:")):
463 attrEscape(out, link[len("mailto:"):])
464 default:
465 entityEscapeWithSkip(out, link, skipRanges)
466 }
467
468 out.WriteString("</a>")
469}
470
471func (options *Html) CodeSpan(out *bytes.Buffer, text []byte) {
472 out.WriteString("<code>")
473 attrEscape(out, text)
474 out.WriteString("</code>")
475}
476
477func (options *Html) DoubleEmphasis(out *bytes.Buffer, text []byte) {
478 out.WriteString("<strong>")
479 out.Write(text)
480 out.WriteString("</strong>")
481}
482
483func (options *Html) Emphasis(out *bytes.Buffer, text []byte) {
484 if len(text) == 0 {
485 return
486 }
487 out.WriteString("<em>")
488 out.Write(text)
489 out.WriteString("</em>")
490}
491
492func (options *Html) Image(out *bytes.Buffer, link []byte, title []byte, alt []byte) {
493 if options.flags&HTML_SKIP_IMAGES != 0 {
494 return
495 }
496
497 out.WriteString("<img src=\"")
498 attrEscape(out, link)
499 out.WriteString("\" alt=\"")
500 if len(alt) > 0 {
501 attrEscape(out, alt)
502 }
503 if len(title) > 0 {
504 out.WriteString("\" title=\"")
505 attrEscape(out, title)
506 }
507
508 out.WriteByte('"')
509 out.WriteString(options.closeTag)
510 return
511}
512
513func (options *Html) LineBreak(out *bytes.Buffer) {
514 out.WriteString("<br")
515 out.WriteString(options.closeTag)
516}
517
518func (options *Html) Link(out *bytes.Buffer, link []byte, title []byte, content []byte) {
519 if options.flags&HTML_SKIP_LINKS != 0 {
520 // write the link text out but don't link it, just mark it with typewriter font
521 out.WriteString("<tt>")
522 attrEscape(out, content)
523 out.WriteString("</tt>")
524 return
525 }
526
527 if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) {
528 // write the link text out but don't link it, just mark it with typewriter font
529 out.WriteString("<tt>")
530 attrEscape(out, content)
531 out.WriteString("</tt>")
532 return
533 }
534
535 out.WriteString("<a href=\"")
536 attrEscape(out, link)
537 if len(title) > 0 {
538 out.WriteString("\" title=\"")
539 attrEscape(out, title)
540 }
541 if options.flags&HTML_NOFOLLOW_LINKS != 0 {
542 out.WriteString("\" rel=\"nofollow")
543 }
544 out.WriteString("\">")
545 out.Write(content)
546 out.WriteString("</a>")
547 return
548}
549
550func (options *Html) RawHtmlTag(out *bytes.Buffer, text []byte) {
551 if options.flags&HTML_SKIP_HTML != 0 {
552 return
553 }
554 if options.flags&HTML_SKIP_STYLE != 0 && isHtmlTag(text, "style") {
555 return
556 }
557 if options.flags&HTML_SKIP_LINKS != 0 && isHtmlTag(text, "a") {
558 return
559 }
560 if options.flags&HTML_SKIP_IMAGES != 0 && isHtmlTag(text, "img") {
561 return
562 }
563 out.Write(text)
564}
565
566func (options *Html) TripleEmphasis(out *bytes.Buffer, text []byte) {
567 out.WriteString("<strong><em>")
568 out.Write(text)
569 out.WriteString("</em></strong>")
570}
571
572func (options *Html) StrikeThrough(out *bytes.Buffer, text []byte) {
573 out.WriteString("<del>")
574 out.Write(text)
575 out.WriteString("</del>")
576}
577
578func (options *Html) FootnoteRef(out *bytes.Buffer, ref []byte, id int) {
579 slug := slugify(ref)
580 out.WriteString(`<sup class="footnote-ref" id="fnref:`)
581 out.Write(slug)
582 out.WriteString(`"><a rel="footnote" href="#fn:`)
583 out.Write(slug)
584 out.WriteString(`">`)
585 out.WriteString(strconv.Itoa(id))
586 out.WriteString(`</a></sup>`)
587}
588
589func (options *Html) Entity(out *bytes.Buffer, entity []byte) {
590 out.Write(entity)
591}
592
593func (options *Html) NormalText(out *bytes.Buffer, text []byte) {
594 if options.flags&HTML_USE_SMARTYPANTS != 0 {
595 options.Smartypants(out, text)
596 } else {
597 attrEscape(out, text)
598 }
599}
600
601func (options *Html) Smartypants(out *bytes.Buffer, text []byte) {
602 smrt := smartypantsData{false, false}
603
604 // first do normal entity escaping
605 var escaped bytes.Buffer
606 attrEscape(&escaped, text)
607 text = escaped.Bytes()
608
609 mark := 0
610 for i := 0; i < len(text); i++ {
611 if action := options.smartypants[text[i]]; action != nil {
612 if i > mark {
613 out.Write(text[mark:i])
614 }
615
616 previousChar := byte(0)
617 if i > 0 {
618 previousChar = text[i-1]
619 }
620 i += action(out, &smrt, previousChar, text[i:])
621 mark = i + 1
622 }
623 }
624
625 if mark < len(text) {
626 out.Write(text[mark:])
627 }
628}
629
630func (options *Html) DocumentHeader(out *bytes.Buffer) {
631 if options.flags&HTML_COMPLETE_PAGE == 0 {
632 return
633 }
634
635 ending := ""
636 if options.flags&HTML_USE_XHTML != 0 {
637 out.WriteString("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" ")
638 out.WriteString("\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n")
639 out.WriteString("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n")
640 ending = " /"
641 } else {
642 out.WriteString("<!DOCTYPE html>\n")
643 out.WriteString("<html>\n")
644 }
645 out.WriteString("<head>\n")
646 out.WriteString(" <title>")
647 options.NormalText(out, []byte(options.title))
648 out.WriteString("</title>\n")
649 out.WriteString(" <meta name=\"GENERATOR\" content=\"Blackfriday Markdown Processor v")
650 out.WriteString(VERSION)
651 out.WriteString("\"")
652 out.WriteString(ending)
653 out.WriteString(">\n")
654 out.WriteString(" <meta charset=\"utf-8\"")
655 out.WriteString(ending)
656 out.WriteString(">\n")
657 if options.css != "" {
658 out.WriteString(" <link rel=\"stylesheet\" type=\"text/css\" href=\"")
659 attrEscape(out, []byte(options.css))
660 out.WriteString("\"")
661 out.WriteString(ending)
662 out.WriteString(">\n")
663 }
664 out.WriteString("</head>\n")
665 out.WriteString("<body>\n")
666
667 options.tocMarker = out.Len()
668}
669
670func (options *Html) DocumentFooter(out *bytes.Buffer) {
671 // finalize and insert the table of contents
672 if options.flags&HTML_TOC != 0 {
673 options.TocFinalize()
674
675 // now we have to insert the table of contents into the document
676 var temp bytes.Buffer
677
678 // start by making a copy of everything after the document header
679 temp.Write(out.Bytes()[options.tocMarker:])
680
681 // now clear the copied material from the main output buffer
682 out.Truncate(options.tocMarker)
683
684 // corner case spacing issue
685 if options.flags&HTML_COMPLETE_PAGE != 0 {
686 out.WriteByte('\n')
687 }
688
689 // insert the table of contents
690 out.WriteString("<nav>\n")
691 out.Write(options.toc.Bytes())
692 out.WriteString("</nav>\n")
693
694 // corner case spacing issue
695 if options.flags&HTML_COMPLETE_PAGE == 0 && options.flags&HTML_OMIT_CONTENTS == 0 {
696 out.WriteByte('\n')
697 }
698
699 // write out everything that came after it
700 if options.flags&HTML_OMIT_CONTENTS == 0 {
701 out.Write(temp.Bytes())
702 }
703 }
704
705 if options.flags&HTML_COMPLETE_PAGE != 0 {
706 out.WriteString("\n</body>\n")
707 out.WriteString("</html>\n")
708 }
709
710}
711
712func (options *Html) TocHeader(text []byte, level int) {
713 for level > options.currentLevel {
714 switch {
715 case bytes.HasSuffix(options.toc.Bytes(), []byte("</li>\n")):
716 // this sublist can nest underneath a header
717 size := options.toc.Len()
718 options.toc.Truncate(size - len("</li>\n"))
719
720 case options.currentLevel > 0:
721 options.toc.WriteString("<li>")
722 }
723 if options.toc.Len() > 0 {
724 options.toc.WriteByte('\n')
725 }
726 options.toc.WriteString("<ul>\n")
727 options.currentLevel++
728 }
729
730 for level < options.currentLevel {
731 options.toc.WriteString("</ul>")
732 if options.currentLevel > 1 {
733 options.toc.WriteString("</li>\n")
734 }
735 options.currentLevel--
736 }
737
738 options.toc.WriteString("<li><a href=\"#toc_")
739 options.toc.WriteString(strconv.Itoa(options.headerCount))
740 options.toc.WriteString("\">")
741 options.headerCount++
742
743 options.toc.Write(text)
744
745 options.toc.WriteString("</a></li>\n")
746}
747
748func (options *Html) TocFinalize() {
749 for options.currentLevel > 1 {
750 options.toc.WriteString("</ul></li>\n")
751 options.currentLevel--
752 }
753
754 if options.currentLevel > 0 {
755 options.toc.WriteString("</ul>\n")
756 }
757}
758
759func isHtmlTag(tag []byte, tagname string) bool {
760 found, _ := findHtmlTagPos(tag, tagname)
761 return found
762}
763
// skipUntilCharIgnoreQuotes looks for char in html, beginning at index
// start, but ignores it when it's in any kind of quotes (single, double or
// backtick), since it might be JavaScript.
//
// A quoted section extends from its opening quote to the next occurrence of
// the same quote character; other quote characters inside it are ordinary
// content, so `title="it's"` is a single quoted section.
//
// It returns the index of the first unquoted occurrence of char. When there
// is none it returns start, so callers must treat a result equal to start as
// ambiguous (either found right at start or not found at all).
func skipUntilCharIgnoreQuotes(html []byte, start int, char byte) int {
	var quote byte // the quote character currently open, 0 when outside quotes
	for i := start; i < len(html); i++ {
		c := html[i]
		switch {
		case quote != 0:
			// inside quotes only the matching quote is significant
			if c == quote {
				quote = 0
			}
		case c == char:
			return i
		case c == '\'' || c == '"' || c == '`':
			quote = c
		}
	}
	return start
}
786
787func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
788 i := 0
789 if i < len(tag) && tag[0] != '<' {
790 return false, -1
791 }
792 i++
793 i = skipSpace(tag, i)
794
795 if i < len(tag) && tag[i] == '/' {
796 i++
797 }
798
799 i = skipSpace(tag, i)
800 j := 0
801 for ; i < len(tag); i, j = i+1, j+1 {
802 if j >= len(tagname) {
803 break
804 }
805
806 if strings.ToLower(string(tag[i]))[0] != tagname[j] {
807 return false, -1
808 }
809 }
810
811 if i == len(tag) {
812 return false, -1
813 }
814
815 rightAngle := skipUntilCharIgnoreQuotes(tag, i, '>')
816 if rightAngle > i {
817 return true, rightAngle
818 }
819
820 return false, -1
821}
822
823func sanitizeHtml(html []byte) []byte {
824 var result []byte
825 for string(html) != "" {
826 skip, tag, rest := findHtmlTag(html)
827 html = rest
828 result = append(result, skip...)
829 result = append(result, sanitizeTag(tag)...)
830 }
831 return append(result, []byte("\n")...)
832}
833
834func sanitizeTag(tag []byte) []byte {
835 if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
836 return tag
837 }
838 return []byte("")
839}
840
// skipUntilChar returns the index of the first occurrence of char in text
// at or after start. When char does not occur, it returns len(text), or
// start itself if start already lies beyond the end of text.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for ; pos < len(text); pos++ {
		if text[pos] == char {
			break
		}
	}
	return pos
}
848
849func findHtmlTag(html []byte) (skip, tag, rest []byte) {
850 start := skipUntilChar(html, 0, '<')
851 rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
852 if rightAngle > start {
853 skip = html[0:start]
854 tag = html[start : rightAngle+1]
855 rest = html[rightAngle+1:]
856 return
857 }
858
859 return []byte(""), []byte(""), []byte("")
860}
861
862func skipSpace(tag []byte, i int) int {
863 for i < len(tag) && isspace(tag[i]) {
864 i++
865 }
866 return i
867}
868
// doubleSpace appends a newline to out unless out is still empty, which
// separates the next block from the output written so far.
func doubleSpace(out *bytes.Buffer) {
	if out.Len() == 0 {
		return
	}
	out.WriteByte('\n')
}
873}