html.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// HTML rendering backend
13//
14//
15
16package blackfriday
17
18import (
19 "bytes"
20 "fmt"
21 "regexp"
22 "strconv"
23 "strings"
24)
25
// Html renderer configuration options.
//
// Each option occupies its own bit (1 << iota), so a set of options is built
// by ORing them together; HtmlRenderer takes such a set as its flags argument.
const (
	HTML_SKIP_HTML                = 1 << iota // skip preformatted HTML blocks
	HTML_SKIP_STYLE                           // skip embedded <style> elements
	HTML_SKIP_IMAGES                          // skip embedded images
	HTML_SKIP_LINKS                           // skip all links
	HTML_SANITIZE_OUTPUT                      // strip output of everything that's not known to be safe
	HTML_SAFELINK                             // only link to trusted protocols
	HTML_NOFOLLOW_LINKS                       // only link with rel="nofollow"
	HTML_TOC                                  // generate a table of contents
	HTML_OMIT_CONTENTS                        // skip the main contents (for a standalone table of contents)
	HTML_COMPLETE_PAGE                        // generate a complete HTML page
	HTML_GITHUB_BLOCKCODE                     // use github fenced code rendering rules
	HTML_USE_XHTML                            // generate XHTML output instead of HTML
	HTML_USE_SMARTYPANTS                      // enable smart punctuation substitutions
	HTML_SMARTYPANTS_FRACTIONS                // enable smart fractions (with HTML_USE_SMARTYPANTS)
	HTML_SMARTYPANTS_LATEX_DASHES             // enable LaTeX-style dashes (with HTML_USE_SMARTYPANTS)
)
44
var (
	// tags lists the element names that tagWhitelist accepts as bare opening
	// or closing tags (no attributes).
	tags = []string{
		"b",
		"blockquote",
		"code",
		"del",
		"dd",
		"dl",
		"dt",
		"em",
		"h1",
		"h2",
		"h3",
		"h4",
		"h5",
		"h6",
		"i",
		"kbd",
		"li",
		"ol",
		"p",
		"pre",
		"s",
		"sup",
		"sub",
		"strong",
		"strike",
		"ul",
	}

	// urlRe is the regexp source for an acceptable URL: http, https or ftp,
	// or a path starting with a slash.
	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`

	// tagWhitelist matches a whole tag that is one of the attribute-free
	// tags above, or a <br>/<hr> singleton.
	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)

	// anchorClean matches a whole <a> tag carrying only an href (and an
	// optional title), or the closing </a>.
	anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)

	// imgClean matches a whole <img> tag carrying only src and, optionally
	// and in this order, width, height, alt and title.
	imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)

	// htmlEntity matches one character reference: a named entity (a letter
	// followed by letters or digits, e.g. &amp; &Aacute; &frac12;), a decimal
	// reference (&#39;) or a hexadecimal reference (&#x27;). The pattern is
	// purely syntactic; it does not check the name against the list of
	// entities defined by HTML.
	htmlEntity = regexp.MustCompile(`&(?:[A-Za-z][A-Za-z0-9]{1,31}|#[0-9]{1,7}|#[xX][0-9A-Fa-f]{1,6});`)
)
81
// Html is a type that implements the Renderer interface for HTML output.
//
// Do not create this directly, instead use the HtmlRenderer function.
type Html struct {
	flags    int    // HTML_* options
	closeTag string // how to end singleton tags: either " />\n" or ">\n"
	title    string // document title
	css      string // optional css file url (used with HTML_COMPLETE_PAGE)

	// table of contents data

	// tocMarker is the offset in the output buffer at which DocumentFooter
	// inserts the table of contents. DocumentHeader sets it to the end of the
	// page header; it stays zero when no page header is written.
	tocMarker int
	// headerCount is the number of entries added to the table of contents so
	// far; it is also the N used in generated "toc_N" anchors.
	headerCount int
	// currentLevel is the nesting depth of the <ul> lists currently open in toc.
	currentLevel int
	// toc accumulates the table of contents markup.
	toc *bytes.Buffer

	// smartypants holds the substitution callbacks built by smartypants(flags);
	// Smartypants indexes it by input byte.
	smartypants *smartypantsRenderer
}
99
// Endings for singleton tags; HtmlRenderer picks xhtmlClose when
// HTML_USE_XHTML is set and htmlClose otherwise.
const (
	xhtmlClose = " />\n"
	htmlClose  = ">\n"
)
104
105// HtmlRenderer creates and configures an Html object, which
106// satisfies the Renderer interface.
107//
108// flags is a set of HTML_* options ORed together.
109// title is the title of the document, and css is a URL for the document's
110// stylesheet.
111// title and css are only used when HTML_COMPLETE_PAGE is selected.
112func HtmlRenderer(flags int, title string, css string) Renderer {
113 // configure the rendering engine
114 closeTag := htmlClose
115 if flags&HTML_USE_XHTML != 0 {
116 closeTag = xhtmlClose
117 }
118
119 return &Html{
120 flags: flags,
121 closeTag: closeTag,
122 title: title,
123 css: css,
124
125 headerCount: 0,
126 currentLevel: 0,
127 toc: new(bytes.Buffer),
128
129 smartypants: smartypants(flags),
130 }
131}
132
// escapeSingleChar returns the HTML entity that replaces char and true when
// char is one of the four characters that must be escaped (", &, < and >).
// For every other byte it returns "" and false.
//
// Using if statements is a bit faster than a switch statement. As the
// compiler improves, this should become unnecessary; it is only worthwhile
// because attrEscape is the single largest CPU user in normal use.
// A map was also tried, but that gave a ~3x slowdown.
func escapeSingleChar(char byte) (string, bool) {
	if char == '"' {
		return "&quot;", true
	}
	if char == '&' {
		return "&amp;", true
	}
	if char == '<' {
		return "&lt;", true
	}
	if char == '>' {
		return "&gt;", true
	}
	return "", false
}
152
153func attrEscape(out *bytes.Buffer, src []byte) {
154 org := 0
155 for i, ch := range src {
156 if entity, ok := escapeSingleChar(ch); ok {
157 if i > org {
158 // copy all the normal characters since the last escape
159 out.Write(src[org:i])
160 }
161 org = i + 1
162 out.WriteString(entity)
163 }
164 }
165 if org < len(src) {
166 out.Write(src[org:])
167 }
168}
169
170func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) {
171 end := 0
172 for _, rang := range skipRanges {
173 attrEscape(out, src[end:rang[0]])
174 out.Write(src[rang[0]:rang[1]])
175 end = rang[1]
176 }
177 attrEscape(out, src[end:])
178}
179
// GetFlags returns the HTML_* option bits stored in the renderer.
func (options *Html) GetFlags() int {
	return options.flags
}
183
184func (options *Html) Header(out *bytes.Buffer, text func() bool, level int, id string) {
185 marker := out.Len()
186 doubleSpace(out)
187
188 if id != "" {
189 out.WriteString(fmt.Sprintf("<h%d id=\"%s\">", level, id))
190 } else if options.flags&HTML_TOC != 0 {
191 // headerCount is incremented in htmlTocHeader
192 out.WriteString(fmt.Sprintf("<h%d id=\"toc_%d\">", level, options.headerCount))
193 } else {
194 out.WriteString(fmt.Sprintf("<h%d>", level))
195 }
196
197 tocMarker := out.Len()
198 if !text() {
199 out.Truncate(marker)
200 return
201 }
202
203 // are we building a table of contents?
204 if options.flags&HTML_TOC != 0 {
205 options.TocHeader(out.Bytes()[tocMarker:], level)
206 }
207
208 out.WriteString(fmt.Sprintf("</h%d>\n", level))
209}
210
211func (options *Html) BlockHtml(out *bytes.Buffer, text []byte) {
212 if options.flags&HTML_SKIP_HTML != 0 {
213 return
214 }
215
216 doubleSpace(out)
217 out.Write(text)
218 out.WriteByte('\n')
219}
220
221func (options *Html) HRule(out *bytes.Buffer) {
222 doubleSpace(out)
223 out.WriteString("<hr")
224 out.WriteString(options.closeTag)
225}
226
227func (options *Html) BlockCode(out *bytes.Buffer, text []byte, lang string) {
228 if options.flags&HTML_GITHUB_BLOCKCODE != 0 {
229 options.BlockCodeGithub(out, text, lang)
230 } else {
231 options.BlockCodeNormal(out, text, lang)
232 }
233}
234
235func (options *Html) BlockCodeNormal(out *bytes.Buffer, text []byte, lang string) {
236 doubleSpace(out)
237
238 // parse out the language names/classes
239 count := 0
240 for _, elt := range strings.Fields(lang) {
241 if elt[0] == '.' {
242 elt = elt[1:]
243 }
244 if len(elt) == 0 {
245 continue
246 }
247 if count == 0 {
248 out.WriteString("<pre><code class=\"")
249 } else {
250 out.WriteByte(' ')
251 }
252 attrEscape(out, []byte(elt))
253 count++
254 }
255
256 if count == 0 {
257 out.WriteString("<pre><code>")
258 } else {
259 out.WriteString("\">")
260 }
261
262 attrEscape(out, text)
263 out.WriteString("</code></pre>\n")
264}
265
266// GitHub style code block:
267//
268// <pre lang="LANG"><code>
269// ...
270// </code></pre>
271//
272// Unlike other parsers, we store the language identifier in the <pre>,
273// and don't let the user generate custom classes.
274//
275// The language identifier in the <pre> block gets postprocessed and all
276// the code inside gets syntax highlighted with Pygments. This is much safer
277// than letting the user specify a CSS class for highlighting.
278//
279// Note that we only generate HTML for the first specifier.
280// E.g.
281// ~~~~ {.python .numbered} => <pre lang="python"><code>
282func (options *Html) BlockCodeGithub(out *bytes.Buffer, text []byte, lang string) {
283 doubleSpace(out)
284
285 // parse out the language name
286 count := 0
287 for _, elt := range strings.Fields(lang) {
288 if elt[0] == '.' {
289 elt = elt[1:]
290 }
291 if len(elt) == 0 {
292 continue
293 }
294 out.WriteString("<pre lang=\"")
295 attrEscape(out, []byte(elt))
296 out.WriteString("\"><code>")
297 count++
298 break
299 }
300
301 if count == 0 {
302 out.WriteString("<pre><code>")
303 }
304
305 attrEscape(out, text)
306 out.WriteString("</code></pre>\n")
307}
308
309func (options *Html) BlockQuote(out *bytes.Buffer, text []byte) {
310 doubleSpace(out)
311 out.WriteString("<blockquote>\n")
312 out.Write(text)
313 out.WriteString("</blockquote>\n")
314}
315
316func (options *Html) Table(out *bytes.Buffer, header []byte, body []byte, columnData []int) {
317 doubleSpace(out)
318 out.WriteString("<table>\n<thead>\n")
319 out.Write(header)
320 out.WriteString("</thead>\n\n<tbody>\n")
321 out.Write(body)
322 out.WriteString("</tbody>\n</table>\n")
323}
324
325func (options *Html) TableRow(out *bytes.Buffer, text []byte) {
326 doubleSpace(out)
327 out.WriteString("<tr>\n")
328 out.Write(text)
329 out.WriteString("\n</tr>\n")
330}
331
332func (options *Html) TableHeaderCell(out *bytes.Buffer, text []byte, align int) {
333 doubleSpace(out)
334 switch align {
335 case TABLE_ALIGNMENT_LEFT:
336 out.WriteString("<th align=\"left\">")
337 case TABLE_ALIGNMENT_RIGHT:
338 out.WriteString("<th align=\"right\">")
339 case TABLE_ALIGNMENT_CENTER:
340 out.WriteString("<th align=\"center\">")
341 default:
342 out.WriteString("<th>")
343 }
344
345 out.Write(text)
346 out.WriteString("</th>")
347}
348
349func (options *Html) TableCell(out *bytes.Buffer, text []byte, align int) {
350 doubleSpace(out)
351 switch align {
352 case TABLE_ALIGNMENT_LEFT:
353 out.WriteString("<td align=\"left\">")
354 case TABLE_ALIGNMENT_RIGHT:
355 out.WriteString("<td align=\"right\">")
356 case TABLE_ALIGNMENT_CENTER:
357 out.WriteString("<td align=\"center\">")
358 default:
359 out.WriteString("<td>")
360 }
361
362 out.Write(text)
363 out.WriteString("</td>")
364}
365
366func (options *Html) Footnotes(out *bytes.Buffer, text func() bool) {
367 out.WriteString("<div class=\"footnotes\">\n")
368 options.HRule(out)
369 options.List(out, text, LIST_TYPE_ORDERED)
370 out.WriteString("</div>\n")
371}
372
373func (options *Html) FootnoteItem(out *bytes.Buffer, name, text []byte, flags int) {
374 if flags&LIST_ITEM_CONTAINS_BLOCK != 0 || flags&LIST_ITEM_BEGINNING_OF_LIST != 0 {
375 doubleSpace(out)
376 }
377 out.WriteString(`<li id="fn:`)
378 out.Write(slugify(name))
379 out.WriteString(`">`)
380 out.Write(text)
381 out.WriteString("</li>\n")
382}
383
384func (options *Html) List(out *bytes.Buffer, text func() bool, flags int) {
385 marker := out.Len()
386 doubleSpace(out)
387
388 if flags&LIST_TYPE_ORDERED != 0 {
389 out.WriteString("<ol>")
390 } else {
391 out.WriteString("<ul>")
392 }
393 if !text() {
394 out.Truncate(marker)
395 return
396 }
397 if flags&LIST_TYPE_ORDERED != 0 {
398 out.WriteString("</ol>\n")
399 } else {
400 out.WriteString("</ul>\n")
401 }
402}
403
404func (options *Html) ListItem(out *bytes.Buffer, text []byte, flags int) {
405 if flags&LIST_ITEM_CONTAINS_BLOCK != 0 || flags&LIST_ITEM_BEGINNING_OF_LIST != 0 {
406 doubleSpace(out)
407 }
408 out.WriteString("<li>")
409 out.Write(text)
410 out.WriteString("</li>\n")
411}
412
413func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) {
414 marker := out.Len()
415 doubleSpace(out)
416
417 out.WriteString("<p>")
418 if !text() {
419 out.Truncate(marker)
420 return
421 }
422 out.WriteString("</p>\n")
423}
424
425func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
426 skipRanges := htmlEntity.FindAllIndex(link, -1)
427 if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL {
428 // mark it but don't link it if it is not a safe link: no smartypants
429 out.WriteString("<tt>")
430 entityEscapeWithSkip(out, link, skipRanges)
431 out.WriteString("</tt>")
432 return
433 }
434
435 out.WriteString("<a href=\"")
436 if kind == LINK_TYPE_EMAIL {
437 out.WriteString("mailto:")
438 }
439 entityEscapeWithSkip(out, link, skipRanges)
440 out.WriteString("\">")
441
442 // Pretty print: if we get an email address as
443 // an actual URI, e.g. `mailto:foo@bar.com`, we don't
444 // want to print the `mailto:` prefix
445 switch {
446 case bytes.HasPrefix(link, []byte("mailto://")):
447 attrEscape(out, link[len("mailto://"):])
448 case bytes.HasPrefix(link, []byte("mailto:")):
449 attrEscape(out, link[len("mailto:"):])
450 default:
451 entityEscapeWithSkip(out, link, skipRanges)
452 }
453
454 out.WriteString("</a>")
455}
456
457func (options *Html) CodeSpan(out *bytes.Buffer, text []byte) {
458 out.WriteString("<code>")
459 attrEscape(out, text)
460 out.WriteString("</code>")
461}
462
463func (options *Html) DoubleEmphasis(out *bytes.Buffer, text []byte) {
464 out.WriteString("<strong>")
465 out.Write(text)
466 out.WriteString("</strong>")
467}
468
469func (options *Html) Emphasis(out *bytes.Buffer, text []byte) {
470 if len(text) == 0 {
471 return
472 }
473 out.WriteString("<em>")
474 out.Write(text)
475 out.WriteString("</em>")
476}
477
478func (options *Html) Image(out *bytes.Buffer, link []byte, title []byte, alt []byte) {
479 if options.flags&HTML_SKIP_IMAGES != 0 {
480 return
481 }
482
483 out.WriteString("<img src=\"")
484 attrEscape(out, link)
485 out.WriteString("\" alt=\"")
486 if len(alt) > 0 {
487 attrEscape(out, alt)
488 }
489 if len(title) > 0 {
490 out.WriteString("\" title=\"")
491 attrEscape(out, title)
492 }
493
494 out.WriteByte('"')
495 out.WriteString(options.closeTag)
496 return
497}
498
499func (options *Html) LineBreak(out *bytes.Buffer) {
500 out.WriteString("<br")
501 out.WriteString(options.closeTag)
502}
503
504func (options *Html) Link(out *bytes.Buffer, link []byte, title []byte, content []byte) {
505 if options.flags&HTML_SKIP_LINKS != 0 {
506 // write the link text out but don't link it, just mark it with typewriter font
507 out.WriteString("<tt>")
508 attrEscape(out, content)
509 out.WriteString("</tt>")
510 return
511 }
512
513 if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) {
514 // write the link text out but don't link it, just mark it with typewriter font
515 out.WriteString("<tt>")
516 attrEscape(out, content)
517 out.WriteString("</tt>")
518 return
519 }
520
521 out.WriteString("<a href=\"")
522 attrEscape(out, link)
523 if len(title) > 0 {
524 out.WriteString("\" title=\"")
525 attrEscape(out, title)
526 }
527 if options.flags&HTML_NOFOLLOW_LINKS != 0 {
528 out.WriteString("\" rel=\"nofollow")
529 }
530 out.WriteString("\">")
531 out.Write(content)
532 out.WriteString("</a>")
533 return
534}
535
536func (options *Html) RawHtmlTag(out *bytes.Buffer, text []byte) {
537 if options.flags&HTML_SKIP_HTML != 0 {
538 return
539 }
540 if options.flags&HTML_SKIP_STYLE != 0 && isHtmlTag(text, "style") {
541 return
542 }
543 if options.flags&HTML_SKIP_LINKS != 0 && isHtmlTag(text, "a") {
544 return
545 }
546 if options.flags&HTML_SKIP_IMAGES != 0 && isHtmlTag(text, "img") {
547 return
548 }
549 out.Write(text)
550}
551
552func (options *Html) TripleEmphasis(out *bytes.Buffer, text []byte) {
553 out.WriteString("<strong><em>")
554 out.Write(text)
555 out.WriteString("</em></strong>")
556}
557
558func (options *Html) StrikeThrough(out *bytes.Buffer, text []byte) {
559 out.WriteString("<del>")
560 out.Write(text)
561 out.WriteString("</del>")
562}
563
564func (options *Html) FootnoteRef(out *bytes.Buffer, ref []byte, id int) {
565 slug := slugify(ref)
566 out.WriteString(`<sup class="footnote-ref" id="fnref:`)
567 out.Write(slug)
568 out.WriteString(`"><a rel="footnote" href="#fn:`)
569 out.Write(slug)
570 out.WriteString(`">`)
571 out.WriteString(strconv.Itoa(id))
572 out.WriteString(`</a></sup>`)
573}
574
// Entity writes entity to out unchanged.
func (options *Html) Entity(out *bytes.Buffer, entity []byte) {
	out.Write(entity)
}
578
579func (options *Html) NormalText(out *bytes.Buffer, text []byte) {
580 if options.flags&HTML_USE_SMARTYPANTS != 0 {
581 options.Smartypants(out, text)
582 } else {
583 attrEscape(out, text)
584 }
585}
586
587func (options *Html) Smartypants(out *bytes.Buffer, text []byte) {
588 smrt := smartypantsData{false, false}
589
590 // first do normal entity escaping
591 var escaped bytes.Buffer
592 attrEscape(&escaped, text)
593 text = escaped.Bytes()
594
595 mark := 0
596 for i := 0; i < len(text); i++ {
597 if action := options.smartypants[text[i]]; action != nil {
598 if i > mark {
599 out.Write(text[mark:i])
600 }
601
602 previousChar := byte(0)
603 if i > 0 {
604 previousChar = text[i-1]
605 }
606 i += action(out, &smrt, previousChar, text[i:])
607 mark = i + 1
608 }
609 }
610
611 if mark < len(text) {
612 out.Write(text[mark:])
613 }
614}
615
616func (options *Html) DocumentHeader(out *bytes.Buffer) {
617 if options.flags&HTML_COMPLETE_PAGE == 0 {
618 return
619 }
620
621 ending := ""
622 if options.flags&HTML_USE_XHTML != 0 {
623 out.WriteString("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" ")
624 out.WriteString("\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n")
625 out.WriteString("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n")
626 ending = " /"
627 } else {
628 out.WriteString("<!DOCTYPE html>\n")
629 out.WriteString("<html>\n")
630 }
631 out.WriteString("<head>\n")
632 out.WriteString(" <title>")
633 options.NormalText(out, []byte(options.title))
634 out.WriteString("</title>\n")
635 out.WriteString(" <meta name=\"GENERATOR\" content=\"Blackfriday Markdown Processor v")
636 out.WriteString(VERSION)
637 out.WriteString("\"")
638 out.WriteString(ending)
639 out.WriteString(">\n")
640 out.WriteString(" <meta charset=\"utf-8\"")
641 out.WriteString(ending)
642 out.WriteString(">\n")
643 if options.css != "" {
644 out.WriteString(" <link rel=\"stylesheet\" type=\"text/css\" href=\"")
645 attrEscape(out, []byte(options.css))
646 out.WriteString("\"")
647 out.WriteString(ending)
648 out.WriteString(">\n")
649 }
650 out.WriteString("</head>\n")
651 out.WriteString("<body>\n")
652
653 options.tocMarker = out.Len()
654}
655
656func (options *Html) DocumentFooter(out *bytes.Buffer) {
657 // finalize and insert the table of contents
658 if options.flags&HTML_TOC != 0 {
659 options.TocFinalize()
660
661 // now we have to insert the table of contents into the document
662 var temp bytes.Buffer
663
664 // start by making a copy of everything after the document header
665 temp.Write(out.Bytes()[options.tocMarker:])
666
667 // now clear the copied material from the main output buffer
668 out.Truncate(options.tocMarker)
669
670 // corner case spacing issue
671 if options.flags&HTML_COMPLETE_PAGE != 0 {
672 out.WriteByte('\n')
673 }
674
675 // insert the table of contents
676 out.WriteString("<nav>\n")
677 out.Write(options.toc.Bytes())
678 out.WriteString("</nav>\n")
679
680 // corner case spacing issue
681 if options.flags&HTML_COMPLETE_PAGE == 0 && options.flags&HTML_OMIT_CONTENTS == 0 {
682 out.WriteByte('\n')
683 }
684
685 // write out everything that came after it
686 if options.flags&HTML_OMIT_CONTENTS == 0 {
687 out.Write(temp.Bytes())
688 }
689 }
690
691 if options.flags&HTML_COMPLETE_PAGE != 0 {
692 out.WriteString("\n</body>\n")
693 out.WriteString("</html>\n")
694 }
695
696}
697
698func (options *Html) TocHeader(text []byte, level int) {
699 for level > options.currentLevel {
700 switch {
701 case bytes.HasSuffix(options.toc.Bytes(), []byte("</li>\n")):
702 // this sublist can nest underneath a header
703 size := options.toc.Len()
704 options.toc.Truncate(size - len("</li>\n"))
705
706 case options.currentLevel > 0:
707 options.toc.WriteString("<li>")
708 }
709 if options.toc.Len() > 0 {
710 options.toc.WriteByte('\n')
711 }
712 options.toc.WriteString("<ul>\n")
713 options.currentLevel++
714 }
715
716 for level < options.currentLevel {
717 options.toc.WriteString("</ul>")
718 if options.currentLevel > 1 {
719 options.toc.WriteString("</li>\n")
720 }
721 options.currentLevel--
722 }
723
724 options.toc.WriteString("<li><a href=\"#toc_")
725 options.toc.WriteString(strconv.Itoa(options.headerCount))
726 options.toc.WriteString("\">")
727 options.headerCount++
728
729 options.toc.Write(text)
730
731 options.toc.WriteString("</a></li>\n")
732}
733
734func (options *Html) TocFinalize() {
735 for options.currentLevel > 1 {
736 options.toc.WriteString("</ul></li>\n")
737 options.currentLevel--
738 }
739
740 if options.currentLevel > 0 {
741 options.toc.WriteString("</ul>\n")
742 }
743}
744
745func isHtmlTag(tag []byte, tagname string) bool {
746 found, _ := findHtmlTagPos(tag, tagname)
747 return found
748}
749
// skipUntilCharIgnoreQuotes looks for char in html, beginning at start, but
// ignores it when it's in any kind of quotes (single, double or backquote);
// it might be JavaScript.
//
// A quoted section ends only at the same quote character that opened it, so
// the other kinds of quote inside it are ordinary content (for example the
// apostrophe in title="it's").
//
// It returns the index of the first unquoted occurrence of char, or start
// when there is none. A result equal to start is therefore ambiguous when
// html[start] is itself char.
func skipUntilCharIgnoreQuotes(html []byte, start int, char byte) int {
	var quote byte // the quote character currently open, 0 outside quotes
	for i := start; i < len(html); i++ {
		c := html[i]
		switch {
		case quote != 0:
			// inside quotes only the matching quote matters
			if c == quote {
				quote = 0
			}
		case c == char:
			return i
		case c == '\'' || c == '"' || c == '`':
			quote = c
		}
	}
	return start
}
772
773func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
774 i := 0
775 if i < len(tag) && tag[0] != '<' {
776 return false, -1
777 }
778 i++
779 i = skipSpace(tag, i)
780
781 if i < len(tag) && tag[i] == '/' {
782 i++
783 }
784
785 i = skipSpace(tag, i)
786 j := 0
787 for ; i < len(tag); i, j = i+1, j+1 {
788 if j >= len(tagname) {
789 break
790 }
791
792 if strings.ToLower(string(tag[i]))[0] != tagname[j] {
793 return false, -1
794 }
795 }
796
797 if i == len(tag) {
798 return false, -1
799 }
800
801 rightAngle := skipUntilCharIgnoreQuotes(tag, i, '>')
802 if rightAngle > i {
803 return true, rightAngle
804 }
805
806 return false, -1
807}
808
809func sanitizeHtml(html []byte) []byte {
810 var result []byte
811 for string(html) != "" {
812 skip, tag, rest := findHtmlTag(html)
813 html = rest
814 result = append(result, skip...)
815 result = append(result, sanitizeTag(tag)...)
816 }
817 return append(result, []byte("\n")...)
818}
819
820func sanitizeTag(tag []byte) []byte {
821 if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
822 return tag
823 } else {
824 return []byte("")
825 }
826}
827
// skipUntilChar returns the index of the first occurrence of char in text at
// or after start. When there is none it returns len(text), or start itself if
// start already lies at or beyond the end.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for ; pos < len(text); pos++ {
		if text[pos] == char {
			break
		}
	}
	return pos
}
835
836func findHtmlTag(html []byte) (skip, tag, rest []byte) {
837 start := skipUntilChar(html, 0, '<')
838 rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
839 if rightAngle > start {
840 skip = html[0:start]
841 tag = html[start : rightAngle+1]
842 rest = html[rightAngle+1:]
843 return
844 }
845
846 return []byte(""), []byte(""), []byte("")
847}
848
849func skipSpace(tag []byte, i int) int {
850 for i < len(tag) && isspace(tag[i]) {
851 i++
852 }
853 return i
854}
855
// doubleSpace appends a newline to out unless out is still empty, so that
// block elements are separated by a blank line without starting the output
// with one.
func doubleSpace(out *bytes.Buffer) {
	if out.Len() == 0 {
		return
	}
	out.WriteByte('\n')
}