html.go (view raw)
1//
2// Blackfriday Markdown Processor
3// Available at http://github.com/russross/blackfriday
4//
5// Copyright © 2011 Russ Ross <russ@russross.com>.
6// Distributed under the Simplified BSD License.
7// See README.md for details.
8//
9
10//
11//
12// HTML rendering backend
13//
14//
15
16package blackfriday
17
18import (
19 "bytes"
20 "fmt"
21 "regexp"
22 "strconv"
23 "strings"
24)
25
// Option flags for the HTML renderer. Every flag occupies its own bit, so
// any combination may be ORed together and handed to HtmlRenderer.
const (
	HTML_SKIP_HTML                = 1 << iota // drop preformatted HTML blocks
	HTML_SKIP_STYLE                           // drop embedded <style> elements
	HTML_SKIP_IMAGES                          // drop embedded images
	HTML_SKIP_LINKS                           // drop all links
	HTML_SANITIZE_OUTPUT                      // strip everything from the output that is not known to be safe
	HTML_SAFELINK                             // link only to trusted protocols
	HTML_TOC                                  // build a table of contents
	HTML_OMIT_CONTENTS                        // leave out the main contents (standalone table of contents)
	HTML_COMPLETE_PAGE                        // emit a complete HTML page
	HTML_GITHUB_BLOCKCODE                     // render fenced code the way GitHub does
	HTML_USE_XHTML                            // emit XHTML rather than HTML
	HTML_USE_SMARTYPANTS                      // turn on smart punctuation substitutions
	HTML_SMARTYPANTS_FRACTIONS                // turn on smart fractions (needs HTML_USE_SMARTYPANTS)
	HTML_SMARTYPANTS_LATEX_DASHES             // turn on LaTeX-style dashes (needs HTML_USE_SMARTYPANTS)
)
43
var (
	// tags names the elements that the whitelist accepts as plain,
	// attribute-free opening or closing tags.
	tags = []string{
		"b", "blockquote", "code", "del",
		"dd", "dl", "dt", "em",
		"h1", "h2", "h3", "h4", "h5", "h6",
		"i", "kbd", "li", "ol", "p", "pre",
		"s", "sup", "sub", "strong", "strike", "ul",
	}

	// urlRe is the pattern fragment for an accepted URL: an http(s)/ftp
	// address or a path beginning with a slash.
	urlRe = `((https?|ftp):\/\/|\/)[-A-Za-z0-9+&@#\/%?=~_|!:,.;\(\)]+`

	// tagWhitelist matches a whole tag drawn from tags, or a <br>/<hr>.
	tagWhitelist = regexp.MustCompile(`^(<\/?(` + strings.Join(tags, "|") + `)>|<(br|hr)\s?\/?>)$`)

	// anchorClean matches </a> or an <a> tag that carries only an accepted
	// href and an optional title.
	anchorClean = regexp.MustCompile(`^(<a\shref="` + urlRe + `"(\stitle="[^"<>]+")?\s?>|<\/a>)$`)

	// imgClean matches an <img> tag with an accepted src followed by
	// optional width, height, alt and title attributes, in that order.
	imgClean = regexp.MustCompile(`^(<img\ssrc="` + urlRe + `"(\swidth="\d{1,3}")?(\sheight="\d{1,3}")?(\salt="[^"<>]*")?(\stitle="[^"<>]*")?\s?\/?>)$`)

	// htmlEntity recognizes short lower-case named entities such as &amp;.
	// TODO: improve this regexp to catch all possible entities.
	htmlEntity = regexp.MustCompile(`&[a-z]{2,5};`)
)
80
81// Html is a type that implements the Renderer interface for HTML output.
82//
83// Do not create this directly, instead use the HtmlRenderer function.
84type Html struct {
85 flags int // HTML_* options
86 closeTag string // how to end singleton tags: either " />\n" or ">\n"
87 title string // document title
88 css string // optional css file url (used with HTML_COMPLETE_PAGE)
89
90 // table of contents data
91 tocMarker int
92 headerCount int
93 currentLevel int
94 toc *bytes.Buffer
95
96 smartypants *smartypantsRenderer
97}
98
// Terminators appended to singleton tags such as <br and <hr.
const (
	xhtmlClose = " />\n" // self-closing form used for XHTML output
	htmlClose  = ">\n"   // plain form used for HTML output
)
103
104// HtmlRenderer creates and configures an Html object, which
105// satisfies the Renderer interface.
106//
107// flags is a set of HTML_* options ORed together.
108// title is the title of the document, and css is a URL for the document's
109// stylesheet.
110// title and css are only used when HTML_COMPLETE_PAGE is selected.
111func HtmlRenderer(flags int, title string, css string) Renderer {
112 // configure the rendering engine
113 closeTag := htmlClose
114 if flags&HTML_USE_XHTML != 0 {
115 closeTag = xhtmlClose
116 }
117
118 return &Html{
119 flags: flags,
120 closeTag: closeTag,
121 title: title,
122 css: css,
123
124 headerCount: 0,
125 currentLevel: 0,
126 toc: new(bytes.Buffer),
127
128 smartypants: smartypants(flags),
129 }
130}
131
// escapeSingleChar returns the HTML entity that must replace char, and true,
// when char is one of the four characters that need escaping (", &, <, >).
// For every other byte it returns "" and false.
//
// Using if statements is a bit faster than a switch statement. As the
// compiler improves, this should become unnecessary; it is only worthwhile
// because attrEscape is the single largest CPU user in normal use.
// A map was also tried, but that gave a ~3x slowdown.
func escapeSingleChar(char byte) (string, bool) {
	if char == '"' {
		return "&quot;", true
	}
	if char == '&' {
		return "&amp;", true
	}
	if char == '<' {
		return "&lt;", true
	}
	if char == '>' {
		return "&gt;", true
	}
	return "", false
}
151
152func attrEscape(out *bytes.Buffer, src []byte) {
153 org := 0
154 for i, ch := range src {
155 if entity, ok := escapeSingleChar(ch); ok {
156 if i > org {
157 // copy all the normal characters since the last escape
158 out.Write(src[org:i])
159 }
160 org = i + 1
161 out.WriteString(entity)
162 }
163 }
164 if org < len(src) {
165 out.Write(src[org:])
166 }
167}
168
169func entityEscapeWithSkip(out *bytes.Buffer, src []byte, skipRanges [][]int) {
170 end := 0
171 for _, rang := range skipRanges {
172 attrEscape(out, src[end:rang[0]])
173 out.Write(src[rang[0]:rang[1]])
174 end = rang[1]
175 }
176 attrEscape(out, src[end:])
177}
178
179func (options *Html) GetFlags() int {
180 return options.flags
181}
182
183func (options *Html) Header(out *bytes.Buffer, text func() bool, level int) {
184 marker := out.Len()
185 doubleSpace(out)
186
187 if options.flags&HTML_TOC != 0 {
188 // headerCount is incremented in htmlTocHeader
189 out.WriteString(fmt.Sprintf("<h%d id=\"toc_%d\">", level, options.headerCount))
190 } else {
191 out.WriteString(fmt.Sprintf("<h%d>", level))
192 }
193
194 tocMarker := out.Len()
195 if !text() {
196 out.Truncate(marker)
197 return
198 }
199
200 // are we building a table of contents?
201 if options.flags&HTML_TOC != 0 {
202 options.TocHeader(out.Bytes()[tocMarker:], level)
203 }
204
205 out.WriteString(fmt.Sprintf("</h%d>\n", level))
206}
207
208func (options *Html) BlockHtml(out *bytes.Buffer, text []byte) {
209 if options.flags&HTML_SKIP_HTML != 0 {
210 return
211 }
212
213 doubleSpace(out)
214 out.Write(text)
215 out.WriteByte('\n')
216}
217
218func (options *Html) HRule(out *bytes.Buffer) {
219 doubleSpace(out)
220 out.WriteString("<hr")
221 out.WriteString(options.closeTag)
222}
223
224func (options *Html) BlockCode(out *bytes.Buffer, text []byte, lang string) {
225 if options.flags&HTML_GITHUB_BLOCKCODE != 0 {
226 options.BlockCodeGithub(out, text, lang)
227 } else {
228 options.BlockCodeNormal(out, text, lang)
229 }
230}
231
232func (options *Html) BlockCodeNormal(out *bytes.Buffer, text []byte, lang string) {
233 doubleSpace(out)
234
235 // parse out the language names/classes
236 count := 0
237 for _, elt := range strings.Fields(lang) {
238 if elt[0] == '.' {
239 elt = elt[1:]
240 }
241 if len(elt) == 0 {
242 continue
243 }
244 if count == 0 {
245 out.WriteString("<pre><code class=\"")
246 } else {
247 out.WriteByte(' ')
248 }
249 attrEscape(out, []byte(elt))
250 count++
251 }
252
253 if count == 0 {
254 out.WriteString("<pre><code>")
255 } else {
256 out.WriteString("\">")
257 }
258
259 attrEscape(out, text)
260 out.WriteString("</code></pre>\n")
261}
262
263// GitHub style code block:
264//
265// <pre lang="LANG"><code>
266// ...
267// </code></pre>
268//
269// Unlike other parsers, we store the language identifier in the <pre>,
270// and don't let the user generate custom classes.
271//
272// The language identifier in the <pre> block gets postprocessed and all
273// the code inside gets syntax highlighted with Pygments. This is much safer
274// than letting the user specify a CSS class for highlighting.
275//
276// Note that we only generate HTML for the first specifier.
277// E.g.
278// ~~~~ {.python .numbered} => <pre lang="python"><code>
279func (options *Html) BlockCodeGithub(out *bytes.Buffer, text []byte, lang string) {
280 doubleSpace(out)
281
282 // parse out the language name
283 count := 0
284 for _, elt := range strings.Fields(lang) {
285 if elt[0] == '.' {
286 elt = elt[1:]
287 }
288 if len(elt) == 0 {
289 continue
290 }
291 out.WriteString("<pre lang=\"")
292 attrEscape(out, []byte(elt))
293 out.WriteString("\"><code>")
294 count++
295 break
296 }
297
298 if count == 0 {
299 out.WriteString("<pre><code>")
300 }
301
302 attrEscape(out, text)
303 out.WriteString("</code></pre>\n")
304}
305
306func (options *Html) BlockQuote(out *bytes.Buffer, text []byte) {
307 doubleSpace(out)
308 out.WriteString("<blockquote>\n")
309 out.Write(text)
310 out.WriteString("</blockquote>\n")
311}
312
313func (options *Html) Table(out *bytes.Buffer, header []byte, body []byte, columnData []int) {
314 doubleSpace(out)
315 out.WriteString("<table>\n<thead>\n")
316 out.Write(header)
317 out.WriteString("</thead>\n\n<tbody>\n")
318 out.Write(body)
319 out.WriteString("</tbody>\n</table>\n")
320}
321
322func (options *Html) TableRow(out *bytes.Buffer, text []byte) {
323 doubleSpace(out)
324 out.WriteString("<tr>\n")
325 out.Write(text)
326 out.WriteString("\n</tr>\n")
327}
328
329func (options *Html) TableHeaderCell(out *bytes.Buffer, text []byte, align int) {
330 doubleSpace(out)
331 switch align {
332 case TABLE_ALIGNMENT_LEFT:
333 out.WriteString("<th align=\"left\">")
334 case TABLE_ALIGNMENT_RIGHT:
335 out.WriteString("<th align=\"right\">")
336 case TABLE_ALIGNMENT_CENTER:
337 out.WriteString("<th align=\"center\">")
338 default:
339 out.WriteString("<th>")
340 }
341
342 out.Write(text)
343 out.WriteString("</th>")
344}
345
346func (options *Html) TableCell(out *bytes.Buffer, text []byte, align int) {
347 doubleSpace(out)
348 switch align {
349 case TABLE_ALIGNMENT_LEFT:
350 out.WriteString("<td align=\"left\">")
351 case TABLE_ALIGNMENT_RIGHT:
352 out.WriteString("<td align=\"right\">")
353 case TABLE_ALIGNMENT_CENTER:
354 out.WriteString("<td align=\"center\">")
355 default:
356 out.WriteString("<td>")
357 }
358
359 out.Write(text)
360 out.WriteString("</td>")
361}
362
363func (options *Html) Footnotes(out *bytes.Buffer, text func() bool) {
364 out.WriteString("<div class=\"footnotes\">\n")
365 options.HRule(out)
366 options.List(out, text, LIST_TYPE_ORDERED)
367 out.WriteString("</div>\n")
368}
369
370func (options *Html) FootnoteItem(out *bytes.Buffer, name, text []byte, flags int) {
371 if flags&LIST_ITEM_CONTAINS_BLOCK != 0 || flags&LIST_ITEM_BEGINNING_OF_LIST != 0 {
372 doubleSpace(out)
373 }
374 out.WriteString(`<li id="fn:`)
375 out.Write(slugify(name))
376 out.WriteString(`">`)
377 out.Write(text)
378 out.WriteString("</li>\n")
379}
380
381func (options *Html) List(out *bytes.Buffer, text func() bool, flags int) {
382 marker := out.Len()
383 doubleSpace(out)
384
385 if flags&LIST_TYPE_ORDERED != 0 {
386 out.WriteString("<ol>")
387 } else {
388 out.WriteString("<ul>")
389 }
390 if !text() {
391 out.Truncate(marker)
392 return
393 }
394 if flags&LIST_TYPE_ORDERED != 0 {
395 out.WriteString("</ol>\n")
396 } else {
397 out.WriteString("</ul>\n")
398 }
399}
400
401func (options *Html) ListItem(out *bytes.Buffer, text []byte, flags int) {
402 if flags&LIST_ITEM_CONTAINS_BLOCK != 0 || flags&LIST_ITEM_BEGINNING_OF_LIST != 0 {
403 doubleSpace(out)
404 }
405 out.WriteString("<li>")
406 out.Write(text)
407 out.WriteString("</li>\n")
408}
409
410func (options *Html) Paragraph(out *bytes.Buffer, text func() bool) {
411 marker := out.Len()
412 doubleSpace(out)
413
414 out.WriteString("<p>")
415 if !text() {
416 out.Truncate(marker)
417 return
418 }
419 out.WriteString("</p>\n")
420}
421
422func (options *Html) AutoLink(out *bytes.Buffer, link []byte, kind int) {
423 skipRanges := htmlEntity.FindAllIndex(link, -1)
424 if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) && kind != LINK_TYPE_EMAIL {
425 // mark it but don't link it if it is not a safe link: no smartypants
426 out.WriteString("<tt>")
427 entityEscapeWithSkip(out, link, skipRanges)
428 out.WriteString("</tt>")
429 return
430 }
431
432 out.WriteString("<a href=\"")
433 if kind == LINK_TYPE_EMAIL {
434 out.WriteString("mailto:")
435 }
436 entityEscapeWithSkip(out, link, skipRanges)
437 out.WriteString("\">")
438
439 // Pretty print: if we get an email address as
440 // an actual URI, e.g. `mailto:foo@bar.com`, we don't
441 // want to print the `mailto:` prefix
442 switch {
443 case bytes.HasPrefix(link, []byte("mailto://")):
444 attrEscape(out, link[len("mailto://"):])
445 case bytes.HasPrefix(link, []byte("mailto:")):
446 attrEscape(out, link[len("mailto:"):])
447 default:
448 entityEscapeWithSkip(out, link, skipRanges)
449 }
450
451 out.WriteString("</a>")
452}
453
454func (options *Html) CodeSpan(out *bytes.Buffer, text []byte) {
455 out.WriteString("<code>")
456 attrEscape(out, text)
457 out.WriteString("</code>")
458}
459
460func (options *Html) DoubleEmphasis(out *bytes.Buffer, text []byte) {
461 out.WriteString("<strong>")
462 out.Write(text)
463 out.WriteString("</strong>")
464}
465
466func (options *Html) Emphasis(out *bytes.Buffer, text []byte) {
467 if len(text) == 0 {
468 return
469 }
470 out.WriteString("<em>")
471 out.Write(text)
472 out.WriteString("</em>")
473}
474
475func (options *Html) Image(out *bytes.Buffer, link []byte, title []byte, alt []byte) {
476 if options.flags&HTML_SKIP_IMAGES != 0 {
477 return
478 }
479
480 out.WriteString("<img src=\"")
481 attrEscape(out, link)
482 out.WriteString("\" alt=\"")
483 if len(alt) > 0 {
484 attrEscape(out, alt)
485 }
486 if len(title) > 0 {
487 out.WriteString("\" title=\"")
488 attrEscape(out, title)
489 }
490
491 out.WriteByte('"')
492 out.WriteString(options.closeTag)
493 return
494}
495
496func (options *Html) LineBreak(out *bytes.Buffer) {
497 out.WriteString("<br")
498 out.WriteString(options.closeTag)
499}
500
501func (options *Html) Link(out *bytes.Buffer, link []byte, title []byte, content []byte) {
502 if options.flags&HTML_SKIP_LINKS != 0 {
503 // write the link text out but don't link it, just mark it with typewriter font
504 out.WriteString("<tt>")
505 attrEscape(out, content)
506 out.WriteString("</tt>")
507 return
508 }
509
510 if options.flags&HTML_SAFELINK != 0 && !isSafeLink(link) {
511 // write the link text out but don't link it, just mark it with typewriter font
512 out.WriteString("<tt>")
513 attrEscape(out, content)
514 out.WriteString("</tt>")
515 return
516 }
517
518 out.WriteString("<a href=\"")
519 attrEscape(out, link)
520 if len(title) > 0 {
521 out.WriteString("\" title=\"")
522 attrEscape(out, title)
523 }
524 out.WriteString("\">")
525 out.Write(content)
526 out.WriteString("</a>")
527 return
528}
529
530func (options *Html) RawHtmlTag(out *bytes.Buffer, text []byte) {
531 if options.flags&HTML_SKIP_HTML != 0 {
532 return
533 }
534 if options.flags&HTML_SKIP_STYLE != 0 && isHtmlTag(text, "style") {
535 return
536 }
537 if options.flags&HTML_SKIP_LINKS != 0 && isHtmlTag(text, "a") {
538 return
539 }
540 if options.flags&HTML_SKIP_IMAGES != 0 && isHtmlTag(text, "img") {
541 return
542 }
543 out.Write(text)
544}
545
546func (options *Html) TripleEmphasis(out *bytes.Buffer, text []byte) {
547 out.WriteString("<strong><em>")
548 out.Write(text)
549 out.WriteString("</em></strong>")
550}
551
552func (options *Html) StrikeThrough(out *bytes.Buffer, text []byte) {
553 out.WriteString("<del>")
554 out.Write(text)
555 out.WriteString("</del>")
556}
557
558func (options *Html) FootnoteRef(out *bytes.Buffer, ref []byte, id int) {
559 slug := slugify(ref)
560 out.WriteString(`<sup class="footnote-ref" id="fnref:`)
561 out.Write(slug)
562 out.WriteString(`"><a rel="footnote" href="#fn:`)
563 out.Write(slug)
564 out.WriteString(`">`)
565 out.WriteString(strconv.Itoa(id))
566 out.WriteString(`</a></sup>`)
567}
568
569func (options *Html) Entity(out *bytes.Buffer, entity []byte) {
570 out.Write(entity)
571}
572
573func (options *Html) NormalText(out *bytes.Buffer, text []byte) {
574 if options.flags&HTML_USE_SMARTYPANTS != 0 {
575 options.Smartypants(out, text)
576 } else {
577 attrEscape(out, text)
578 }
579}
580
581func (options *Html) Smartypants(out *bytes.Buffer, text []byte) {
582 smrt := smartypantsData{false, false}
583
584 // first do normal entity escaping
585 var escaped bytes.Buffer
586 attrEscape(&escaped, text)
587 text = escaped.Bytes()
588
589 mark := 0
590 for i := 0; i < len(text); i++ {
591 if action := options.smartypants[text[i]]; action != nil {
592 if i > mark {
593 out.Write(text[mark:i])
594 }
595
596 previousChar := byte(0)
597 if i > 0 {
598 previousChar = text[i-1]
599 }
600 i += action(out, &smrt, previousChar, text[i:])
601 mark = i + 1
602 }
603 }
604
605 if mark < len(text) {
606 out.Write(text[mark:])
607 }
608}
609
610func (options *Html) DocumentHeader(out *bytes.Buffer) {
611 if options.flags&HTML_COMPLETE_PAGE == 0 {
612 return
613 }
614
615 ending := ""
616 if options.flags&HTML_USE_XHTML != 0 {
617 out.WriteString("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" ")
618 out.WriteString("\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n")
619 out.WriteString("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n")
620 ending = " /"
621 } else {
622 out.WriteString("<!DOCTYPE html>\n")
623 out.WriteString("<html>\n")
624 }
625 out.WriteString("<head>\n")
626 out.WriteString(" <title>")
627 options.NormalText(out, []byte(options.title))
628 out.WriteString("</title>\n")
629 out.WriteString(" <meta name=\"GENERATOR\" content=\"Blackfriday Markdown Processor v")
630 out.WriteString(VERSION)
631 out.WriteString("\"")
632 out.WriteString(ending)
633 out.WriteString(">\n")
634 out.WriteString(" <meta charset=\"utf-8\"")
635 out.WriteString(ending)
636 out.WriteString(">\n")
637 if options.css != "" {
638 out.WriteString(" <link rel=\"stylesheet\" type=\"text/css\" href=\"")
639 attrEscape(out, []byte(options.css))
640 out.WriteString("\"")
641 out.WriteString(ending)
642 out.WriteString(">\n")
643 }
644 out.WriteString("</head>\n")
645 out.WriteString("<body>\n")
646
647 options.tocMarker = out.Len()
648}
649
650func (options *Html) DocumentFooter(out *bytes.Buffer) {
651 // finalize and insert the table of contents
652 if options.flags&HTML_TOC != 0 {
653 options.TocFinalize()
654
655 // now we have to insert the table of contents into the document
656 var temp bytes.Buffer
657
658 // start by making a copy of everything after the document header
659 temp.Write(out.Bytes()[options.tocMarker:])
660
661 // now clear the copied material from the main output buffer
662 out.Truncate(options.tocMarker)
663
664 // corner case spacing issue
665 if options.flags&HTML_COMPLETE_PAGE != 0 {
666 out.WriteByte('\n')
667 }
668
669 // insert the table of contents
670 out.WriteString("<nav>\n")
671 out.Write(options.toc.Bytes())
672 out.WriteString("</nav>\n")
673
674 // corner case spacing issue
675 if options.flags&HTML_COMPLETE_PAGE == 0 && options.flags&HTML_OMIT_CONTENTS == 0 {
676 out.WriteByte('\n')
677 }
678
679 // write out everything that came after it
680 if options.flags&HTML_OMIT_CONTENTS == 0 {
681 out.Write(temp.Bytes())
682 }
683 }
684
685 if options.flags&HTML_COMPLETE_PAGE != 0 {
686 out.WriteString("\n</body>\n")
687 out.WriteString("</html>\n")
688 }
689
690}
691
692func (options *Html) TocHeader(text []byte, level int) {
693 for level > options.currentLevel {
694 switch {
695 case bytes.HasSuffix(options.toc.Bytes(), []byte("</li>\n")):
696 // this sublist can nest underneath a header
697 size := options.toc.Len()
698 options.toc.Truncate(size - len("</li>\n"))
699
700 case options.currentLevel > 0:
701 options.toc.WriteString("<li>")
702 }
703 if options.toc.Len() > 0 {
704 options.toc.WriteByte('\n')
705 }
706 options.toc.WriteString("<ul>\n")
707 options.currentLevel++
708 }
709
710 for level < options.currentLevel {
711 options.toc.WriteString("</ul>")
712 if options.currentLevel > 1 {
713 options.toc.WriteString("</li>\n")
714 }
715 options.currentLevel--
716 }
717
718 options.toc.WriteString("<li><a href=\"#toc_")
719 options.toc.WriteString(strconv.Itoa(options.headerCount))
720 options.toc.WriteString("\">")
721 options.headerCount++
722
723 options.toc.Write(text)
724
725 options.toc.WriteString("</a></li>\n")
726}
727
728func (options *Html) TocFinalize() {
729 for options.currentLevel > 1 {
730 options.toc.WriteString("</ul></li>\n")
731 options.currentLevel--
732 }
733
734 if options.currentLevel > 0 {
735 options.toc.WriteString("</ul>\n")
736 }
737}
738
739func isHtmlTag(tag []byte, tagname string) bool {
740 found, _ := findHtmlTagPos(tag, tagname)
741 return found
742}
743
// skipUntilCharIgnoreQuotes looks for char in html, beginning at index
// start, and ignores any occurrence that sits inside single, double or grave
// quotes (it might be JavaScript). It returns the index of the first
// unquoted occurrence, or start when there is none. Because start is also a
// valid hit position, callers that need to tell the two apart must inspect
// html[start] themselves.
//
// Only the quote character that opened a quoted run can close it; the other
// two quote characters are plain content while the run is open. That way an
// apostrophe inside a double-quoted attribute value (title="it's") does not
// leave the scanner stuck in a quoted state.
func skipUntilCharIgnoreQuotes(html []byte, start int, char byte) int {
	var open byte // quote character currently open, or 0 outside quotes
	for i := start; i < len(html); i++ {
		c := html[i]
		switch {
		case open != 0:
			if c == open {
				open = 0
			}
		case c == char:
			return i
		case c == '\'' || c == '"' || c == '`':
			open = c
		}
	}
	return start
}
766
767func findHtmlTagPos(tag []byte, tagname string) (bool, int) {
768 i := 0
769 if i < len(tag) && tag[0] != '<' {
770 return false, -1
771 }
772 i++
773 i = skipSpace(tag, i)
774
775 if i < len(tag) && tag[i] == '/' {
776 i++
777 }
778
779 i = skipSpace(tag, i)
780 j := 0
781 for ; i < len(tag); i, j = i+1, j+1 {
782 if j >= len(tagname) {
783 break
784 }
785
786 if strings.ToLower(string(tag[i]))[0] != tagname[j] {
787 return false, -1
788 }
789 }
790
791 if i == len(tag) {
792 return false, -1
793 }
794
795 rightAngle := skipUntilCharIgnoreQuotes(tag, i, '>')
796 if rightAngle > i {
797 return true, rightAngle
798 }
799
800 return false, -1
801}
802
803func sanitizeHtml(html []byte) []byte {
804 var result []byte
805 for string(html) != "" {
806 skip, tag, rest := findHtmlTag(html)
807 html = rest
808 result = append(result, skip...)
809 result = append(result, sanitizeTag(tag)...)
810 }
811 return append(result, []byte("\n")...)
812}
813
814func sanitizeTag(tag []byte) []byte {
815 if tagWhitelist.Match(tag) || anchorClean.Match(tag) || imgClean.Match(tag) {
816 return tag
817 } else {
818 return []byte("")
819 }
820}
821
// skipUntilChar returns the index of the first occurrence of char in text at
// or after start. When char does not occur, the index at which scanning
// stopped is returned: len(text), or start itself if start was already past
// the end.
func skipUntilChar(text []byte, start int, char byte) int {
	pos := start
	for ; pos < len(text); pos++ {
		if text[pos] == char {
			break
		}
	}
	return pos
}
829
830func findHtmlTag(html []byte) (skip, tag, rest []byte) {
831 start := skipUntilChar(html, 0, '<')
832 rightAngle := skipUntilCharIgnoreQuotes(html, start, '>')
833 if rightAngle > start {
834 skip = html[0:start]
835 tag = html[start : rightAngle+1]
836 rest = html[rightAngle+1:]
837 return
838 }
839
840 return []byte(""), []byte(""), []byte("")
841}
842
843func skipSpace(tag []byte, i int) int {
844 for i < len(tag) && isspace(tag[i]) {
845 i++
846 }
847 return i
848}
849
// doubleSpace appends a newline to out unless out is still empty, so that a
// block element is separated from whatever was written before it.
func doubleSpace(out *bytes.Buffer) {
	if out.Len() == 0 {
		return
	}
	out.WriteByte('\n')
}