hoot.go (view raw)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
// // Copyright (c) 2019 Ted Unangst <tedu@tedunangst.com> // // Permission to use, copy, modify, and distribute this software for any // purpose with or without fee is hereby granted, provided that the above // copyright notice and this permission notice appear in all copies. // // THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES // WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF // MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR // ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES // WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN // ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. package main import ( "fmt" "io" "net/http" "os" "regexp" "strings" "github.com/andybalholm/cascadia" "golang.org/x/net/html" "humungus.tedunangst.com/r/webs/htfilter" "humungus.tedunangst.com/r/webs/templates" ) var tweetsel = cascadia.MustCompile("div[data-testid=tweetText]") var linksel = cascadia.MustCompile("a time") var replyingto = cascadia.MustCompile(".ReplyingToContextBelowAuthor") var imgsel = cascadia.MustCompile("div[data-testid=tweetPhoto] img") var authorregex = regexp.MustCompile("twitter.com/([^/]+)") var re_hoots = regexp.MustCompile(`hoot: ?https://\S+`) var re_removepics = regexp.MustCompile(`pic\.twitter\.com/[[:alnum:]]+`) func hootextractor(r io.Reader, url string, seen map[string]bool) string { root, err := html.Parse(r) if err != nil { elog.Printf("error parsing hoot: %s", err) return url } url = strings.Replace(url, "mobile.twitter.com", "twitter.com", -1) wantmatch := authorregex.FindStringSubmatch(url) var wanted string if len(wantmatch) == 2 { wanted = wantmatch[1] } var htf htfilter.Filter htf.Imager = func(node *html.Node) string { alt := htfilter.GetAttr(node, "alt") src := htfilter.GetAttr(node, "src") if htfilter.HasClass(node, "Emoji") || strings.HasSuffix(src, ".svg") { return alt } return string(templates.Sprintf(" <img src='%s' alt='%s'>", src, alt)) } var buf strings.Builder fmt.Fprintf(&buf, "%s\n", url) divs := tweetsel.MatchAll(root) for i, div := range divs { twp := div.Parent.Parent.Parent.Parent.Parent link := url alink := linksel.MatchFirst(twp) if alink == nil { if i != 0 { dlog.Printf("missing link") continue } } else { alink = alink.Parent link = "https://twitter.com" + htfilter.GetAttr(alink, "href") } authormatch := authorregex.FindStringSubmatch(link) if len(authormatch) < 2 { dlog.Printf("no author?: %s", link) continue } author := authormatch[1] if wanted == "" { wanted = author } if author != wanted { continue } for _, img := range imgsel.MatchAll(twp) { img.Parent.RemoveChild(img) div.AppendChild(img) } text := htf.NodeText(div) text = strings.Replace(text, "\n", " ", -1) fmt.Fprintf(&buf, "> @%s: %s\n", author, text) } return buf.String() } func hooterize(noise string) string { seen := make(map[string]bool) hootfetcher := func(hoot string) string { url := hoot[5:] if url[0] == ' ' { url = url[1:] } url = strings.Replace(url, "mobile.twitter.com", "twitter.com", -1) dlog.Printf("hooterizing %s", url) req, err := http.NewRequest("GET", url, nil) if err != nil { ilog.Printf("error: %s", err) return hoot } req.Header.Set("User-Agent", "Bot") req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") req.Header.Set("Accept-Language", "en-US,en;q=0.9") resp, err := http.DefaultClient.Do(req) if err != nil { ilog.Printf("error: %s", err) return hoot } defer resp.Body.Close() if resp.StatusCode != 200 { ilog.Printf("error getting %s: %d", url, resp.StatusCode) return hoot } ld, _ := os.Create("lasthoot.html") r := io.TeeReader(resp.Body, ld) return hootextractor(r, url, seen) } return re_hoots.ReplaceAllStringFunc(noise, hootfetcher) } |