dedupe the hoots
Ted Unangst tedu@tedunangst.com
Wed, 30 Oct 2019 23:05:27 -0400
1 files changed,
74 insertions(+),
67 deletions(-)
jump to
M
hoot.go
→
hoot.go
@@ -33,81 +33,88 @@ var tweetsel = cascadia.MustCompile("p.tweet-text")
var linksel = cascadia.MustCompile(".time a.tweet-timestamp") var authorregex = regexp.MustCompile("twitter.com/([^/]+)") -func hootfetcher(hoot string) string { - url := hoot[5:] - if url[0] == ' ' { - url = url[1:] - } - url = strings.Replace(url, "mobile.twitter.com", "twitter.com", -1) - log.Printf("hooterizing %s", url) - req, err := http.NewRequest("GET", url, nil) - if err != nil { - log.Printf("error: %s", err) - return hoot - } - req.Header.Set("User-Agent", "OpenBSD ftp") - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") - req.Header.Set("Accept-Language", "en-US,en;q=0.9") - resp, err := http.DefaultClient.Do(req) - if err != nil { - log.Printf("error: %s", err) - return hoot - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - log.Printf("error getting %s: %d", url, resp.StatusCode) - return hoot - } - ld, _ := os.Create("lasthoot.html") - r := io.TeeReader(resp.Body, ld) - return hootfixer(r, url) -} +var re_hoots = regexp.MustCompile(`hoot: ?https://\S+`) + +func hooterize(noise string) string { + seen := make(map[string]bool) + + hootfixer := func(r io.Reader, url string) string { + root, err := html.Parse(r) + if err != nil { + log.Printf("error parsing hoot: %s", err) + return url + } + divs := tweetsel.MatchAll(root) + + wantmatch := authorregex.FindStringSubmatch(url) + if len(wantmatch) < 2 { + log.Printf("no wanted author?") + } + wanted := wantmatch[1] + var buf strings.Builder + + var htf htfilter.Filter + fmt.Fprintf(&buf, "%s\n", url) + for _, div := range divs { + twp := div.Parent.Parent.Parent + alink := linksel.MatchFirst(twp) + if alink == nil { + log.Printf("missing link") + continue + } + link := "https://twitter.com" + htfilter.GetAttr(alink, "href") + authormatch := authorregex.FindStringSubmatch(link) + if len(authormatch) < 2 { + log.Printf("no author?") + continue + } + author := authormatch[1] + if author != wanted { + continue + } + text := htf.TextOnly(div) + text = strings.Replace(text, "\n", " ", -1) + text = strings.Replace(text, "pic.twitter.com", "https://pic.twitter.com", -1) -func hootfixer(r io.Reader, url string) string { - root, err := html.Parse(r) - if err != nil { - log.Printf("error parsing hoot: %s", err) - return url - } - divs := tweetsel.MatchAll(root) + if seen[text] { + continue + } - wantmatch := authorregex.FindStringSubmatch(url) - if len(wantmatch) < 2 { - log.Printf("no wanted author?") + fmt.Fprintf(&buf, "> @%s: %s\n", author, text) + seen[text] = true + } + return buf.String() } - wanted := wantmatch[1] - var buf strings.Builder - var htf htfilter.Filter - fmt.Fprintf(&buf, "%s\n", url) - for _, div := range divs { - twp := div.Parent.Parent.Parent - alink := linksel.MatchFirst(twp) - if alink == nil { - log.Printf("missing link") - continue + hootfetcher := func(hoot string) string { + url := hoot[5:] + if url[0] == ' ' { + url = url[1:] } - link := "https://twitter.com" + htfilter.GetAttr(alink, "href") - authormatch := authorregex.FindStringSubmatch(link) - if len(authormatch) < 2 { - log.Printf("no author?") - continue + url = strings.Replace(url, "mobile.twitter.com", "twitter.com", -1) + log.Printf("hooterizing %s", url) + req, err := http.NewRequest("GET", url, nil) + if err != nil { + log.Printf("error: %s", err) + return hoot } - author := authormatch[1] - if author != wanted { - continue + req.Header.Set("User-Agent", "OpenBSD ftp") + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + req.Header.Set("Accept-Language", "en-US,en;q=0.9") + resp, err := http.DefaultClient.Do(req) + if err != nil { + log.Printf("error: %s", err) + return hoot + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + log.Printf("error getting %s: %d", url, resp.StatusCode) + return hoot } - text := htf.TextOnly(div) - text = strings.Replace(text, "\n", " ", -1) - text = strings.Replace(text, "pic.twitter.com", "https://pic.twitter.com", -1) - - fmt.Fprintf(&buf, "> @%s: %s\n", author, text) + ld, _ := os.Create("lasthoot.html") + r := io.TeeReader(resp.Body, ld) + return hootfixer(r, url) } - return buf.String() -} -var re_hoots = regexp.MustCompile(`hoot: ?https://\S+`) - -func hooterize(noise string) string { return re_hoots.ReplaceAllStringFunc(noise, hootfetcher) }