reader/fetch.go (view raw)
1package reader
2
import (
	"bytes"
	"crypto/sha1"
	"encoding/hex"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"strings"
	"time"

	"git.icyphox.sh/forlater/navani/cache"
	readability "github.com/go-shiori/go-readability"
)
17
18type Article struct {
19 readability.Article
20 URL *url.URL
21}
22
// checksum returns the SHA-1 digest of s encoded as a hexadecimal
// string.
func checksum(s []byte) string {
	digest := sha1.Sum(s)
	return hex.EncodeToString(digest[:])
}
29
// Response is the result of a fetch: a readable body plus the media
// type that describes it.
type Response struct {
	// Body yields the fetched content.
	Body io.Reader
	// MIMEType is the media type of Body, e.g. "text/html".
	MIMEType string
}
34
35// Fetches the web page and stores the hash of the URL against
36// the response body in cache. Returns an io.Reader.
37func Fetch(url string) (Response, error) {
38 client := &http.Client{}
39 sum := checksum([]byte(url))
40 c, err := cache.NewConn()
41 if err != nil {
42 return Response{}, fmt.Errorf("cache error: %w\n", err)
43 }
44
45 body, err := c.Get(sum)
46 // Not in cache.
47 if err != nil {
48 req, err := http.NewRequest("GET", url, nil)
49 if err != nil {
50 return Response{}, fmt.Errorf("http error: %w\n", err)
51 }
52
53 req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36")
54 resp, err := client.Do(req)
55 if err != nil {
56 return Response{}, fmt.Errorf("http client error: %w\n", err)
57 }
58
59 mt, _, err := mime.ParseMediaType(resp.Header.Get("Content-Type"))
60 if err != nil {
61 return Response{}, fmt.Errorf("parse mime: %w\n", err)
62 }
63
64 // If page isn't text/html, just return the body; no caching.
65 if mt != "text/html" {
66 if err != nil {
67 return Response{}, fmt.Errorf("reading non-html body: %w\n", err)
68 }
69
70 return Response{resp.Body, mt}, nil
71 }
72
73 buf := bytes.Buffer{}
74 // Read into r and write into buf.
75 // Cache and return!
76 r := io.TeeReader(resp.Body, &buf)
77 b, err := io.ReadAll(r)
78 if err != nil {
79 return Response{}, fmt.Errorf("io error: %w\n", err)
80 }
81 _, err = c.Set(sum, b)
82 if err != nil {
83 return Response{}, fmt.Errorf("cache error: %w\n", err)
84 }
85 return Response{&buf, mt}, nil
86 }
87
88 // We can safely assume it's text/html
89 return Response{strings.NewReader(body), "text/html"}, nil
90}
91
92// Makes a given html body readable. Returns an error if it
93// can't.
94func Readable(r io.Reader, u *url.URL) (Article, error) {
95 article, err := readability.FromReader(r, u)
96 if err != nil {
97 return Article{readability.Article{}, u}, fmt.Errorf("failed to parse %s: %w\n", u, err)
98 }
99
100 return Article{article, u}, nil
101}