all repos — navani @ 2928def670071abf79a9dd21ecbaa532149405f4

forlater's primary mail processing service

reader/fetch.go (view raw)

  1package reader
  2
  3import (
  4	"bytes"
  5	"crypto/sha1"
  6	"encoding/hex"
  7	"fmt"
  8	"io"
  9	"mime"
 10	"net/http"
 11	"net/url"
 12	"strings"
 13
 14	"git.icyphox.sh/forlater/navani/cache"
 15	readability "github.com/go-shiori/go-readability"
 16)
 17
 18type Article struct {
 19	readability.Article
 20	URL *url.URL
 21}
 22
 23func checksum(s []byte) string {
 24	h := sha1.New()
 25	h.Write(s)
 26	b := h.Sum(nil)
 27	return hex.EncodeToString(b)
 28}
 29
 30type Response struct {
 31	Body     io.Reader
 32	MIMEType string
 33}
 34
 35// Fetches the web page and stores the hash of the URL against
 36// the response body in cache. Returns an io.Reader.
 37func Fetch(url string) (Response, error) {
 38	client := &http.Client{}
 39	sum := checksum([]byte(url))
 40	c, err := cache.NewConn()
 41	if err != nil {
 42		return Response{}, fmt.Errorf("cache error: %w\n", err)
 43	}
 44
 45	body, err := c.Get(sum)
 46	// Not in cache.
 47	if err != nil {
 48		req, err := http.NewRequest("GET", url, nil)
 49		if err != nil {
 50			return Response{}, fmt.Errorf("http error: %w\n", err)
 51		}
 52
 53		req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36")
 54		resp, err := client.Do(req)
 55		if err != nil {
 56			return Response{}, fmt.Errorf("http client error: %w\n", err)
 57		}
 58
 59		mt, _, err := mime.ParseMediaType(resp.Header.Get("Content-Type"))
 60		if err != nil {
 61			return Response{}, fmt.Errorf("parse mime: %w\n", err)
 62		}
 63
 64		// If page isn't text/html, just return the body; no caching.
 65		if mt != "text/html" {
 66			if err != nil {
 67				return Response{}, fmt.Errorf("reading non-html body: %w\n", err)
 68			}
 69
 70			return Response{resp.Body, mt}, nil
 71		}
 72
 73		buf := bytes.Buffer{}
 74		// Read into r and write into buf.
 75		// Cache and return!
 76		r := io.TeeReader(resp.Body, &buf)
 77		b, err := io.ReadAll(r)
 78		if err != nil {
 79			return Response{}, fmt.Errorf("io error: %w\n", err)
 80		}
 81		_, err = c.Set(sum, b)
 82		if err != nil {
 83			return Response{}, fmt.Errorf("cache error: %w\n", err)
 84		}
 85		return Response{&buf, mt}, nil
 86	}
 87
 88	// We can safely assume it's text/html
 89	return Response{strings.NewReader(body), "text/html"}, nil
 90}
 91
 92// Makes a given html body readable. Returns an error if it
 93// can't.
 94func Readable(r io.Reader, u *url.URL) (Article, error) {
 95	article, err := readability.FromReader(r, u)
 96	if err != nil {
 97		return Article{readability.Article{}, u}, fmt.Errorf("failed to parse %s: %w\n", u, err)
 98	}
 99
100	return Article{article, u}, nil
101}