all repos — navani @ ea8ad151310499d429bb7aa02cdb4acdac813db0

forlater's primary mail processing service

reader/fetch.go (view raw)

 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
package reader

import (
	"bytes"
	"crypto/sha1"
	"encoding/hex"
	"fmt"
	"io"
	"mime"
	"net/http"
	"net/url"
	"strings"

	"git.icyphox.sh/forlater/navani/cache"
	readability "github.com/go-shiori/go-readability"
)

// Article is a parsed readability.Article together with the URL it was
// parsed against.
type Article struct {
	// Embedded so the readability.Article fields are promoted onto Article.
	readability.Article
	// URL is the *url.URL that Readable was called with.
	URL *url.URL
}

// checksum returns the lowercase hex-encoded SHA-1 digest of s.
func checksum(s []byte) string {
	digest := sha1.Sum(s)
	return hex.EncodeToString(digest[:])
}

// Response is what Fetch returns: the fetched content and its media type.
type Response struct {
	// Body yields the content. Fetch fills it either from an in-memory copy
	// (cached or freshly downloaded HTML) or, for non-HTML content, with the
	// HTTP response body itself.
	Body     io.Reader
	// MIMEType is the media type parsed from the Content-Type header, or
	// "text/html" when the content came from the cache.
	MIMEType string
}

// Fetch returns the content found at url together with its media type.
//
// The cache is consulted first, keyed by the SHA-1 hex digest of url. A hit
// is returned as "text/html", since this function only ever stores HTML
// pages. Any error from the cache lookup is treated as a miss, in which case
// the page is requested over HTTP with a desktop-browser User-Agent:
//
//   - A text/html response is read completely, stored in the cache and
//     returned from memory. The network body is closed before returning.
//   - Any other media type is returned uncached with Body set to the live
//     HTTP response body. Fetch cannot close it on that path, so the caller
//     owns it and should drain or close it (the underlying value is an
//     io.ReadCloser).
//
// The HTTP status code is not inspected; error pages are handled like any
// other response.
func Fetch(url string) (Response, error) {
	const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"

	sum := checksum([]byte(url))
	c, err := cache.NewConn()
	if err != nil {
		return Response{}, fmt.Errorf("cache error: %w\n", err)
	}

	// Cache hit: only text/html bodies are stored below, so the type is known.
	if body, err := c.Get(sum); err == nil {
		return Response{Body: strings.NewReader(body), MIMEType: "text/html"}, nil
	}

	// Not in cache: go to the network.
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return Response{}, fmt.Errorf("http error: %w\n", err)
	}
	req.Header.Add("User-Agent", userAgent)

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return Response{}, fmt.Errorf("http client error: %w\n", err)
	}

	mt, _, err := mime.ParseMediaType(resp.Header.Get("Content-Type"))
	if err != nil {
		// The body is not handed to the caller on this path, so release it.
		resp.Body.Close()
		return Response{}, fmt.Errorf("parse mime: %w\n", err)
	}

	// If page isn't text/html, just return the body; no caching. Ownership of
	// the still-open body passes to the caller.
	if mt != "text/html" {
		return Response{Body: resp.Body, MIMEType: mt}, nil
	}

	// From here on the page is served from memory, so the network body can be
	// closed as soon as it has been read.
	defer resp.Body.Close()

	b, err := io.ReadAll(resp.Body)
	if err != nil {
		return Response{}, fmt.Errorf("io error: %w\n", err)
	}
	if _, err := c.Set(sum, b); err != nil {
		return Response{}, fmt.Errorf("cache error: %w\n", err)
	}

	// bytes.NewReader only reads from b, so the slice given to the cache is
	// never modified through the returned reader.
	return Response{Body: bytes.NewReader(b), MIMEType: mt}, nil
}

// Readable runs the readability parser over the HTML read from r, using u as
// the page's URL. The returned Article always carries u; when parsing fails
// its embedded readability.Article is left at the zero value and the error
// wraps the parser's error.
func Readable(r io.Reader, u *url.URL) (Article, error) {
	result := Article{URL: u}

	parsed, err := readability.FromReader(r, u)
	if err != nil {
		return result, fmt.Errorf("failed to parse %s: %w\n", u, err)
	}

	result.Article = parsed
	return result, nil
}