all repos — donkey @ 042bf159b226541e2a067d5c6781c0404bbba614

The main backend for forlater.email

app/main.py (view raw)

 1from flask import Flask, request
 2from urlextract import URLExtract
 3from bs4 import BeautifulSoup
 4import requests
 5import urllib.parse
 6
 7from .extract import extract_urls
 8from .cleanhtml import cleaner
 9from .utils import is_absolute_url
10
11app = Flask(__name__)
12
13
def process_urls(body, html=False):
    """Return the URLs that ``extract_urls`` finds in *body*.

    When *html* is true, *body* is first passed through ``cleaner`` and the
    URLs are extracted from the cleaned result; otherwise they are extracted
    from *body* exactly as given.
    """
    source = cleaner(body) if html else body
    return extract_urls(source)
22
23
def fetch_page(url, timeout=30):
    """Download *url* and return its title and HTML with absolute links.

    Every ``<a>`` element that has an ``href`` which ``is_absolute_url``
    rejects is rewritten by joining it against *url*, so links keep working
    when the markup is viewed outside the original site.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a server that never answers would block the
            caller indefinitely.

    Returns:
        A dict with two keys: ``"title"`` is the ``.string`` of the page's
        ``<title>`` element, or ``None`` when the page has no ``<title>``;
        ``"html"`` is ``str()`` of the parsed ``<html>`` element.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.content, "lxml")

    # href=True restricts the search to anchors that actually carry an href;
    # anchors without one (e.g. <a name="top">) would raise KeyError below.
    for a in soup.find_all("a", href=True):
        if not is_absolute_url(a["href"]):
            a["href"] = urllib.parse.urljoin(url, a["href"])

    # soup.title is None for documents without a <title>; guard it instead
    # of failing with AttributeError on ``None.string``.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None

    return {"title": title, "html": str(soup.html)}
32
33
@app.route("/webhook", methods=["POST"])
def webhook():
    """Handle a POSTed mail: fetch every URL in it and mail each page back.

    The JSON request body is read as a mapping with the keys ``"Parts"``,
    ``"From"`` and ``"ReplyTo"``. Only the first entry of ``"Parts"`` is
    inspected: its ``"text/plain"`` value is used when present, otherwise
    its ``"text/html"`` value (a ``KeyError`` propagates if neither exists).

    For each URL returned by ``process_urls`` the page is downloaded with
    ``fetch_page`` and handed to ``send_mail`` together with the ``"From"``
    and ``"ReplyTo"`` values of the incoming mail.

    Returns:
        The literal string ``"ok"`` once every URL has been processed.
    """
    mail = request.json
    first_part = mail["Parts"][0]

    # Decide on the body type with an explicit key test rather than a
    # try/except around the whole extraction: a KeyError raised *inside*
    # process_urls must not be mistaken for "no text/plain part".
    if "text/plain" in first_part:
        urls = process_urls(first_part["text/plain"])
    else:
        urls = process_urls(first_part["text/html"], html=True)

    # NOTE(review): send_mail is neither imported nor defined in the visible
    # part of this file -- confirm where it comes from, otherwise this loop
    # raises NameError on the first URL.
    for u in urls:
        pageinfo = fetch_page(u)
        send_mail(
            mail["From"],
            mail["ReplyTo"],
            pageinfo["title"],
            pageinfo["html"],
        )
    return "ok"
53    return "ok"