app/main.py (view raw)
1from flask import Flask, request
2from urlextract import URLExtract
3from bs4 import BeautifulSoup
4import requests
5import urllib.parse
6
7from .extract import extract_urls
8from .cleanhtml import cleaner
9from .utils import is_absolute_url
10
11app = Flask(__name__)
12
13
def process_urls(body, html=False):
    """Extract the URLs contained in a message body.

    Args:
        body: The message body to scan.
        html: When true, ``body`` is first passed through ``cleaner`` and the
            cleaned result is scanned; otherwise ``body`` is scanned as-is.

    Returns:
        Whatever ``extract_urls`` returns for the (possibly cleaned) body.
    """
    text = cleaner(body) if html else body
    return extract_urls(text)
22
23
def fetch_page(url, timeout=10):
    """Download a page and return its title and HTML with absolute links.

    Every ``<a>`` element that carries an ``href`` and whose target is not
    already absolute (per ``is_absolute_url``) has that ``href`` rewritten by
    joining it against ``url``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A dict with two keys: ``"title"`` (the string of the page's
        ``<title>`` element, or ``None`` when the page has no title) and
        ``"html"`` (``str()`` of the parsed ``<html>`` element).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # NOTE(review): callers in this file pass URLs extracted from inbound
    # mail, i.e. untrusted input. Fetching them server-side is an SSRF
    # vector; consider restricting schemes/hosts before this call.
    # A timeout is required: without one, requests waits indefinitely on a
    # server that accepts the connection but never answers.
    res = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(res.content, "lxml")

    # href=True skips anchors that have no href attribute (e.g. named
    # anchors), which would otherwise raise KeyError on a["href"].
    for a in soup.find_all("a", href=True):
        if not is_absolute_url(a["href"]):
            a["href"] = urllib.parse.urljoin(url, a["href"])

    # Pages without a <title> element yield soup.title is None.
    title = soup.title.string if soup.title is not None else None
    return {"title": title, "html": str(soup.html)}
32
33
@app.route("/webhook", methods=["POST"])
def webhook():
    """Handle a POSTed mail payload: fetch every linked page and mail it on.

    Reads the request's JSON body, takes the first entry of its ``"Parts"``
    list, and extracts URLs from that entry's ``"text/plain"`` content,
    falling back to ``"text/html"`` when no plain-text key is present. Each
    URL is fetched with ``fetch_page`` and the resulting title and HTML are
    handed to ``send_mail`` together with the payload's ``"From"`` and
    ``"ReplyTo"`` values.

    Returns:
        The string ``"ok"`` once every URL has been processed.

    Raises:
        KeyError: If the payload lacks ``"Parts"``, ``"From"`` or
            ``"ReplyTo"``, or the first part has neither a ``"text/plain"``
            nor a ``"text/html"`` key.
        IndexError: If ``"Parts"`` is empty.
    """
    mail = request.json
    first_part = mail["Parts"][0]

    # Keep the try limited to the dict lookup. If URL extraction were inside
    # it, a KeyError raised while processing the plain-text body would be
    # mistaken for "no text/plain part" and silently rerouted to the HTML
    # branch.
    try:
        body = first_part["text/plain"]
    except KeyError:
        body = first_part["text/html"]
        is_html = True
    else:
        is_html = False
    urls = process_urls(body, html=is_html)

    # NOTE(review): send_mail is not imported or defined in the visible part
    # of this file — confirm where it comes from; as shown, this call would
    # raise NameError.
    for u in urls:
        pageinfo = fetch_page(u)
        send_mail(
            mail["From"],
            mail["ReplyTo"],
            pageinfo["title"],
            pageinfo["html"],
        )
    return "ok"
53 return "ok"