"""Flask webhook that forwards the pages linked from an inbound e-mail."""

import urllib.parse

import requests
from bs4 import BeautifulSoup
from flask import Flask, request
from urlextract import URLExtract

from .cleanhtml import cleaner
from .extract import extract_urls
from .utils import is_absolute_url

# Upper bound (seconds) for fetching a linked page. Without a timeout,
# requests waits indefinitely and a stalled remote server would block the
# webhook handler.
FETCH_TIMEOUT_SECONDS = 10

app = Flask(__name__)


def process_urls(body, html=False):
    """Return the URLs that ``extract_urls`` finds in *body*.

    Args:
        body: Message body to scan.
        html: When true, *body* is first passed through ``cleaner`` and the
            cleaned result is scanned instead of the raw body.

    Returns:
        Whatever ``extract_urls`` returns for the (possibly cleaned) body.
    """
    text = cleaner(body) if html else body
    return extract_urls(text)


def fetch_page(url):
    """Download *url* and return its title and HTML with absolute links.

    Every ``<a>`` element that has an ``href`` which ``is_absolute_url``
    rejects gets that ``href`` resolved against *url*.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with two keys: ``"title"`` (the string of the ``<title>``
        element, or ``None`` when the page has no usable title) and
        ``"html"`` (the serialized ``<html>`` element).

    Raises:
        requests.RequestException: If the download fails or takes longer
            than ``FETCH_TIMEOUT_SECONDS``.
    """
    response = requests.get(url, timeout=FETCH_TIMEOUT_SECONDS)
    soup = BeautifulSoup(response.content, "lxml")

    # href=True skips anchors without an href (e.g. <a name="top">), which
    # would otherwise raise KeyError on the subscript below.
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"]
        if not is_absolute_url(href):
            anchor["href"] = urllib.parse.urljoin(url, href)

    # Pages without a <title> element have soup.title set to None.
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    return {"title": title, "html": str(soup.html)}


@app.route("/webhook", methods=["POST"])
def webhook():
    """Handle an inbound-mail POST: fetch each linked page and mail it on.

    Reads the JSON payload, takes the first entry of ``"Parts"``, prefers
    its ``"text/plain"`` body and falls back to ``"text/html"``, extracts
    the URLs, and calls ``send_mail`` once per URL with the payload's
    ``"From"`` and ``"ReplyTo"`` values plus the fetched title and HTML.

    Returns:
        The string ``"ok"``.

    Raises:
        KeyError: If the payload lacks ``"Parts"``, ``"From"`` or
            ``"ReplyTo"``, or the first part has neither body key.
    """
    mail = request.json
    first_part = mail["Parts"][0]

    # Only the lookup is guarded, so a KeyError raised inside URL extraction
    # is not mistaken for "no plain-text part".
    try:
        body = first_part["text/plain"]
    except KeyError:
        body = first_part["text/html"]
        is_html = True
    else:
        is_html = False
    urls = process_urls(body, html=is_html)

    # NOTE(review): these URLs come from inbound mail content, so this
    # server fetches attacker-chosen addresses (SSRF risk). Consider
    # restricting schemes/hosts before fetching.
    for url in urls:
        pageinfo = fetch_page(url)
        # NOTE(review): send_mail is neither imported nor defined in the
        # visible source; confirm where it comes from.
        send_mail(
            mail["From"],
            mail["ReplyTo"],
            pageinfo["title"],
            pageinfo["html"],
        )
    return "ok"