all repos — donkey @ 924f43708d675e88632f81eb607a680c353f562d

The main backend for forlater.email

app/html.py (view raw)

 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
import requests
import urllib.parse
from bs4 import BeautifulSoup

from .extract import extract_urls
from .utils import is_absolute_url


def cleaner(html):
    """Return the text content of *html* with all markup removed.

    Text fragments from adjacent elements are joined with a single
    space so words do not run together.
    """
    document = BeautifulSoup(html, "lxml")
    text = document.get_text(separator=" ")
    return text


def process_urls(body, html=False):
    """Extract URLs found in *body*.

    When *html* is true, the body is stripped of markup via ``cleaner``
    before extraction; otherwise it is scanned as plain text.
    """
    text = cleaner(body) if html else body
    return extract_urls(text)


def fetch_page(url):
    """Download *url* and return its title and HTML with links absolutized.

    Every ``<a href>`` that is not already absolute is rewritten to an
    absolute URL relative to *url*, so the stored HTML remains navigable.

    Returns a dict ``{"title": str | None, "html": str}``; ``title`` is
    ``None`` when the page has no ``<title>`` element.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }
    # Bug fix: the headers dict was built but never sent, so the
    # User-Agent spoof was dead code. Also add a timeout so a stalled
    # server cannot hang the caller indefinitely.
    res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.content, "lxml")
    for a in soup.find_all("a"):
        try:
            if not is_absolute_url(a["href"]):
                a["href"] = urllib.parse.urljoin(url, a["href"])
        except KeyError:
            # Anchor without an href attribute — nothing to rewrite.
            pass

    # Bug fix: soup.title is None on pages without a <title> element,
    # which previously raised AttributeError on .string.
    title = soup.title.string if soup.title is not None else None
    return {"title": title, "html": str(soup.html)}


def fetch_plaintext(html):
    """Return the visible text of *html* with surrounding whitespace stripped."""
    document = BeautifulSoup(html, "lxml")
    return document.get_text(strip=True)