donkey @ 2a98ded5cd847a031cc1e97ddb64aa452cb213b0

The main backend for forlater.email

app/html.py

import requests
import urllib.parse
from bs4 import BeautifulSoup

from .extract import extract_urls
from .utils import is_absolute_url


def cleaner(html):
    """Strip markup from an HTML string and return its text content."""
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(separator=" ")


def process_urls(body, html=False):
    """Extract URLs from an email body, stripping markup first when it is HTML."""
    if html:
        cleaned = cleaner(body)
        urls = extract_urls(cleaned)
    else:
        urls = extract_urls(body)

    return urls


def fetch_page(url):
    """Fetch a page and return its title and HTML, with relative links made absolute."""
    # A timeout keeps a slow or unresponsive host from hanging the request.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.content, "lxml")
    for a in soup.find_all("a"):
        try:
            # Rewrite relative hrefs against the page URL; anchors without
            # an href attribute raise KeyError and are skipped.
            if not is_absolute_url(a["href"]):
                a["href"] = urllib.parse.urljoin(url, a["href"])
        except KeyError:
            pass

    # soup.title is None when the page has no <title> element; guard against that.
    title = soup.title.string if soup.title else None
    return {"title": title, "html": str(soup.html)}


def fetch_plaintext(html):
    """Return the plain text of an HTML document, with surrounding whitespace stripped."""
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(strip=True)
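

For reference, a minimal sketch of how a caller might string these helpers together. The module path app.html, the example URL, and the variable names are illustrative assumptions, not taken from the repo:

# Hypothetical usage sketch; assumes extract_urls returns a list of URL strings.
from app.html import fetch_page, fetch_plaintext, process_urls

urls = process_urls("Read this: https://example.com/post", html=False)
page = fetch_page(urls[0])            # {"title": ..., "html": ...}, links made absolute
text = fetch_plaintext(page["html"])  # plain-text body of the fetched page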