donkey @ dc80dc1a0d50b02336a12040d4dd62baa2f20000

The main backend for forlater.email

app/html.py

import requests
import urllib.parse
from bs4 import BeautifulSoup

from .extract import extract_urls
from .utils import is_absolute_url


def cleaner(html):
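    # Strip markup and return just the visible text, using spaces as the
    # separator so words from adjacent tags don't run together.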
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(separator=" ")


def process_urls(body, html=False):
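    # HTML bodies are flattened to visible text first, so extract_urls only
    # sees text a reader would see rather than raw markup.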
    if html:
        cleaned = cleaner(body)
        urls = extract_urls(cleaned)
    else:
        urls = extract_urls(body)

    return urls


def fetch_page(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "lxml")
    # Rewrite relative links to absolute ones so they still point at the
    # original site; href=True skips anchors without an href attribute,
    # which would otherwise raise a KeyError.
    for a in soup.find_all("a", href=True):
        if not is_absolute_url(a["href"]):
            a["href"] = urllib.parse.urljoin(url, a["href"])

    # Some pages have no <title>; fall back to the URL instead of crashing.
    title = soup.title.string if soup.title else url
    return {"title": title, "html": str(soup.html)}


def fetch_plaintext(html):
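    # Reduce stored HTML to bare text, stripping whitespace around each
    # extracted text fragment.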
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(strip=True)
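

A minimal sketch of how these helpers might be composed elsewhere in the app; the surrounding pipeline and the save_links_from_email name are assumptions for illustration, and only process_urls, fetch_page, and fetch_plaintext come from this file.

# Hypothetical caller, not part of app/html.py.
from .html import process_urls, fetch_page, fetch_plaintext


def save_links_from_email(body, is_html):
    # Pull every URL out of an incoming message, fetch each page, and keep
    # both the cleaned-up HTML and a plain-text copy.
    articles = []
    for url in process_urls(body, html=is_html):
        page = fetch_page(url)                # {"title": ..., "html": ...}
        text = fetch_plaintext(page["html"])  # tag-free text of the page
        articles.append({"url": url, "title": page["title"], "text": text})
    return articles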