donkey @ 2a98ded5cd847a031cc1e97ddb64aa452cb213b0

The main backend for forlater.email

app/html.py

import requests
import urllib.parse
from bs4 import BeautifulSoup

from .extract import extract_urls
from .utils import is_absolute_url


def cleaner(html):
    """Strip markup from an HTML string and return its text content."""
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(separator=" ")


def process_urls(body, html=False):
    """Extract URLs from an email body, stripping markup first when it is HTML."""
    if html:
        cleaned = cleaner(body)
        urls = extract_urls(cleaned)
    else:
        urls = extract_urls(body)

    return urls


def fetch_page(url):
    """Fetch a page and return its title and HTML, with relative links made absolute."""
    # A timeout keeps a slow or unresponsive host from hanging the request.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.content, "lxml")
    for a in soup.find_all("a"):
        try:
            # Rewrite relative hrefs against the page URL; anchors without
            # an href attribute raise KeyError and are skipped.
            if not is_absolute_url(a["href"]):
                a["href"] = urllib.parse.urljoin(url, a["href"])
        except KeyError:
            pass

    # soup.title is None when the page has no <title> element; guard against that.
    title = soup.title.string if soup.title else None
    return {"title": title, "html": str(soup.html)}


def fetch_plaintext(html):
    """Return the plain text of an HTML document, with surrounding whitespace stripped."""
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(strip=True)
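

For reference, a minimal sketch of how a caller might string these helpers together. The module path app.html, the example URL, and the variable names are illustrative assumptions, not taken from the repo:

# Hypothetical usage sketch; assumes extract_urls returns a list of URL strings.
from app.html import fetch_page, fetch_plaintext, process_urls

urls = process_urls("Read this: https://example.com/post", html=False)
page = fetch_page(urls[0])            # {"title": ..., "html": ...}, links made absolute
text = fetch_plaintext(page["html"])  # plain-text body of the fetched page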