all repos — donkey @ 924f43708d675e88632f81eb607a680c353f562d

The main backend for forlater.email

app/html.py (view raw)

 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
import requests
import urllib.parse
from bs4 import BeautifulSoup

from .extract import extract_urls
from .utils import is_absolute_url


def cleaner(html):
    """Return the text content of *html* with all markup removed.

    Text fragments from adjacent elements are joined with a single
    space so words do not run together.
    """
    document = BeautifulSoup(html, "lxml")
    text = document.get_text(separator=" ")
    return text


def process_urls(body, html=False):
    """Extract URLs found in *body*.

    When *html* is true, the body is stripped of markup via ``cleaner``
    before extraction; otherwise it is scanned as plain text.
    """
    text = cleaner(body) if html else body
    return extract_urls(text)


def fetch_page(url):
    """Download *url* and return its title and HTML with links absolutized.

    Every ``<a href>`` that is not already absolute is rewritten to an
    absolute URL relative to *url*, so the stored HTML remains navigable.

    Returns a dict ``{"title": str | None, "html": str}``; ``title`` is
    ``None`` when the page has no ``<title>`` element.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"
    }
    # Bug fix: the headers dict was built but never sent, so the
    # User-Agent spoof was dead code. Also add a timeout so a stalled
    # server cannot hang the caller indefinitely.
    res = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(res.content, "lxml")
    for a in soup.find_all("a"):
        try:
            if not is_absolute_url(a["href"]):
                a["href"] = urllib.parse.urljoin(url, a["href"])
        except KeyError:
            # Anchor without an href attribute — nothing to rewrite.
            pass

    # Bug fix: soup.title is None on pages without a <title> element,
    # which previously raised AttributeError on .string.
    title = soup.title.string if soup.title is not None else None
    return {"title": title, "html": str(soup.html)}


def fetch_plaintext(html):
    """Return the visible text of *html* with surrounding whitespace stripped."""
    document = BeautifulSoup(html, "lxml")
    return document.get_text(strip=True)