import urllib.parse

import requests
from bs4 import BeautifulSoup

from .extract import extract_urls
from .utils import is_absolute_url


def cleaner(html):
    """Strip markup from an HTML document, returning its visible text."""
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(separator=" ")


def process_urls(body, html=False):
    """Extract URLs from ``body``, stripping markup first when ``html`` is True."""
    if html:
        cleaned = cleaner(body)
        urls = extract_urls(cleaned)
    else:
        urls = extract_urls(body)
    return urls


def fetch_page(url):
    """Fetch ``url``, rewrite relative links to absolute ones, and return
    the page title (None if the page has no <title>) plus the rewritten HTML.
    """
    # A timeout keeps a stalled server from hanging the caller indefinitely;
    # raise_for_status surfaces HTTP errors instead of parsing error pages.
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, "lxml")
    for a in soup.find_all("a"):
        try:
            if not is_absolute_url(a["href"]):
                a["href"] = urllib.parse.urljoin(url, a["href"])
        except KeyError:
            # Anchors without an href attribute are skipped.
            pass
    title = soup.title.string if soup.title else None
    return {"title": title, "html": str(soup.html)}


def fetch_plaintext(html):
    """Return the visible text of an HTML document, with whitespace stripped."""
    soup = BeautifulSoup(html, "lxml")
    return soup.get_text(strip=True)
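

# Example usage (a minimal sketch, assuming the sibling ``extract`` and
# ``utils`` modules provide ``extract_urls`` -> iterable of URL strings and
# ``is_absolute_url`` -> bool, as imported above):
#
#     from mypackage.pages import fetch_page, fetch_plaintext, process_urls
#
#     page = fetch_page("https://example.com")      # title + absolutized HTML
#     text = fetch_plaintext(page["html"])          # visible text only
#     urls = process_urls(page["html"], html=True)  # URLs found in the text
#
# ``mypackage.pages`` is a hypothetical import path; the real package and
# module names depend on where this file sits in the repository.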