# url_fetcher.py

"""URL fetching with Jina Reader API."""

import re
import os
import sys

def extract_urls(text: str) -> list:
    """Pull every http(s) link out of *text*.

    Trailing sentence punctuation that commonly clings to a link
    (periods, commas, closing brackets, ...) is stripped from each match.

    Returns an empty list for empty/falsy input.
    """
    if not text:
        return []
    cleaned = []
    for raw in re.findall(r'https?://[^\s<>"\']+', text):
        cleaned.append(raw.rstrip('.,;:)>]'))
    return cleaned

def fetch_url_as_markdown(url: str, timeout: int = 30) -> str:
    """Fetch *url* through the Jina Reader proxy and return it as Markdown.

    The Reader service (https://r.jina.ai/<url>) converts the target page
    to Markdown server-side. Failures are reported in-band as an
    "[Error fetching URL: ...]" string rather than raised, so callers
    never see an exception from this function.
    """
    # Imported lazily so the module stays cheap to import when this
    # function is never called.
    import urllib.request
    import urllib.error

    reader_endpoint = f"https://r.jina.ai/{url}"

    request = urllib.request.Request(
        reader_endpoint,
        headers={
            "Accept": "text/markdown",
            "User-Agent": "Mozilla/5.0 (compatible; HoffDesk/1.0)",
        },
    )

    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            # errors='replace' keeps mojibake from crashing the pipeline
            # when the upstream page isn't valid UTF-8.
            return response.read().decode('utf-8', errors='replace')
    except urllib.error.HTTPError as exc:
        return f"[Error fetching URL: HTTP {exc.code}]"
    except urllib.error.URLError as exc:
        return f"[Error fetching URL: {exc.reason}]"
    except Exception as exc:
        return f"[Error fetching URL: {exc}]"

def enrich_body_with_urls(body: str, max_urls: int = 3) -> str:
    """Extract URLs from body, fetch content, append to body.

    For newsletter-style emails with just a URL link (like Smore).

    Args:
        body: Raw message text to scan for links.
        max_urls: Fetch at most this many URLs (the first ones found win).

    Returns:
        ``body`` with a "Fetched from <url>" section appended for each
        successfully fetched link, or ``body`` unchanged when no URLs are
        found or every fetch fails.
    """
    urls = extract_urls(body)
    if not urls:
        return body

    enriched = body
    for url in urls[:max_urls]:
        content = fetch_url_as_markdown(url)
        # fetch_url_as_markdown signals failure in-band with an
        # "[Error ...]" string; skip those so error text doesn't
        # pollute the enriched body.
        if content and not content.startswith("[Error"):
            enriched += f"\n\n---\nFetched from {url}:\n{content}"

    return enriched