url_fetcher.py — HoffDoc

📄 url_fetcher.py 3,561 bytes Apr 19, 2026 📋 Raw

"""URL content extraction via Jina Reader API.

Fetches a URL through https://r.jina.ai/ which renders JavaScript
server-side and returns clean Markdown. No browser dependencies needed.
"""

import re
import sys

import requests

from family_assistant.config import LLM_TIMEOUT

def extract_urls(text):
    """Extract HTTP(S) URLs from a text string.

    A URL is a run of characters starting with ``http://`` or ``https://``
    and ending at the first whitespace, angle bracket, or quote character.
    Trailing sentence punctuation (``. , ; :``) is removed. A trailing
    ``)`` or ``]`` is removed only when it has no matching opener inside
    the URL, so ``(see https://example.com/a)`` yields
    ``https://example.com/a`` while
    ``https://en.wikipedia.org/wiki/Foo_(bar)`` is kept intact.

    Args:
        text: The text to scan. May be ``None`` or empty.

    Returns:
        A list of URLs in the order they appear in the text (duplicates
        are kept). Empty list when ``text`` is falsy or has no URLs.
    """
    if not text:
        return []

    trailing_chars = '.,;:)>]'
    # Closing bracket -> its opener; used to tell a closer that belongs to
    # the URL from one that belongs to the surrounding sentence/markup.
    openers = {')': '(', ']': '['}

    cleaned = []
    for url in re.findall(r'https?://[^\s<>"\']+', text):
        while url and url[-1] in trailing_chars:
            last = url[-1]
            opener = openers.get(last)
            # A balanced closer is part of the URL itself: stop stripping.
            if opener is not None and url.count(opener) >= url.count(last):
                break
            url = url[:-1]
        cleaned.append(url)
    return cleaned

def fetch_url_as_markdown(url, timeout=None):
    """Retrieve a page as Markdown by proxying it through Jina Reader.

    The target URL is appended to ``https://r.jina.ai/`` and requested
    with an ``Accept: text/markdown`` header. Progress and failures are
    reported on stderr; this function never raises for request errors.

    Args:
        url: The URL to fetch.
        timeout: Request timeout in seconds (defaults to LLM_TIMEOUT).

    Returns:
        The stripped response text, or an empty string when the request
        times out, returns an HTTP error status, or fails in any other way.
    """
    effective_timeout = LLM_TIMEOUT if timeout is None else timeout
    reader_endpoint = f"https://r.jina.ai/{url}"

    try:
        response = requests.get(
            reader_endpoint,
            headers={"Accept": "text/markdown"},
            timeout=effective_timeout,
        )
        response.raise_for_status()
        markdown = response.text.strip()
        # Only the first 60 characters of the URL are logged.
        if markdown:
            status = f"  [URL Fetch] Extracted {len(markdown)} chars from {url[:60]}"
        else:
            status = f"  [URL Fetch] Empty response from {url[:60]}"
        print(status, file=sys.stderr)
        return markdown
    except requests.exceptions.Timeout:
        print(f"  [URL Fetch] Timeout fetching {url[:60]}", file=sys.stderr)
    except requests.exceptions.HTTPError as http_err:
        print(f"  [URL Fetch] HTTP error {http_err.response.status_code} for {url[:60]}", file=sys.stderr)
    except Exception as err:
        # Best-effort fetch: any other failure is logged and treated as "no content".
        print(f"  [URL Fetch] Error: {err}", file=sys.stderr)
    return ""

def enrich_body_with_urls(body):
    """Append the content behind the first relevant link in an email body.

    Intended for newsletter-style emails whose real content sits behind a
    link (e.g. Smore, Constant Contact, Mailchimp). URLs are pulled from
    the body with extract_urls(), known non-content links are skipped, and
    the first remaining URL is fetched with fetch_url_as_markdown().

    Returns:
        The body followed by a separator, a ``[Newsletter Content]``
        marker and the fetched Markdown. The original body is returned
        unchanged when there are no URLs, every URL is skipped, or the
        fetch yields nothing.
    """
    found = extract_urls(body)
    if not found:
        return body

    # Links that never carry newsletter content.
    skip_patterns = (
        r'/api/message/pix',       # tracking pixels
        r'unsubscribe',             # unsubscribe links
        r'mailto:',                # email links
        r'\.png$', r'\.gif$',      # image URLs
        r'target101\.brightarrow', # school notification platform tracking
    )

    def is_content_link(link):
        # True when no skip pattern matches the link (case-insensitive).
        for pattern in skip_patterns:
            if re.search(pattern, link, re.IGNORECASE):
                return False
        return True

    # Only the first content URL is fetched (most newsletters have one
    # primary link). Future: support multiple URLs if needed.
    primary = next((link for link in found if is_content_link(link)), None)
    if primary is None:
        return body

    page = fetch_url_as_markdown(primary)
    if page:
        return f"{body}\n\n---\n[Newsletter Content]\n{page}"
    return body

← Back