"""URL fetching with Jina Reader API."""
import re
import os
import sys
def extract_urls(text: str) -> list:
    """Return the HTTP(S) URLs found in ``text``, in order of appearance.

    A URL starts at ``http://`` or ``https://`` and runs until whitespace
    or one of ``<``, ``>``, ``"``, ``'``. Trailing characters from the set
    ``.,;:)>]`` are then removed from each match, so sentence punctuation
    directly after a link is not kept. Repeated URLs are returned once per
    occurrence.

    Args:
        text: Text to scan. Any falsy value (``""``, ``None``) yields ``[]``.

    Returns:
        A list of URL strings; empty when nothing matches.
    """
    trailing_punctuation = '.,;:)>]'
    # Falsy input is handled by iterating over nothing instead of an
    # explicit early return.
    matches = re.finditer(r'https?://[^\s<>"\']+', text) if text else ()
    return [m.group(0).rstrip(trailing_punctuation) for m in matches]
def fetch_url_as_markdown(url: str, timeout: int = 30) -> str:
    """Fetch ``url`` through the Jina Reader endpoint and return the text.

    The request goes to ``https://r.jina.ai/<url>`` with an
    ``Accept: text/markdown`` header. The response body is decoded as
    UTF-8, with undecodable bytes replaced.

    Errors raised while opening or reading the response are not propagated.
    They are reported in-band as a string starting with
    ``[Error fetching URL:`` so callers can detect failure by prefix.

    Args:
        url: Target URL, appended verbatim to the reader endpoint.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The decoded response body, or a bracketed error description.
    """
    import urllib.error
    import urllib.request

    request_headers = {
        "Accept": "text/markdown",
        "User-Agent": "Mozilla/5.0 (compatible; HoffDesk/1.0)",
    }
    reader_request = urllib.request.Request(
        f"https://r.jina.ai/{url}",
        headers=request_headers,
    )

    try:
        with urllib.request.urlopen(reader_request, timeout=timeout) as response:
            raw_bytes = response.read()
        return raw_bytes.decode('utf-8', errors='replace')
    # HTTPError is a subclass of URLError, so it must be matched first.
    except urllib.error.HTTPError as http_err:
        return f"[Error fetching URL: HTTP {http_err.code}]"
    except urllib.error.URLError as url_err:
        return f"[Error fetching URL: {url_err.reason}]"
    except Exception as other_err:
        return f"[Error fetching URL: {other_err}]"
def enrich_body_with_urls(body: str, max_urls: int = 3) -> str:
    """Append the fetched content of URLs found in ``body`` to the body.

    For newsletter-style emails with just a URL link (like Smore).

    Each distinct URL is tried at most once, in order of first appearance.
    A successful fetch adds a section of the form
    ``\\n\\n---\\nFetched from <url>:\\n<content>`` after the original text.
    Fetches that return nothing, or a string starting with ``[Error``
    (the failure marker used by ``fetch_url_as_markdown``), are skipped.

    Args:
        body: Text to scan for HTTP(S) URLs.
        max_urls: Maximum number of distinct URLs to attempt. Failed
            attempts count toward the cap. Zero or a negative value
            disables fetching entirely.

    Returns:
        ``body`` unchanged when it is empty, contains no URLs, fetching is
        disabled, or every fetch fails; otherwise ``body`` followed by one
        section per successful fetch.
    """
    # Cheap exits before any extraction or network access. A negative cap
    # must not reach the slice below: urls[:-1] would mean "all but the
    # last URL" rather than "none". None is let through because it slices
    # as "no cap".
    if not body or (max_urls is not None and max_urls <= 0):
        return body

    # dict.fromkeys drops repeats while keeping first-seen order, so a link
    # that appears several times is fetched and appended only once.
    candidates = list(dict.fromkeys(extract_urls(body)))[:max_urls]
    if not candidates:
        return body

    sections = [body]
    for url in candidates:
        content = fetch_url_as_markdown(url)
        if content and not content.startswith("[Error"):
            sections.append(f"\n\n---\nFetched from {url}:\n{content}")
    return "".join(sections)