"""URL fetching with Jina Reader API.""" import re import os import sys def extract_urls(text: str) -> list: """Extract HTTP(S) URLs from text.""" if not text: return [] urls = re.findall(r'https?://[^\s<>"\']+', text) return [u.rstrip('.,;:)>]') for u in urls] def fetch_url_as_markdown(url: str, timeout: int = 30) -> str: """Fetch URL content via Jina Reader API, return as Markdown.""" import urllib.request import urllib.error jina_url = f"https://r.jina.ai/{url}" req = urllib.request.Request( jina_url, headers={ "Accept": "text/markdown", "User-Agent": "Mozilla/5.0 (compatible; HoffDesk/1.0)" } ) try: with urllib.request.urlopen(req, timeout=timeout) as resp: content = resp.read().decode('utf-8', errors='replace') return content except urllib.error.HTTPError as e: return f"[Error fetching URL: HTTP {e.code}]" except urllib.error.URLError as e: return f"[Error fetching URL: {e.reason}]" except Exception as e: return f"[Error fetching URL: {e}]" def enrich_body_with_urls(body: str, max_urls: int = 3) -> str: """Extract URLs from body, fetch content, append to body. For newsletter-style emails with just a URL link (like Smore). """ urls = extract_urls(body) if not urls: return body enriched = body fetched_count = 0 for url in urls[:max_urls]: content = fetch_url_as_markdown(url) if content and not content.startswith("[Error"): enriched += f"\n\n---\nFetched from {url}:\n{content}" fetched_count += 1 return enriched