"""Recipe extractor — fetch recipe URLs and extract structured ingredients + instructions.
Sovereign port from costco_route/recipe_extractor.py — zero imports from costco_route.
Extraction fallback chain:
1. ld+json from raw HTML (most reliable, zero LLM cost)
2. Jina AI clean markdown → LLM (reliable, needs network)
3. Raw HTML → text extraction → LLM (last resort, local)
"""
import json
import logging
import os
import re
import uuid
from datetime import datetime
from pathlib import Path
import httpx
from icarus.core.config.staging import LLM_MODEL, LLM_URL
# ---------------------------------------------------------------------------
# Recipe storage
# ---------------------------------------------------------------------------
DATA_DIR = Path(os.environ.get("DATA_DIR", Path.home() / ".icarus/staging"))
RECIPES_DIR = DATA_DIR / "recipes"
def _ensure_recipes_dir() -> Path:
RECIPES_DIR.mkdir(parents=True, exist_ok=True)
return RECIPES_DIR
# ---------------------------------------------------------------------------
# LLM helper
# ---------------------------------------------------------------------------
def _call_llm(prompt: str, timeout: int = 120) -> str:
    """POST a single-turn chat request to the configured LLM endpoint.

    Synchronous: callers in async code block the event loop while waiting.
    """
    payload = {
"model": LLM_MODEL,
"messages": [{"role": "user", "content": prompt}],
"stream": False,
"options": {"temperature": 0.1, "num_predict": 2048},
}
resp = httpx.post(LLM_URL, json=payload, timeout=timeout)
resp.raise_for_status()
data = resp.json()
# OpenAI-compatible /v1/chat/completions returns choices[0].message.content
if "choices" in data:
return data["choices"][0]["message"]["content"]
# Ollama native /api/chat returns message.content
return data["message"]["content"]
# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------
RECIPE_FETCH_PROMPT = """SYSTEM: You are a recipe extraction engine. Extract the recipe from the following text, which was scraped from a recipe website.
The text will contain ads, navigation, stories, and other garbage. Ignore ALL of that. Extract ONLY the recipe.
Return a JSON object with this exact structure:
{{
"title": "Recipe title",
"servings": "Number of servings (e.g., '4' or '6-8')",
"prep_time": "Prep time if stated (e.g., '15 min'), null if not found",
"cook_time": "Cook time if stated (e.g., '30 min'), null if not found",
"total_time": "Total time if stated, null if not found",
"ingredients": ["ingredient 1", "ingredient 2", ...],
"instructions": ["step 1", "step 2", ...],
"tags": ["dinner", "chicken", "easy", ...],
"source_url": "the URL provided"
}}
RULES:
- ingredients: One item per list entry, including quantities (e.g., "2 cups flour", "1 lb chicken thighs")
- instructions: Numbered steps as separate list entries. Combine multiple short steps if they're fragments.
- tags: 3-8 relevant tags for categorization (meal type, protein, cuisine, difficulty)
- If the page contains multiple recipes, extract the MAIN one
- If no recipe is found, return {{"error": "no recipe found", "source_url": "..."}}
- Strip ALL ad copy, nutrition disclaimers, "jump to recipe" buttons, social sharing text
- Return ONLY the JSON object, no markdown fences
USER: Extract the recipe from this text:
URL: {url}
TEXT:
{text}
RESPONSE (JSON only):"""
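# The doubled braces in the prompt are str.format() escapes: only {url} and {text}
# are substituted by RECIPE_FETCH_PROMPT.format(...); the JSON skeleton's braces
# reach the model as literal { }.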
# ---------------------------------------------------------------------------
# ld+json structured data extraction
# ---------------------------------------------------------------------------
def _parse_servings(yield_str: str | None) -> str | None:
if not yield_str:
return None
yield_str = str(yield_str).strip()
    # Accept "4", "Serves 4", or a range like "6-8"; otherwise pass the string through.
    if match := re.search(r'(?i)(serves?\s+)?(\d+(?:\s*-\s*\d+)?)', yield_str):
        return match.group(2).strip()
return yield_str if yield_str else None
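# e.g. _parse_servings("Serves 6-8") -> "6-8"; _parse_servings("4 servings") -> "4"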
def _parse_duration(iso_duration: str | None) -> str | None:
if not iso_duration:
return None
match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?', str(iso_duration))
if not match:
return str(iso_duration)
hours, minutes = match.groups()
parts = []
if hours:
parts.append(f"{hours} hr")
if minutes:
parts.append(f"{minutes} min")
return " ".join(parts) if parts else None
def _clean_ingredients(ingredients: list) -> list[str]:
cleaned = []
for ing in ingredients:
if ing:
cleaned.append(re.sub(r'\s+', ' ', str(ing).replace('\r', '').replace('\n', ' ')).strip())
return [i for i in cleaned if i]
def _parse_instructions(instructions: list | str) -> list[str]:
if isinstance(instructions, str):
        # Split on newlines or leading step numbers like "1." / "2)"
        steps = re.split(r'\n+|\d+[.)]\s*', instructions)
return [s.strip() for s in steps if s.strip()]
result = []
for step in instructions:
if isinstance(step, dict):
text = step.get("text", step.get("name", ""))
if text:
result.append(str(text).strip())
elif isinstance(step, str):
result.append(step.strip())
return result
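# recipeInstructions typically arrives as schema.org HowToStep objects, e.g.
#   [{"@type": "HowToStep", "text": "Preheat the oven to 400F."}, ...],
# or as a single newline/number-delimited string; both paths are handled above.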
def _extract_tags(ld_recipe: dict) -> list[str]:
tags = []
for field in ["recipeCategory", "recipeCuisine", "keywords"]:
val = ld_recipe.get(field)
if val:
if isinstance(val, str):
tags.extend([t.strip() for t in val.split(",") if t.strip()])
elif isinstance(val, list):
tags.extend([str(v).strip() for v in val if v])
name = ld_recipe.get("name", "").lower()
if any(w in name for w in ["chicken", "beef", "pork", "fish"]):
tags.append("dinner")
    # De-duplicate while preserving first-seen order (set() would shuffle tags)
    return list(dict.fromkeys(tags))[:8]
def _fix_json_newlines(json_str: str) -> str:
    """Escape literal newlines/tabs inside JSON string values.

    Some sites emit ld+json with raw control characters in strings, which
    json.loads() rejects; escaping them lets parsing succeed.
    """
    result = []
    in_string = False
    for i, c in enumerate(json_str):
        if c == '"' and (i == 0 or json_str[i - 1] != '\\'):
            in_string = not in_string
            result.append(c)
        elif c == '\n' and in_string:
            result.append('\\n')
        elif c == '\t' and in_string:
            result.append('\\t')
        else:
            result.append(c)
    return ''.join(result)
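# Illustrative: given '{"a": "x<newline>y"}' with a raw newline inside the string,
# the output is '{"a": "x\ny"}' where \n is a two-character escape json.loads() accepts.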
def _extract_ld_json(soup_text: str) -> dict | None:
"""Extract Recipe from ld+json structured data in raw HTML.
Only works on raw HTML, not on Jina markdown output.
Returns None if BeautifulSoup unavailable or no Recipe found.
"""
try:
from bs4 import BeautifulSoup
except ImportError:
logging.warning("recipe: BeautifulSoup not available, skipping ld+json extraction")
return None
soup = BeautifulSoup(soup_text, "html.parser")
for script in soup.find_all("script", type="application/ld+json"):
try:
raw = script.string if script.string else ""
raw = raw.replace("\r", "")
raw = _fix_json_newlines(raw)
data = json.loads(raw)
if isinstance(data, dict) and "@graph" in data:
for item in data["@graph"]:
if item.get("@type") == "Recipe":
return item
if isinstance(data, dict) and data.get("@type") == "Recipe":
return data
if isinstance(data, list):
for item in data:
if isinstance(item, dict) and item.get("@type") == "Recipe":
return item
except (json.JSONDecodeError, TypeError):
continue
return None
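# The three ld+json layouts recognized above, illustratively:
#   {"@type": "Recipe", ...}                                      # bare object
#   {"@graph": [{"@type": "WebSite"}, {"@type": "Recipe", ...}]}  # graph wrapper
#   [{"@type": "BreadcrumbList"}, {"@type": "Recipe", ...}]       # top-level list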
# ---------------------------------------------------------------------------
# Fetch strategies
# ---------------------------------------------------------------------------
async def _fetch_via_jina(url: str, timeout: int = 30) -> str:
"""Fetch clean markdown via Jina AI reader proxy."""
jina_url = f"https://r.jina.ai/{url}"
try:
async with httpx.AsyncClient() as client:
resp = await client.get(jina_url, timeout=timeout)
resp.raise_for_status()
content = resp.text.strip()
if content and len(content) > 100:
logging.info("recipe: Jina fetch OK, %d chars", len(content))
return content[:15000]
logging.warning("recipe: Jina returned short content (%d chars)", len(content) if content else 0)
except httpx.HTTPStatusError as e:
logging.warning("recipe: Jina HTTP %d for %s", e.response.status_code, url)
except httpx.HTTPError as e:
logging.warning("recipe: Jina fetch error: %s", e)
return ""
def _build_text_from_html(html: str) -> str:
"""Strip HTML to plain text, focusing on recipe-relevant sections."""
try:
from bs4 import BeautifulSoup
except ImportError:
return html[:15000]
soup = BeautifulSoup(html, "html.parser")
for tag in soup.find_all(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe", "form"]):
tag.decompose()
recipe_selectors = [
"[id*='recipe']", "[class*='recipe']",
"[id*='ingredient']", "[class*='ingredient']",
"[id*='direction']", "[class*='direction']",
"[id*='instruction']", "[class*='instruction']",
"article", "main", ".entry-content", ".post-content"
]
recipe_content = ""
for selector in recipe_selectors:
try:
elements = soup.select(selector)
if elements:
for el in elements:
recipe_content += el.get_text(separator="\n", strip=True) + "\n\n"
except Exception:
continue
if recipe_content:
text = recipe_content
else:
text = soup.get_text(separator="\n", strip=True)
text = re.sub(r"\n{3,}", "\n\n", text)
return text[:15000]
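# The substring selectors ([class*='recipe'] etc.) are deliberately broad:
# WordPress recipe plugins commonly embed 'recipe'/'ingredient' in their
# class names, so this usually isolates the recipe card on ad-heavy pages.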
# ---------------------------------------------------------------------------
# Main extraction
# ---------------------------------------------------------------------------
async def fetch_recipe(url: str, timeout: int = 30) -> dict:
"""Fetch and extract a recipe from a URL.
Fallback chain:
1. ld+json from raw HTML (fast, structured, no LLM)
2. Jina AI clean markdown → LLM extraction
3. Raw HTML → text cleanup → LLM extraction
4. Give up with a helpful error
Returns dict with recipe data or error info.
"""
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
}
# --- Step 1: Try direct HTML fetch ---
html = ""
direct_ok = False
async with httpx.AsyncClient(follow_redirects=True) as client:
try:
resp = await client.get(url, headers=headers, timeout=timeout)
resp.raise_for_status()
html = resp.text
direct_ok = True
logging.info("recipe: direct fetch OK, %d chars, status=%d", len(html), resp.status_code)
except httpx.HTTPStatusError as e:
logging.warning("recipe: direct HTTP %d for %s", e.response.status_code, url)
except httpx.HTTPError as e:
logging.warning("recipe: direct fetch error: %s", e)
# --- Step 2: Try ld+json from raw HTML (best case, no LLM needed) ---
if html and direct_ok:
ld_recipe = _extract_ld_json(html)
if ld_recipe:
logging.info("recipe: ld+json extraction succeeded for %s", url)
recipe = {
"title": ld_recipe.get("name", "Untitled"),
"servings": _parse_servings(ld_recipe.get("recipeYield", "")),
"prep_time": _parse_duration(ld_recipe.get("prepTime")),
"cook_time": _parse_duration(ld_recipe.get("cookTime")),
"total_time": _parse_duration(ld_recipe.get("totalTime")),
"ingredients": _clean_ingredients(ld_recipe.get("recipeIngredient", [])),
"instructions": _parse_instructions(ld_recipe.get("recipeInstructions", [])),
"tags": _extract_tags(ld_recipe),
"source_url": url,
"extracted_from": "ld+json",
}
# Validate: must have at least title + ingredients
if recipe["title"] != "Untitled" and recipe["ingredients"]:
recipe_id = _save_recipe(recipe)
recipe["recipe_id"] = recipe_id
return recipe
logging.info("recipe: ld+json found but incomplete (title=%s, ingredients=%d), falling back",
recipe["title"], len(recipe["ingredients"]))
# --- Step 3: Try Jina AI for clean markdown → LLM ---
logging.info("recipe: trying Jina + LLM for %s", url)
jina_text = await _fetch_via_jina(url, timeout=timeout)
if jina_text:
prompt = RECIPE_FETCH_PROMPT.format(url=url, text=jina_text[:12000])
try:
raw = _call_llm(prompt, timeout=120)
logging.info("recipe: LLM response received, %d chars", len(raw))
except Exception as e:
logging.error("recipe: LLM call failed: %s", e)
# LLM unreachable — try HTML fallback before giving up
raw = None
if raw:
raw = raw.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
            try:
                recipe = json.loads(raw)
            except json.JSONDecodeError:
                logging.warning("recipe: LLM returned invalid JSON (first 200 chars): %s", raw[:200])
                recipe = None  # fall through to the HTML fallback below
if isinstance(recipe, dict):
if "error" in recipe:
logging.warning("recipe: LLM returned error: %s", recipe.get("error"))
# Fall through to HTML fallback
elif recipe.get("title") and recipe.get("ingredients"):
recipe["extracted_from"] = "jina+llm"
recipe_id = _save_recipe(recipe)
recipe["recipe_id"] = recipe_id
return recipe
else:
logging.warning("recipe: LLM result incomplete (title=%s, ingredients=%d)",
recipe.get("title"), len(recipe.get("ingredients", [])))
# --- Step 4: HTML text extraction → LLM (last resort) ---
if html and direct_ok:
logging.info("recipe: trying HTML text extraction + LLM for %s", url)
text = _build_text_from_html(html)
if text:
prompt = RECIPE_FETCH_PROMPT.format(url=url, text=text[:12000])
try:
raw = _call_llm(prompt, timeout=120)
logging.info("recipe: HTML+LLM response received, %d chars", len(raw))
except Exception as e:
logging.error("recipe: HTML+LLM call failed: %s", e)
return {
"error": f"Recipe extraction failed. The site may be blocking automated access, or the local LLM (Gaming PC) is offline. Error: {e}",
"source_url": url,
"diagnostics": {
"direct_fetch": "ok",
"ld_json": "no recipe found",
"jina": "ok" if jina_text else "failed",
"llm": f"error: {e}",
}
}
raw = raw.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
try:
recipe = json.loads(raw)
except json.JSONDecodeError:
logging.warning("recipe: HTML+LLM returned invalid JSON")
return {
"error": "Recipe extraction failed. Could not parse the page content.",
"source_url": url,
"diagnostics": {
"direct_fetch": "ok",
"ld_json": "no recipe found",
"jina": "ok" if jina_text else "failed",
"llm": "invalid JSON response",
}
}
if isinstance(recipe, dict) and "error" not in recipe:
if recipe.get("title") and recipe.get("ingredients"):
recipe["extracted_from"] = "html+llm"
recipe_id = _save_recipe(recipe)
recipe["recipe_id"] = recipe_id
return recipe
# --- All methods failed ---
logging.error("recipe: all extraction methods failed for %s", url)
diag = {
"direct_fetch": "ok" if direct_ok else "failed",
"ld_json": "no recipe found" if (html and direct_ok) else "skipped (no HTML)",
"jina": "ok" if jina_text else "failed",
"llm": "tried" if jina_text or (html and direct_ok) else "not reached",
}
# Helpful error message based on diagnostics
if not direct_ok and not jina_text:
error_msg = "Recipe extraction failed. Could not fetch the page (site may be down or blocking access). Try a different URL."
elif not jina_text and direct_ok:
error_msg = "Recipe extraction failed. The site may not contain a recognized recipe. Try a different URL."
else:
error_msg = "Recipe extraction failed. The page was fetched but no recipe could be identified. Try a different URL or check that the link goes directly to a recipe page."
return {
"error": error_msg,
"source_url": url,
"diagnostics": diag,
}
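# Failure payload shape, illustratively:
#   {"error": "...", "source_url": "...",
#    "diagnostics": {"direct_fetch": "ok", "ld_json": "...", "jina": "...", "llm": "..."}}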
def _slugify(text: str) -> str:
"""Slugify text, limited to keep recipe_id under Telegram's 64-byte callback_data limit.
callback_data format: toggle:RECIPE_ID:INDEX
- prefix "toggle:" = 7 chars
- separator ":" = 1 char
- INDEX = 1-2 chars
- Max recipe_id = 64 - 7 - 1 - 2 = 54 chars
- slug (max 20) + "-" + uuid8 (8) = 29 chars max -> well within limit
"""
return re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")[:20]
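# e.g. _slugify("Best Chicken Pot Pie!") -> "best-chicken-pot-pie" (capped at 20 chars)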
def _save_recipe(recipe: dict) -> str:
_ensure_recipes_dir()
title = recipe.get("title", "untitled")
slug = _slugify(title)
recipe_id = f"{slug}-{uuid.uuid4().hex[:8]}"
filepath = RECIPES_DIR / f"{recipe_id}.json"
recipe["saved_at"] = datetime.now().isoformat()
recipe["recipe_id"] = recipe_id
    with open(filepath, "w", encoding="utf-8") as f:
json.dump(recipe, f, indent=2, ensure_ascii=False)
return recipe_id
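# Recipes land as individual JSON files under DATA_DIR/recipes, e.g.
#   ~/.icarus/staging/recipes/best-chicken-pot-pie-1a2b3c4d.json  (hex suffix illustrative)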
def format_recipe(recipe: dict) -> str:
lines = [f"🍳 {recipe.get('title', 'Untitled')}"]
if recipe.get("servings"):
lines.append(f"👥 Serves {recipe['servings']}")
times = []
for t in ("prep_time", "cook_time", "total_time"):
if recipe.get(t):
times.append(f"{t.replace('_', ' ').title()}: {recipe[t]}")
if times:
lines.append("⏱ " + " | ".join(times))
lines.append("")
ingredients = recipe.get("ingredients", [])
if ingredients:
lines.append("**Ingredients:**")
for ing in ingredients:
lines.append(f" • {ing}")
instructions = recipe.get("instructions", [])
if instructions:
lines.append("")
lines.append("**Instructions:**")
for i, step in enumerate(instructions, 1):
lines.append(f" {i}. {step}")
tags = recipe.get("tags", [])
if tags:
lines.append("")
lines.append(f"🏷 {' | '.join(tags)}")
if recipe.get("source_url"):
lines.append(f"🔗 {recipe['source_url']}")
return "\n".join(lines)