"""Recipe extractor — fetch recipe URLs and extract structured ingredients + instructions.

Sovereign port from costco_route/recipe_extractor.py — zero imports from costco_route.

Extraction fallback chain:
  1. ld+json from raw HTML (most reliable, zero LLM cost)
  2. Jina AI clean markdown → LLM (reliable, needs network)
  3. Raw HTML → text extraction → LLM (last resort, local)
"""

import asyncio
import json
import logging
import os
import re
import sqlite3
import uuid
from datetime import datetime, timedelta
from pathlib import Path

import httpx

from icarus.core.config.staging import LLM_MODEL, LLM_URL

# ---------------------------------------------------------------------------
# Recipe storage
# ---------------------------------------------------------------------------

DATA_DIR = Path(os.environ.get("DATA_DIR", Path.home() / ".icarus/staging"))
RECIPES_DIR = DATA_DIR / "recipes"

# Upper bounds on how much page text is kept / sent to the LLM.
_MAX_FETCH_CHARS = 15000
_MAX_PROMPT_CHARS = 12000


def _ensure_recipes_dir() -> Path:
    """Create RECIPES_DIR (including parents) if needed and return it."""
    RECIPES_DIR.mkdir(parents=True, exist_ok=True)
    return RECIPES_DIR


# ---------------------------------------------------------------------------
# LLM helper
# ---------------------------------------------------------------------------

def _call_llm(prompt: str, timeout: int = 120) -> str:
    """POST a single-user-message chat request to LLM_URL and return the reply text.

    This is a blocking call; coroutines should run it through
    ``asyncio.to_thread``.

    Raises:
        httpx.HTTPError: on transport failure or a non-2xx response.
        KeyError: if the response has neither of the two expected shapes.
    """
    payload = {
        "model": LLM_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    resp = httpx.post(LLM_URL, json=payload, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()
    # OpenAI-compatible /v1/chat/completions returns choices[0].message.content
    if "choices" in data:
        return data["choices"][0]["message"]["content"]
    # Ollama native /api/chat returns message.content
    return data["message"]["content"]


# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------

RECIPE_FETCH_PROMPT = """SYSTEM: You are a recipe extraction engine. Extract the recipe from the following text, which was scraped from a recipe website. The text will contain ads, navigation, stories, and other garbage. Ignore ALL of that. Extract ONLY the recipe.

Return a JSON object with this exact structure:
{{
  "title": "Recipe title",
  "servings": "Number of servings (e.g., '4' or '6-8')",
  "prep_time": "Prep time if stated (e.g., '15 min'), null if not found",
  "cook_time": "Cook time if stated (e.g., '30 min'), null if not found",
  "total_time": "Total time if stated, null if not found",
  "ingredients": ["ingredient 1", "ingredient 2", ...],
  "instructions": ["step 1", "step 2", ...],
  "tags": ["dinner", "chicken", "easy", ...],
  "source_url": "the URL provided"
}}

RULES:
- ingredients: One item per list entry, including quantities (e.g., "2 cups flour", "1 lb chicken thighs")
- instructions: Numbered steps as separate list entries. Combine multiple short steps if they're fragments.
- tags: 3-8 relevant tags for categorization (meal type, protein, cuisine, difficulty)
- If the page contains multiple recipes, extract the MAIN one
- If no recipe is found, return {{"error": "no recipe found", "source_url": "..."}}
- Strip ALL ad copy, nutrition disclaimers, "jump to recipe" buttons, social sharing text
- Return ONLY the JSON object, no markdown fences

USER: Extract the recipe from this text:

URL: {url}

TEXT:
{text}

RESPONSE (JSON only):"""


# ---------------------------------------------------------------------------
# ld+json structured data extraction
# ---------------------------------------------------------------------------

# Day/hour/minute/second subset of ISO-8601 durations, e.g. "P0DT1H30M0S".
_ISO_DURATION_RE = re.compile(
    r'P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:\d+(?:\.\d+)?S)?)?'
)


def _parse_servings(yield_str: str | None) -> str | None:
    """Reduce a recipeYield value to a short servings string such as "4" or "6-8".

    Accepts a string, a number, or a list (the first non-empty entry is used).
    Returns None for empty input; if no number is present the stripped text
    is returned unchanged.
    """
    if isinstance(yield_str, (list, tuple)):
        # A list like ["4", "4 servings"]: use the first usable entry
        # instead of parsing the list's repr.
        yield_str = next((item for item in yield_str if item), None)
    if not yield_str:
        return None
    yield_str = str(yield_str).strip()
    if match := re.search(r'(?i)(serves?\s+)?(\d+[\s\-]*\d*)', yield_str):
        return match.group(2).strip()
    return yield_str if yield_str else None


def _parse_duration(iso_duration: str | None) -> str | None:
    """Convert an ISO-8601 duration ("PT1H30M") into text like "1 hr 30 min".

    Days are folded into hours, seconds are ignored, and zero-valued
    components are omitted. Returns None for empty input or a duration with
    no hours/minutes; text that is not an ISO duration is returned as-is.
    """
    if not iso_duration:
        return None
    text = str(iso_duration).strip()

    full = _ISO_DURATION_RE.fullmatch(text)
    if full:
        days, hours, minutes = (int(g) if g else 0 for g in full.groups())
        hours += days * 24
    else:
        # Lenient prefix match so slightly malformed values still yield
        # whatever hours/minutes they start with.
        prefix = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?', text)
        if not prefix:
            return text
        hours, minutes = (int(g) if g else 0 for g in prefix.groups())

    parts = []
    if hours:
        parts.append(f"{hours} hr")
    if minutes:
        parts.append(f"{minutes} min")
    return " ".join(parts) if parts else None


def _clean_ingredients(ingredients: list) -> list[str]:
    """Normalize whitespace in each ingredient and drop empty entries.

    A single string is split on line breaks rather than iterated character
    by character; None is treated as an empty list.
    """
    if isinstance(ingredients, str):
        ingredients = ingredients.splitlines()
    normalized = (
        re.sub(r'\s+', ' ', str(ing).replace('\r', '')).strip()
        for ing in ingredients or []
        if ing
    )
    return [ing for ing in normalized if ing]


def _parse_instructions(instructions: list | str) -> list[str]:
    """Flatten recipeInstructions into a list of step strings.

    Handles a plain string (split on newlines or "1)"-style markers), a list
    of strings, a list of step dicts ("text" or "name"), and dicts with an
    "itemListElement" list (sections), whose steps are flattened in order.
    """
    if isinstance(instructions, str):
        steps = re.split(r'\n+|\d+\)\s*', instructions)
        return [s.strip() for s in steps if s.strip()]
    if isinstance(instructions, dict):
        # A single step/section object: iterating it would yield its keys.
        instructions = [instructions]

    result = []
    for step in instructions or []:
        if isinstance(step, dict):
            nested = step.get("itemListElement")
            if nested:
                # Section: its own "name" is a heading, the steps are nested.
                result.extend(_parse_instructions(nested))
                continue
            text = step.get("text") or step.get("name")
            if text:
                result.append(str(text).strip())
        elif isinstance(step, str) and step.strip():
            result.append(step.strip())
    return result


def _extract_tags(ld_recipe: dict) -> list[str]:
    """Collect up to 8 unique tags from category/cuisine/keywords fields.

    Adds "dinner" when the recipe name mentions chicken, beef, pork or fish.
    First-seen order is preserved so the result is deterministic.
    """
    tags = []
    for field in ("recipeCategory", "recipeCuisine", "keywords"):
        val = ld_recipe.get(field)
        if isinstance(val, str):
            tags.extend(t.strip() for t in val.split(","))
        elif isinstance(val, list):
            tags.extend(str(v).strip() for v in val if v)
    name = str(ld_recipe.get("name") or "").lower()
    if any(word in name for word in ("chicken", "beef", "pork", "fish")):
        tags.append("dinner")
    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # make both the order and the surviving 8 tags arbitrary.
    return list(dict.fromkeys(t for t in tags if t))[:8]


def _fix_json_newlines(json_str: str) -> str:
    """Escape raw newlines and tabs that appear inside JSON string literals.

    Some sites emit ld+json with literal line breaks inside strings, which
    json.loads rejects. Escape state is tracked explicitly so an escaped
    backslash before a closing quote (``\\\\"``) is not mistaken for an
    escaped quote.
    """
    out = []
    in_string = False
    escaped = False
    for ch in json_str:
        if in_string and ch == '\n':
            out.append('\\n')
            escaped = False
        elif in_string and ch == '\t':
            out.append('\\t')
            escaped = False
        elif escaped:
            # Character consumed by the preceding backslash.
            out.append(ch)
            escaped = False
        elif ch == '\\':
            out.append(ch)
            escaped = True
        elif ch == '"':
            in_string = not in_string
            out.append(ch)
        else:
            out.append(ch)
    return ''.join(out)


def _is_recipe_node(node) -> bool:
    """Return True if node is a dict whose @type is, or contains, "Recipe"."""
    if not isinstance(node, dict):
        return False
    node_type = node.get("@type")
    if isinstance(node_type, list):
        return "Recipe" in node_type
    return node_type == "Recipe"


def _find_recipe_node(data) -> dict | None:
    """Locate a Recipe node in parsed ld+json (dict, @graph container, or list)."""
    if isinstance(data, list):
        for item in data:
            found = _find_recipe_node(item)
            if found is not None:
                return found
        return None
    if not isinstance(data, dict):
        return None
    graph = data.get("@graph")
    if isinstance(graph, list):
        for item in graph:
            if _is_recipe_node(item):
                return item
    return data if _is_recipe_node(data) else None


def _extract_ld_json(soup_text: str) -> dict | None:
    """Extract Recipe from ld+json structured data in raw HTML.

    Only works on raw HTML, not on Jina markdown output.
    Returns None if BeautifulSoup unavailable or no Recipe found.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        logging.warning("recipe: BeautifulSoup not available, skipping ld+json extraction")
        return None

    soup = BeautifulSoup(soup_text, "html.parser")
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            raw = script.string if script.string else ""
            raw = raw.replace("\r", "")
            raw = _fix_json_newlines(raw)
            data = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            continue
        found = _find_recipe_node(data)
        if found is not None:
            return found
    return None


# ---------------------------------------------------------------------------
# Fetch strategies
# ---------------------------------------------------------------------------

async def _fetch_via_jina(url: str, timeout: int = 30) -> str:
    """Fetch clean markdown via Jina AI reader proxy.

    Returns at most 15000 characters, or "" on HTTP errors or when the
    response is 100 characters or shorter.
    """
    jina_url = f"https://r.jina.ai/{url}"
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(jina_url, timeout=timeout)
            resp.raise_for_status()
            content = resp.text.strip()
            if content and len(content) > 100:
                logging.info("recipe: Jina fetch OK, %d chars", len(content))
                return content[:_MAX_FETCH_CHARS]
            logging.warning("recipe: Jina returned short content (%d chars)", len(content) if content else 0)
    except httpx.HTTPStatusError as e:
        logging.warning("recipe: Jina HTTP %d for %s", e.response.status_code, url)
    except httpx.HTTPError as e:
        logging.warning("recipe: Jina fetch error: %s", e)
    return ""


def _build_text_from_html(html: str) -> str:
    """Strip HTML to plain text, focusing on recipe-relevant sections.

    Falls back to the whole page text when no recipe-looking element has
    text, and to the raw HTML prefix when BeautifulSoup is unavailable.
    The result is capped at 15000 characters.
    """
    try:
        from bs4 import BeautifulSoup
    except ImportError:
        return html[:_MAX_FETCH_CHARS]

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.find_all(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe", "form"]):
        tag.decompose()

    recipe_selectors = [
        "[id*='recipe']", "[class*='recipe']",
        "[id*='ingredient']", "[class*='ingredient']",
        "[id*='direction']", "[class*='direction']",
        "[id*='instruction']", "[class*='instruction']",
        "article", "main", ".entry-content", ".post-content",
    ]

    chunks = []
    captured = set()  # id() of elements whose text is already in chunks
    for selector in recipe_selectors:
        try:
            for el in soup.select(selector):
                # The selectors overlap heavily; an element that was already
                # captured, or that sits inside a captured element, would only
                # repeat text and eat into the character budget.
                if id(el) in captured or any(id(p) in captured for p in el.parents):
                    continue
                captured.add(id(el))
                chunks.append(el.get_text(separator="\n", strip=True))
        except Exception as e:
            # Best effort: a selector the installed parser cannot handle
            # must not abort extraction.
            logging.debug("recipe: selector %s failed: %s", selector, e)
            continue

    recipe_content = "".join(chunk + "\n\n" for chunk in chunks)
    if recipe_content.strip():
        text = recipe_content
    else:
        text = soup.get_text(separator="\n", strip=True)

    text = re.sub(r"\n{3,}", "\n\n", text)
    return text[:_MAX_FETCH_CHARS]


# ---------------------------------------------------------------------------
# Main extraction
# ---------------------------------------------------------------------------

def _strip_code_fences(raw: str) -> str:
    """Remove a surrounding markdown code fence from an LLM reply."""
    return raw.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()


def _finalize_llm_recipe(recipe: dict, url: str, method: str) -> dict:
    """Stamp provenance on an LLM-extracted recipe, save it, and return it."""
    recipe["extracted_from"] = method
    # The real URL is known here; do not trust whatever the model echoed.
    recipe["source_url"] = url
    recipe_id = _save_recipe(recipe)
    recipe["recipe_id"] = recipe_id
    return recipe


async def fetch_recipe(url: str, timeout: int = 30) -> dict:
    """Fetch and extract a recipe from a URL.

    Fallback chain:
      1. ld+json from raw HTML (fast, structured, no LLM)
      2. Jina AI clean markdown → LLM extraction
      3. Raw HTML → text cleanup → LLM extraction
      4. Give up with a helpful error

    Returns dict with recipe data or error info. A successful result is
    also written to RECIPES_DIR and carries "recipe_id" and
    "extracted_from"; a failure carries "error", "source_url" and
    "diagnostics".
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }

    # --- Step 1: Try direct HTML fetch ---
    html = ""
    direct_ok = False
    async with httpx.AsyncClient(follow_redirects=True) as client:
        try:
            resp = await client.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            html = resp.text
            direct_ok = True
            logging.info("recipe: direct fetch OK, %d chars, status=%d", len(html), resp.status_code)
        except httpx.HTTPStatusError as e:
            logging.warning("recipe: direct HTTP %d for %s", e.response.status_code, url)
        except httpx.HTTPError as e:
            logging.warning("recipe: direct fetch error: %s", e)

    have_html = bool(html) and direct_ok

    # --- Step 2: Try ld+json from raw HTML (best case, no LLM needed) ---
    if have_html:
        ld_recipe = _extract_ld_json(html)
        if ld_recipe:
            logging.info("recipe: ld+json extraction succeeded for %s", url)
            recipe = {
                # "or" rather than a .get default: an explicit null name must
                # also count as missing.
                "title": ld_recipe.get("name") or "Untitled",
                "servings": _parse_servings(ld_recipe.get("recipeYield", "")),
                "prep_time": _parse_duration(ld_recipe.get("prepTime")),
                "cook_time": _parse_duration(ld_recipe.get("cookTime")),
                "total_time": _parse_duration(ld_recipe.get("totalTime")),
                "ingredients": _clean_ingredients(ld_recipe.get("recipeIngredient", [])),
                "instructions": _parse_instructions(ld_recipe.get("recipeInstructions", [])),
                "tags": _extract_tags(ld_recipe),
                "source_url": url,
                "extracted_from": "ld+json",
            }
            # Validate: must have at least title + ingredients
            if recipe["title"] != "Untitled" and recipe["ingredients"]:
                recipe_id = _save_recipe(recipe)
                recipe["recipe_id"] = recipe_id
                return recipe
            logging.info("recipe: ld+json found but incomplete (title=%s, ingredients=%d), falling back",
                         recipe["title"], len(recipe["ingredients"]))

    # --- Step 3: Try Jina AI for clean markdown → LLM ---
    logging.info("recipe: trying Jina + LLM for %s", url)
    jina_text = await _fetch_via_jina(url, timeout=timeout)

    if jina_text:
        prompt = RECIPE_FETCH_PROMPT.format(url=url, text=jina_text[:_MAX_PROMPT_CHARS])
        try:
            # _call_llm blocks for up to `timeout` seconds; run it off the
            # event loop so other coroutines keep running.
            raw = await asyncio.to_thread(_call_llm, prompt, timeout=120)
            logging.info("recipe: LLM response received, %d chars", len(raw))
        except Exception as e:
            logging.error("recipe: LLM call failed: %s", e)
            # LLM unreachable — try HTML fallback before giving up
            raw = None

        if raw:
            raw = _strip_code_fences(raw)
            try:
                recipe = json.loads(raw)
            except json.JSONDecodeError:
                logging.warning("recipe: LLM returned invalid JSON (first 200 chars): %s", raw[:200])
                # Reset explicitly: otherwise `recipe` is unbound here, or
                # still holds the incomplete ld+json dict from step 2.
                recipe = None

            if isinstance(recipe, dict):
                if "error" in recipe:
                    logging.warning("recipe: LLM returned error: %s", recipe.get("error"))
                    # Fall through to HTML fallback
                elif recipe.get("title") and recipe.get("ingredients"):
                    return _finalize_llm_recipe(recipe, url, "jina+llm")
                else:
                    # "or []": the model may return an explicit null.
                    logging.warning("recipe: LLM result incomplete (title=%s, ingredients=%d)",
                                    recipe.get("title"), len(recipe.get("ingredients") or []))

    # --- Step 4: HTML text extraction → LLM (last resort) ---
    if have_html:
        logging.info("recipe: trying HTML text extraction + LLM for %s", url)
        text = _build_text_from_html(html)
        if text:
            prompt = RECIPE_FETCH_PROMPT.format(url=url, text=text[:_MAX_PROMPT_CHARS])
            try:
                raw = await asyncio.to_thread(_call_llm, prompt, timeout=120)
                logging.info("recipe: HTML+LLM response received, %d chars", len(raw))
            except Exception as e:
                logging.error("recipe: HTML+LLM call failed: %s", e)
                return {
                    "error": f"Recipe extraction failed. The site may be blocking automated access, or the local LLM (Gaming PC) is offline. Error: {e}",
                    "source_url": url,
                    "diagnostics": {
                        "direct_fetch": "ok",
                        "ld_json": "no recipe found",
                        "jina": "ok" if jina_text else "failed",
                        "llm": f"error: {e}",
                    }
                }

            raw = _strip_code_fences(raw)
            try:
                recipe = json.loads(raw)
            except json.JSONDecodeError:
                logging.warning("recipe: HTML+LLM returned invalid JSON")
                return {
                    "error": "Recipe extraction failed. Could not parse the page content.",
                    "source_url": url,
                    "diagnostics": {
                        "direct_fetch": "ok",
                        "ld_json": "no recipe found",
                        "jina": "ok" if jina_text else "failed",
                        "llm": "invalid JSON response",
                    }
                }

            if isinstance(recipe, dict) and "error" not in recipe:
                if recipe.get("title") and recipe.get("ingredients"):
                    return _finalize_llm_recipe(recipe, url, "html+llm")

    # --- All methods failed ---
    logging.error("recipe: all extraction methods failed for %s", url)
    diag = {
        "direct_fetch": "ok" if direct_ok else "failed",
        "ld_json": "no recipe found" if have_html else "skipped (no HTML)",
        "jina": "ok" if jina_text else "failed",
        "llm": "tried" if jina_text or have_html else "not reached",
    }

    # Helpful error message based on diagnostics
    if not direct_ok and not jina_text:
        error_msg = "Recipe extraction failed. Could not fetch the page (site may be down or blocking access). Try a different URL."
    elif not jina_text and direct_ok:
        error_msg = "Recipe extraction failed. The site may not contain a recognized recipe. Try a different URL."
    else:
        error_msg = "Recipe extraction failed. The page was fetched but no recipe could be identified. Try a different URL or check that the link goes directly to a recipe page."

    return {
        "error": error_msg,
        "source_url": url,
        "diagnostics": diag,
    }


def _slugify(text: str) -> str:
    """Slugify text, limited to keep recipe_id under Telegram's 64-byte callback_data limit.

    callback_data format: toggle:RECIPE_ID:INDEX
    - prefix "toggle:" = 7 chars
    - separator ":" = 1 char
    - INDEX = 1-2 chars
    - Max recipe_id = 64 - 7 - 1 - 2 = 54 chars
    - slug (max 20) + "-" + uuid8 (8) = 29 chars max -> well within limit
    """
    slug = re.sub(r"[^a-z0-9]+", "-", str(text).lower()).strip("-")[:20]
    # Truncation can land right after a hyphen; strip again so the id does
    # not end up with "--" before the uuid suffix.
    return slug.strip("-")


def _save_recipe(recipe: dict) -> str:
    """Write the recipe to RECIPES_DIR as JSON and return its new recipe_id.

    Mutates ``recipe`` by setting "saved_at" and "recipe_id". The id is
    "<slug>-<8 hex chars>"; "untitled" stands in for the slug when the title
    is missing or contains no ASCII letters or digits.
    """
    _ensure_recipes_dir()
    title = str(recipe.get("title") or "untitled")
    slug = _slugify(title) or "untitled"
    recipe_id = f"{slug}-{uuid.uuid4().hex[:8]}"
    filepath = RECIPES_DIR / f"{recipe_id}.json"
    recipe["saved_at"] = datetime.now().isoformat()
    recipe["recipe_id"] = recipe_id
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must not depend on the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(recipe, f, indent=2, ensure_ascii=False)
    return recipe_id


def format_recipe(recipe: dict) -> str:
    """Render a recipe dict as a markdown-style text message.

    Sections (servings, times, ingredients, instructions, tags, source link)
    are emitted only when the corresponding field is present and non-empty.
    """
    lines = [f"🍳 **{recipe.get('title', 'Untitled')}**"]
    if recipe.get("servings"):
        lines.append(f"👥 Serves {recipe['servings']}")

    times = [
        f"{key.replace('_', ' ').title()}: {recipe[key]}"
        for key in ("prep_time", "cook_time", "total_time")
        if recipe.get(key)
    ]
    if times:
        lines.append("⏱ " + " | ".join(times))
    lines.append("")

    ingredients = recipe.get("ingredients", [])
    if ingredients:
        lines.append("**Ingredients:**")
        lines.extend(f" • {ing}" for ing in ingredients)

    instructions = recipe.get("instructions", [])
    if instructions:
        lines.append("")
        lines.append("**Instructions:**")
        lines.extend(f" {i}. {step}" for i, step in enumerate(instructions, 1))

    tags = recipe.get("tags", [])
    if tags:
        lines.append("")
        # str(): LLM-produced tags are not guaranteed to be strings.
        lines.append(f"🏷 {' | '.join(str(tag) for tag in tags)}")

    if recipe.get("source_url"):
        lines.append(f"🔗 {recipe['source_url']}")
    return "\n".join(lines)