"""Recipe extractor — fetch recipe URLs and extract structured ingredients + instructions.

Uses a three-step waterfall:
  1. JSON-LD Sniper — deterministic, zero LLM, ~95% hit rate
  2. HTML Chunk → LLM — heuristic extraction with timeout
  3. Hard timeout guard — never hang beyond 30s total

Then classifies ingredients into Costco zones using the existing pipeline
and stores recipes in a family dinner Rolodex (JSON file).

The core extraction logic lives in costco_route.extractors.recipe.
This module provides backward-compatible wrappers, storage, and formatting.
"""

import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from costco_route.config import LLM_URL, LLM_MODEL
from costco_route.llm_client import _call_llm
from costco_route.pipeline import optimize
from costco_route.extractors.recipe import (
    extract_recipe as _waterfall_extract,
    extract_json_ld,
    recipe_from_json_ld,
    RecipeResult,
    ExtractionMethod,
)

# ---------------------------------------------------------------------------
# Recipe storage
# ---------------------------------------------------------------------------

RECIPES_DIR = Path(os.environ.get("RECIPES_DIR", os.path.expanduser("~/.costco_route/recipes")))


def _ensure_recipes_dir() -> Path:
    """Create the recipes directory (and parents) if missing and return it."""
    RECIPES_DIR.mkdir(parents=True, exist_ok=True)
    return RECIPES_DIR


# ---------------------------------------------------------------------------
# Structured data helpers (ld+json parsing)
# ---------------------------------------------------------------------------

# A count, optionally followed by a dash-separated upper bound ("4", "4-6").
# A dash is required between the two numbers so that unrelated digits
# ("1 9x13 pan") are not glued together into a bogus range.
_SERVINGS_RE = re.compile(r"(\d+)(?:\s*[-–—]\s*(\d+))?")

# ISO 8601 duration with an optional day part; the "T" time designator is required.
_ISO_DURATION_RE = re.compile(
    r"P(?:(\d+)D)?T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)(?:\.\d+)?S)?"
)


def _parse_servings(yield_str: str | None) -> str | None:
    """Parse recipeYield into a clean servings string.

    Args:
        yield_str: Raw recipeYield value. Usually a string ("Serves 4-6",
            "4 servings", "4"); numbers and lists of strings are accepted too.

    Returns:
        "N" or "N-M" when a count is found, otherwise the stripped raw text,
        or None when the value is empty.
    """
    if not yield_str:
        return None
    if isinstance(yield_str, (list, tuple)):
        candidates = [str(v).strip() for v in yield_str if v]
    else:
        candidates = [str(yield_str).strip()]

    for text in candidates:
        if match := _SERVINGS_RE.search(text):
            low, high = match.groups()
            return f"{low}-{high}" if high else low

    # No digits anywhere: fall back to the first non-empty raw text.
    return next((text for text in candidates if text), None)


def _parse_duration(iso_duration: str | None) -> str | None:
    """Parse an ISO 8601 duration (PT30M) into a readable string.

    Examples: "PT30M" -> "30 min", "PT1H30M" -> "1 hr 30 min",
    "PT0H30M" -> "30 min", "P1DT2H" -> "26 hr".

    Args:
        iso_duration: Duration string, or None.

    Returns:
        Readable duration; the input as a string when it is not in
        "P[nD]T..." form; None when the input is empty or every component
        is zero.
    """
    if not iso_duration:
        return None
    raw = str(iso_duration)
    match = _ISO_DURATION_RE.match(raw.strip())
    if not match:
        return raw

    days, hours, minutes, seconds = (int(g) if g else 0 for g in match.groups())
    hours += days * 24  # fold days into hours so the output keeps its hr/min shape

    parts = []
    if hours:
        parts.append(f"{hours} hr")
    if minutes:
        parts.append(f"{minutes} min")
    if seconds:
        parts.append(f"{seconds} sec")
    return " ".join(parts) if parts else None


def _clean_ingredients(ingredients: list) -> list[str]:
    """Clean ingredient strings (remove newlines, collapse extra spaces).

    Args:
        ingredients: Iterable of raw ingredient values. None yields an empty
            list and a bare string is treated as a single ingredient.

    Returns:
        Non-empty, whitespace-normalised ingredient strings in input order.
    """
    if not ingredients:
        return []
    if isinstance(ingredients, str):
        # Iterating a bare string would yield one "ingredient" per character.
        ingredients = [ingredients]
    cleaned = (
        re.sub(r"\s+", " ", str(ing).replace("\r", "").replace("\n", " ")).strip()
        for ing in ingredients
        if ing
    )
    return [ing for ing in cleaned if ing]


def _parse_instructions(instructions: list | str) -> list[str]:
    """Parse recipeInstructions from ld+json into a flat list of steps.

    Handles plain text (split on newlines or "N)" markers), lists of strings,
    HowToStep dicts, a single HowToStep dict, and dicts carrying an
    "itemListElement" list (HowToSection), whose steps are flattened in order.

    Args:
        instructions: Raw recipeInstructions value.

    Returns:
        Non-empty, stripped step strings.
    """
    if not instructions:
        return []
    if isinstance(instructions, str):
        # Split on numbers or newlines
        steps = re.split(r"\n+|\d+\)\s*", instructions)
        return [s.strip() for s in steps if s.strip()]
    if isinstance(instructions, dict):
        # Iterating a dict would yield its keys as bogus "steps".
        instructions = [instructions]

    result = []
    for step in instructions:
        if isinstance(step, str):
            text = step.strip()
        elif isinstance(step, dict):
            nested = step.get("itemListElement")
            if nested:
                # A section: keep its steps rather than only its heading.
                result.extend(_parse_instructions(nested))
                continue
            text = str(step.get("text") or step.get("name") or "").strip()
        else:
            continue
        if text:
            result.append(text)
    return result


def _extract_tags(ld_recipe: dict) -> list[str]:
    """Extract tags from recipeCategory, recipeCuisine and keywords.

    Adds a "dinner" tag when the recipe name mentions a common protein.

    Args:
        ld_recipe: Recipe dict parsed from ld+json.

    Returns:
        Up to 8 unique tags in first-seen order.
    """
    tags = []
    for field in ("recipeCategory", "recipeCuisine", "keywords"):
        val = ld_recipe.get(field)
        if isinstance(val, str):
            tags.extend(t.strip() for t in val.split(","))
        elif isinstance(val, list):
            tags.extend(str(v).strip() for v in val if v)

    # Add some inference
    name = str(ld_recipe.get("name") or "").lower()
    if any(w in name for w in ("chicken", "beef", "pork", "fish")):
        tags.append("dinner")

    # dict.fromkeys dedupes while preserving order; a set would make the
    # eight survivors depend on string-hash ordering.
    return list(dict.fromkeys(t for t in tags if t))[:8]


# ---------------------------------------------------------------------------
# URL fetching + HTML cleaning
--------------------------------------------------------------------------- RECIPE_FETCH_PROMPT = """SYSTEM: You are a recipe extraction engine. Extract the recipe from the following text, which was scraped from a recipe website. The text will contain ads, navigation, stories, and other garbage. Ignore ALL of that. Extract ONLY the recipe. Return a JSON object with this exact structure: {{ "title": "Recipe title", "servings": "Number of servings (e.g., '4' or '6-8')", "prep_time": "Prep time if stated (e.g., '15 min'), null if not found", "cook_time": "Cook time if stated (e.g., '30 min'), null if not found", "total_time": "Total time if stated, null if not found", "ingredients": ["ingredient 1", "ingredient 2", ...], "instructions": ["step 1", "step 2", ...], "tags": ["dinner", "chicken", "easy", ...], "source_url": "the URL provided" }} RULES: - ingredients: One item per list entry, including quantities (e.g., "2 cups flour", "1 lb chicken thighs") - instructions: Numbered steps as separate list entries. Combine multiple short steps if they're fragments. - tags: 3-8 relevant tags for categorization (meal type, protein, cuisine, difficulty) - If the page contains multiple recipes, extract the MAIN one - If no recipe is found, return {{"error": "no recipe found", "source_url": "..."}} - Strip ALL ad copy, nutrition disclaimers, "jump to recipe" buttons, social sharing text - Return ONLY the JSON object, no markdown fences USER: Extract the recipe from this text: URL: {url} TEXT: {text} RESPONSE (JSON only):""" def _extract_ld_json(soup: BeautifulSoup) -> dict | None: """Extract structured recipe data from ld+json script tags. Returns: Recipe dict if found, None otherwise. 
""" for script in soup.find_all("script", type="application/ld+json"): try: raw = script.string if script.string else "" # Remove carriage returns (Windows line endings in strings) raw = raw.replace("\r", "") # Fix unescaped newlines inside JSON strings (common recipe site issue) # We need to escape \n that appear inside quoted string values raw = _fix_json_newlines(raw) data = json.loads(raw) # Handle @graph structure (some sites wrap recipes) if isinstance(data, dict) and "@graph" in data: for item in data["@graph"]: if item.get("@type") == "Recipe": return item # Direct Recipe type if isinstance(data, dict) and data.get("@type") == "Recipe": return data # Array of items if isinstance(data, list): for item in data: if isinstance(item, dict) and item.get("@type") == "Recipe": return item except (json.JSONDecodeError, TypeError): continue return None def _fix_json_newlines(json_str: str) -> str: """Fix unescaped newlines inside JSON string values. Recipe sites often have multiline strings in their ld+json that aren't properly escaped. This attempts to fix them by escaping newlines that appear inside quoted strings. """ result = [] in_string = False i = 0 while i < len(json_str): c = json_str[i] if c == '"' and (i == 0 or json_str[i-1] != '\\'): in_string = not in_string result.append(c) elif c == '\n' and in_string: # Replace unescaped newline with escaped newline result.append('\\n') elif c == '\t' and in_string: # Replace unescaped tab with escaped tab result.append('\\t') else: result.append(c) i += 1 return ''.join(result) BROWSERLESS_URL = os.environ.get("BROWSERLESS_URL", "http://127.0.0.1:3000") # Pull from config if available (centralized), otherwise use the fallback above try: from costco_route.config import BROWSERLESS_URL as _BL_CFG BROWSERLESS_URL = _BL_CFG except ImportError: pass def _fetch_via_browserless(url: str, timeout: int = 30) -> str: """Fetch a URL through local Browserless container (headless Chrome). 
Renders JavaScript, bypasses bot detection, returns full HTML. Best used to extract ld+json from JS-heavy sites where direct requests fail. Returns: Raw HTML string, or empty string on failure. """ try: resp = requests.post( f"{BROWSERLESS_URL}/content", json={"url": url}, timeout=timeout, ) resp.raise_for_status() content = resp.text.strip() if content and len(content) > 200: return content[:50000] # Cap at 50K chars except requests.RequestException: pass return "" def _fetch_via_jina(url: str, timeout: int = 30) -> str: """Fetch a URL through Jina Reader API and return clean markdown. Jina Reader renders JavaScript server-side and strips ads/nav/cookies, returning clean Markdown. Much better than BeautifulSoup for JS-heavy sites. Returns: Clean markdown text, or empty string on failure. """ jina_url = f"https://r.jina.ai/{url}" headers = {"Accept": "text/markdown"} try: resp = requests.get(jina_url, headers=headers, timeout=timeout) resp.raise_for_status() content = resp.text.strip() if content and len(content) > 100: # Skip obviously empty pages return content[:15000] except requests.RequestException: pass return "" def _build_text_from_html(soup: BeautifulSoup) -> str: """Build clean text from HTML, preserving recipe content. This is a gentler approach than the original — targets only clear navigation/ad elements, not broad class/id regexes that strip content. 
""" # Make a copy to avoid modifying original soup = BeautifulSoup(str(soup), "html.parser") # Remove obvious non-content elements for tag in soup.find_all(["script", "style", "nav", "footer", "header", "aside", "noscript", "iframe", "form"]): tag.decompose() # Try to find recipe-specific content areas first recipe_selectors = [ "[id*='recipe']", "[class*='recipe']", "[id*='ingredient']", "[class*='ingredient']", "[id*='direction']", "[class*='direction']", "[id*='instruction']", "[class*='instruction']", "article", "main", ".entry-content", ".post-content" ] recipe_content = "" for selector in recipe_selectors: try: elements = soup.select(selector) if elements: for el in elements: recipe_content += el.get_text(separator="\n", strip=True) + "\n\n" except Exception: continue if recipe_content: text = recipe_content else: # Fallback: use body text text = soup.get_text(separator="\n", strip=True) # Collapse excessive whitespace text = re.sub(r"\n{3,}", "\n\n", text) return text[:15000] # Cap length def fetch_recipe(url: str, timeout: int = 30) -> dict: """Fetch a recipe URL and extract structured data using the waterfall. Waterfall: JSON-LD Sniper → HTML Chunk + LLM → Timeout guard Zero domain checks. Schema-based extraction only. Args: url: Recipe URL to fetch timeout: Overall timeout in seconds (default 30) Returns: Dict with title, ingredients, instructions, etc. 
""" # Use the new waterfall extractor result = _waterfall_extract(url, total_timeout=timeout) # Convert RecipeResult to dict for backward compatibility recipe = result.to_dict() # Handle extraction failure if result.error: return recipe # Already has error key # Classify ingredients into Costco zones ingredients = recipe.get("ingredients", []) if ingredients: ingredient_text = "\n".join(ingredients) route = optimize(ingredient_text, use_memory=True, markdown=False) recipe["zone_map"] = route.get("output", "") recipe["classified_ingredients"] = route.get("classified", {}) recipe["route_zones"] = list(route.get("classified", {}).keys()) # Save to Rolodex recipe_id = _save_recipe(recipe) recipe["recipe_id"] = recipe_id return recipe def _save_recipe(recipe: dict) -> str: """Save a recipe to the Rolodex. Returns: Recipe ID (slug-based). """ _ensure_recipes_dir() title = recipe.get("title", "untitled") slug = re.sub(r"[^a-z0-9]+", "-", title.lower()).strip("-")[:50] recipe_id = f"{slug}" filepath = RECIPES_DIR / f"{recipe_id}.json" # Add metadata recipe["saved_at"] = datetime.now().isoformat() recipe["recipe_id"] = recipe_id with open(filepath, "w") as f: json.dump(recipe, f, indent=2, ensure_ascii=False) return recipe_id def list_recipes() -> list[dict]: """List all saved recipes in the Rolodex. Returns: List of recipe summary dicts (id, title, tags, servings, saved_at). """ _ensure_recipes_dir() recipes = [] for filepath in sorted(RECIPES_DIR.glob("*.json")): try: with open(filepath) as f: r = json.load(f) recipes.append({ "id": r.get("recipe_id", filepath.stem), "title": r.get("title", "Unknown"), "tags": r.get("tags", []), "servings": r.get("servings"), "saved_at": r.get("saved_at"), "ingredient_count": len(r.get("ingredients", [])), }) except (json.JSONDecodeError, KeyError): continue return recipes def get_recipe(recipe_id: str) -> dict | None: """Load a specific recipe by ID. 
Args: recipe_id: Recipe ID (slug from title) Returns: Full recipe dict or None if not found. """ _ensure_recipes_dir() filepath = RECIPES_DIR / f"{recipe_id}.json" if not filepath.exists(): return None with open(filepath) as f: return json.load(f) def delete_recipe(recipe_id: str) -> bool: """Delete a recipe from the Rolodex. Returns: True if deleted, False if not found. """ filepath = RECIPES_DIR / f"{recipe_id}.json" if filepath.exists(): filepath.unlink() return True return False def format_recipe(recipe: dict, include_zones: bool = True) -> str: """Format a recipe for display (Telegram-friendly). Args: recipe: Recipe dict from fetch_recipe() or get_recipe() include_zones: Whether to include Costco zone classification Returns: Formatted string. """ lines = [] title = recipe.get("title", "Untitled Recipe") lines.append(f"🍳 **{title}**") if recipe.get("servings"): lines.append(f"👥 Serves {recipe['servings']}") times = [] for t in ("prep_time", "cook_time", "total_time"): if recipe.get(t): label = t.replace("_", " ").title() times.append(f"{label}: {recipe[t]}") if times: lines.append("⏱ " + " | ".join(times)) lines.append("") ingredients = recipe.get("ingredients", []) if ingredients: lines.append("**Ingredients:**") for ing in ingredients: lines.append(f" • {ing}") instructions = recipe.get("instructions", []) if instructions: lines.append("") lines.append("**Instructions:**") for i, step in enumerate(instructions, 1): lines.append(f" {i}. {step}") if include_zones and recipe.get("zone_map"): lines.append("") lines.append("**Costco Route:**") lines.append(recipe["zone_map"]) tags = recipe.get("tags", []) if tags: lines.append("") lines.append(f"🏷 {' | '.join(tags)}") source = recipe.get("source_url") if source: lines.append(f"🔗 {source}") return "\n".join(lines)