"""Tests for the waterfall recipe extractor. Test case: https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/ Expected: JSON-LD sniper finds Recipe schema, returns ingredients without LLM call. """ import json import os import sys import pytest # Add parent to path for imports sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from bs4 import BeautifulSoup from costco_route.extractors.recipe import ( extract_json_ld, extract_recipe, extract_recipe_waterfall, recipe_from_json_ld, RecipeResult, ExtractionMethod, _flatten_ld_json, _is_recipe_type, _fix_json_newlines, _extract_html_chunk, _parse_servings, _parse_duration, _clean_ingredients, _parse_instructions, _extract_tags, ) # --------------------------------------------------------------------------- # Sample ld+json data for unit tests # --------------------------------------------------------------------------- SAMPLE_LD_JSON_DIRECT = ''' ''' SAMPLE_LD_JSON_GRAPH = ''' ''' SAMPLE_LD_JSON_LIST = ''' ''' SAMPLE_HTML_NO_LD = '''

Simple Pasta

Ingredients: 1 lb pasta, 2 cups sauce, 1 tbsp olive oil

Instructions: Boil pasta. Add sauce. Serve.

''' SAMPLE_HTML_WITH_LD = f''' {SAMPLE_LD_JSON_DIRECT}

Funeral Sandwiches

''' # --------------------------------------------------------------------------- # Test: _flatten_ld_json # --------------------------------------------------------------------------- class TestFlattenLdJson: def test_direct_dict(self): data = {"@type": "Recipe", "name": "Test"} result = _flatten_ld_json(data) assert len(result) == 1 assert result[0]["@type"] == "Recipe" def test_list(self): data = [ {"@type": "WebPage"}, {"@type": "Recipe", "name": "Test"}, ] result = _flatten_ld_json(data) assert len(result) == 2 def test_graph(self): data = { "@context": "https://schema.org", "@graph": [ {"@type": "WebPage"}, {"@type": "Recipe", "name": "Test"}, ] } result = _flatten_ld_json(data) # Should have parent dict + 2 graph items assert len(result) >= 3 recipe_items = [r for r in result if r.get("@type") == "Recipe"] assert len(recipe_items) == 1 def test_nested_graph(self): data = { "@context": "https://schema.org", "@graph": [ {"@type": "WebPage"}, { "@type": "ItemList", "@graph": [ {"@type": "Recipe", "name": "Nested"} ] } ] } result = _flatten_ld_json(data) # Should recursively flatten assert len(result) >= 3 # --------------------------------------------------------------------------- # Test: _is_recipe_type # --------------------------------------------------------------------------- class TestIsRecipeType: def test_string_type(self): assert _is_recipe_type({"@type": "Recipe"}) is True def test_list_type(self): assert _is_recipe_type({"@type": ["Recipe", "CreativeWork"]}) is True def test_non_recipe(self): assert _is_recipe_type({"@type": "WebPage"}) is False def test_no_type(self): assert _is_recipe_type({}) is False def test_non_dict(self): assert _is_recipe_type("string") is False # --------------------------------------------------------------------------- # Test: _fix_json_newlines # --------------------------------------------------------------------------- class TestFixJsonNewlines: def test_no_newlines(self): assert _fix_json_newlines('{"key": "value"}') 
== '{"key": "value"}' def test_escaped_newline_preserved(self): # Already escaped newlines should be preserved s = '{"key": "line1\\nline2"}' result = _fix_json_newlines(s) # The \\n should stay as \\n assert "\\n" in result def test_unescaped_newline_fixed(self): # Unescaped newline inside a string should be fixed s = '{"key": "line1\nline2"}' result = _fix_json_newlines(s) # Should now be parseable JSON data = json.loads(result) assert data["key"] == "line1\nline2" def test_tab_inside_string(self): s = '{"key": "col1\tcol2"}' result = _fix_json_newlines(s) data = json.loads(result) assert data["key"] == "col1\tcol2" # --------------------------------------------------------------------------- # Test: extract_json_ld # --------------------------------------------------------------------------- class TestExtractJsonLd: def test_direct_recipe_type(self): result = extract_json_ld(SAMPLE_HTML_WITH_LD) assert result is not None assert result.get("@type") == "Recipe" assert result.get("name") == "Funeral Sandwiches" assert len(result.get("recipeIngredient", [])) == 6 def test_graph_structure(self): html = f"{SAMPLE_LD_JSON_GRAPH}" result = extract_json_ld(html) assert result is not None assert result.get("@type") == "Recipe" assert result.get("name") == "Funeral Sandwiches" def test_list_of_scripts(self): html = f"{SAMPLE_LD_JSON_LIST}" result = extract_json_ld(html) assert result is not None assert result.get("@type") == "Recipe" def test_no_ld_json(self): result = extract_json_ld(SAMPLE_HTML_NO_LD) assert result is None def test_broken_json_skipped(self): html = ''' ''' result = extract_json_ld(html) assert result is not None assert result.get("name") == "Works" def test_multiple_types_in_array(self): html = ''' ''' result = extract_json_ld(html) assert result is not None assert "Recipe" in result.get("@type", []) # --------------------------------------------------------------------------- # Test: recipe_from_json_ld # 
--------------------------------------------------------------------------- class TestRecipeFromJsonLd: def test_full_recipe(self): ld = { "@type": "Recipe", "name": "Funeral Sandwiches", "recipeIngredient": [ "1 lb deli ham", "Swiss cheese", "Butter", "Poppy seeds", ], "recipeInstructions": [ {"@type": "HowToStep", "text": "Preheat oven."}, {"@type": "HowToStep", "text": "Assemble."}, ], "recipeYield": "12 sandwiches", "prepTime": "PT15M", "cookTime": "PT15M", "totalTime": "PT30M", "recipeCategory": "Appetizer", } result = recipe_from_json_ld(ld, source_url="https://example.com/test") assert isinstance(result, RecipeResult) assert result.title == "Funeral Sandwiches" assert result.servings == "12" assert result.prep_time == "15 min" assert result.cook_time == "15 min" assert result.total_time == "30 min" assert len(result.ingredients) == 4 assert len(result.instructions) == 2 assert result.extraction_method == ExtractionMethod.JSON_LD assert result.source_url == "https://example.com/test" def test_how_to_section(self): """Test that HowToSection items are flattened.""" ld = { "@type": "Recipe", "name": "Layered Recipe", "recipeIngredient": ["flour", "water"], "recipeInstructions": [ { "@type": "HowToSection", "name": "Prep", "itemListElement": [ {"@type": "HowToStep", "text": "Mix flour and water."}, {"@type": "HowToStep", "text": "Knead for 5 minutes."}, ] }, { "@type": "HowToSection", "name": "Cook", "itemListElement": [ {"@type": "HowToStep", "text": "Bake at 350°F."}, ] } ], } result = recipe_from_json_ld(ld) assert len(result.instructions) == 3 assert result.instructions[0] == "Mix flour and water." 
# ---------------------------------------------------------------------------
# Test: _parse_servings
# ---------------------------------------------------------------------------

class TestParseServings:
    """Checks normalisation of ``recipeYield``-style values."""

    def test_simple_number(self):
        assert _parse_servings("4") == "4"

    def test_serves_prefix(self):
        assert _parse_servings("Serves 6") == "6"

    def test_range(self):
        assert _parse_servings("4-6") == "4-6"

    def test_serves_range(self):
        assert _parse_servings("Serves 4-6") == "4-6"

    def test_none(self):
        assert _parse_servings(None) is None

    def test_empty(self):
        assert _parse_servings("") is None


# ---------------------------------------------------------------------------
# Test: _parse_duration
# ---------------------------------------------------------------------------

class TestParseDuration:
    """Checks conversion of ISO-8601 durations to human-readable text."""

    def test_minutes(self):
        assert _parse_duration("PT30M") == "30 min"

    def test_hours_and_minutes(self):
        assert _parse_duration("PT1H30M") == "1 hr 30 min"

    def test_hours_only(self):
        assert _parse_duration("PT2H") == "2 hr"

    def test_none(self):
        assert _parse_duration(None) is None

    def test_non_iso(self):
        # Pass through if not ISO format
        assert _parse_duration("30 minutes") == "30 minutes"


# ---------------------------------------------------------------------------
# Test: _clean_ingredients
# ---------------------------------------------------------------------------

class TestCleanIngredients:
    """Checks whitespace cleanup and dropping of empty ingredient entries."""

    def test_basic(self):
        assert _clean_ingredients(["1 cup flour", "2 eggs"]) == ["1 cup flour", "2 eggs"]

    def test_newlines(self):
        assert _clean_ingredients(["1 cup\nflour"]) == ["1 cup flour"]

    def test_empty_entries(self):
        assert _clean_ingredients(["eggs", "", "flour"]) == ["eggs", "flour"]

    def test_none_entries(self):
        assert _clean_ingredients(["eggs", None, "flour"]) == ["eggs", "flour"]


# ---------------------------------------------------------------------------
# Test: _parse_instructions
# ---------------------------------------------------------------------------

class TestParseInstructions:
    """Checks that instruction payloads of several shapes become step lists."""

    def test_how_to_steps(self):
        instructions = [
            {"@type": "HowToStep", "text": "Step 1"},
            {"@type": "HowToStep", "text": "Step 2"},
        ]
        result = _parse_instructions(instructions)
        assert result == ["Step 1", "Step 2"]

    def test_string_instructions(self):
        """A plain string must produce at least one step."""
        result = _parse_instructions("Mix. Bake. Serve.")
        assert len(result) >= 1

    def test_mixed_types(self):
        """HowToStep dicts and bare strings may be mixed in one list."""
        instructions = [
            {"@type": "HowToStep", "text": "Step 1"},
            "Step 2 as string",
        ]
        result = _parse_instructions(instructions)
        assert len(result) == 2

    def test_empty_list(self):
        assert _parse_instructions([]) == []


# ---------------------------------------------------------------------------
# Test: _extract_html_chunk
# ---------------------------------------------------------------------------

class TestExtractHtmlChunk:
    """Checks the HTML chunk passed on when no ld+json is available.

    NOTE(review): the tag structure of these fixtures is inferred from the
    test names and assertions; confirm the container selectors against
    ``_extract_html_chunk`` itself.
    """

    def test_recipe_container(self):
        """Text inside a recipe-style container is present in the chunk."""
        html = '<div class="recipe"><p>2 cups flour</p></div>'
        result = _extract_html_chunk(html)
        assert "2 cups flour" in result

    def test_article_fallback(self):
        """Text inside an ``<article>`` is present in the chunk."""
        html = '<article><p>Mix everything together</p></article>'
        result = _extract_html_chunk(html)
        assert "Mix everything together" in result

    def test_script_removed(self):
        """Inline script source must not leak into the chunk."""
        html = '<div><script>var x = 1;</script><p>Real content</p></div>'
        result = _extract_html_chunk(html)
        assert "var x" not in result
        assert "Real content" in result

    def test_truncation(self):
        # Very long content should be truncated
        html = '<article>' + '<div><p>Content</p></div>' * 10000 + '</article>'
        result = _extract_html_chunk(html)
        assert len(result) <= 15000


# ---------------------------------------------------------------------------
# Test: _extract_tags
# ---------------------------------------------------------------------------

class TestExtractTags:
    """Checks tag collection from category, cuisine, keywords and the name."""

    def test_from_category(self):
        result = _extract_tags({"recipeCategory": "Dessert", "name": "Cake"})
        assert "Dessert" in result

    def test_from_cuisine(self):
        result = _extract_tags({"recipeCuisine": "Italian", "name": "Pasta"})
        assert "Italian" in result

    def test_from_keywords(self):
        result = _extract_tags({"keywords": "easy, quick, dinner", "name": "Pasta"})
        assert "easy" in result
        assert "quick" in result

    def test_infer_dinner(self):
        """With no explicit metadata, a "dinner" tag is expected for this name."""
        result = _extract_tags({"name": "Chicken Stir Fry"})
        assert "dinner" in result

    def test_dedup_limit(self):
        """No more than eight tags are returned."""
        result = _extract_tags({"keywords": "a,b,c,d,e,f,g,h,i", "name": "Test"})
        assert len(result) <= 8


# ---------------------------------------------------------------------------
# Integration test: extract_json_ld on real-world-like HTML
# ---------------------------------------------------------------------------

class TestExtractJsonLdRealistic:
    """End-to-end checks on page-shaped HTML with embedded ld+json.

    NOTE(review): the ld+json payloads are minimal documents built to match
    the assertions (counts, timings, tags); they are not copies of live pages.
    """

    def test_grilled_cheese_social_style(self):
        """Simulate the structure of grilledcheesesocial.com ld+json."""
        html = '''
<html>
<head>
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Recipe",
  "name": "Funeral Sandwiches",
  "recipeYield": "12",
  "prepTime": "PT20M",
  "cookTime": "PT15M",
  "totalTime": "PT35M",
  "recipeCategory": "Appetizer",
  "recipeCuisine": "Southern",
  "recipeIngredient": [
    "12 Hawaiian rolls",
    "1 lb deli ham",
    "8 slices Swiss cheese",
    "1/2 cup butter",
    "1 tbsp Dijon mustard",
    "1 tbsp Worcestershire sauce",
    "1 tbsp dried minced onion",
    "1 tbsp poppy seeds"
  ],
  "recipeInstructions": [
    {"@type": "HowToStep", "text": "Preheat the oven."},
    {"@type": "HowToStep", "text": "Slice the rolls and layer ham and cheese."},
    {"@type": "HowToStep", "text": "Melt the butter and whisk in the seasonings."},
    {"@type": "HowToStep", "text": "Pour the butter mixture over the rolls."},
    {"@type": "HowToStep", "text": "Bake until golden and serve warm."}
  ]
}
</script>
</head>
<body>
<h1>Funeral Sandwiches</h1>
</body>
</html>
'''
        result = extract_json_ld(html)
        assert result is not None
        assert result["name"] == "Funeral Sandwiches"
        assert len(result["recipeIngredient"]) == 8
        assert len(result["recipeInstructions"]) == 5

        # Convert to RecipeResult
        recipe = recipe_from_json_ld(
            result,
            source_url="https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/",
        )
        assert recipe.title == "Funeral Sandwiches"
        assert recipe.extraction_method == ExtractionMethod.JSON_LD
        assert recipe.servings == "12"
        assert recipe.prep_time == "20 min"
        assert recipe.cook_time == "15 min"
        assert recipe.total_time == "35 min"
        assert "Appetizer" in recipe.tags
        assert "Southern" in recipe.tags

    def test_multiple_ld_json_blocks(self):
        """Test that we skip non-Recipe blocks and find the Recipe."""
        html = '''
<html>
<head>
<script type="application/ld+json">
{"@context": "https://schema.org", "@type": "WebSite", "name": "Example Site"}
</script>
<script type="application/ld+json">
{"@context": "https://schema.org", "@type": "Recipe", "name": "Pasta",
 "recipeIngredient": ["1 lb pasta"]}
</script>
</head>
<body></body>
</html>
'''
        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"

    def test_recipe_in_graph_with_other_items(self):
        """Test @graph containing both Recipe and other types."""
        html = '''
<html>
<head>
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@graph": [
    {"@type": "WebSite", "name": "Example Site"},
    {"@type": "WebPage", "name": "Pasta Page"},
    {"@type": "Recipe", "name": "Pasta", "recipeIngredient": ["1 lb pasta"]}
  ]
}
</script>
</head>
<body></body>
</html>
'''
        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"
        assert result["name"] == "Pasta"


# ---------------------------------------------------------------------------
# Test: RecipeResult.to_dict()
# ---------------------------------------------------------------------------

class TestRecipeResultToDict:
    """Checks the dict serialisation of successful and failed results."""

    def test_success_dict(self):
        """A successful result has no ``error`` key."""
        result = RecipeResult(
            title="Test Recipe",
            ingredients=["eggs", "flour"],
            instructions=["Mix", "Bake"],
            extraction_method=ExtractionMethod.JSON_LD,
            source_url="https://example.com",
        )
        d = result.to_dict()
        assert d["title"] == "Test Recipe"
        assert d["extracted_from"] == "json_ld"
        assert "error" not in d

    def test_error_dict(self):
        """A failed result carries the error text and the "failed" method."""
        result = RecipeResult(
            extraction_method=ExtractionMethod.FAILED,
            error="Could not fetch HTML",
            source_url="https://example.com",
        )
        d = result.to_dict()
        assert d["error"] == "Could not fetch HTML"
        assert d["extracted_from"] == "failed"


# ---------------------------------------------------------------------------
# Live integration test (skipped unless the LIVE_TEST env var is set)
# ---------------------------------------------------------------------------

# One marker for the whole class: every test here performs real network I/O.
@pytest.mark.skipif(
    not os.environ.get("LIVE_TEST"),
    reason="Set LIVE_TEST=1 to run live extraction tests",
)
class TestLiveExtraction:
    """Network-backed checks against real recipe pages."""

    def test_grilledcheesesocial_json_ld(self):
        """Live test: verify JSON-LD extraction on grilledcheesesocial.com."""
        url = "https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/"
        result = extract_recipe(url, total_timeout=30)

        assert result.error is None, f"Extraction failed: {result.error}"
        assert result.extraction_method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {result.extraction_method}"
        assert "Funeral Sandwiches" in result.title or "funeral" in result.title.lower(), \
            f"Unexpected title: {result.title}"
        assert len(result.ingredients) > 0, "No ingredients found"
        assert len(result.instructions) > 0, "No instructions found"

    def test_allrecipes_json_ld(self):
        """Live test: verify JSON-LD on a major recipe site."""
        url = "https://www.allrecipes.com/recipe/228285/classic-funeral-sandwiches/"
        result = extract_recipe(url, total_timeout=30)

        assert result.error is None, f"Extraction failed: {result.error}"
        assert result.extraction_method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {result.extraction_method}"
        assert len(result.ingredients) > 0, "No ingredients found"


if __name__ == "__main__":
    pytest.main([__file__, "-v"])