Simple Pasta
Ingredients: 1 lb pasta, 2 cups sauce, 1 tbsp olive oil
Instructions: Boil pasta. Add sauce. Serve.
"""Tests for the waterfall recipe extractor.
Test case: https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/
Expected: JSON-LD sniper finds Recipe schema, returns ingredients without LLM call.
"""
import json
import os
import sys
import pytest
# Put the parent of this file's directory at the front of sys.path before the
# project import below runs (presumably so ``costco_route`` resolves when the
# tests are run from a source checkout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from bs4 import BeautifulSoup
from costco_route.extractors.recipe import (
extract_json_ld,
extract_recipe,
extract_recipe_waterfall,
recipe_from_json_ld,
RecipeResult,
ExtractionMethod,
_flatten_ld_json,
_is_recipe_type,
_fix_json_newlines,
_extract_html_chunk,
_parse_servings,
_parse_duration,
_clean_ingredients,
_parse_instructions,
_extract_tags,
)
# HTML/JSON-LD fixtures shared by the extraction tests below.
#
# NOTE(review): the original markup of the three SAMPLE_LD_JSON_* fixtures was
# lost (they were empty strings). The content here is a reconstruction chosen
# to satisfy the assertions in TestExtractJsonLd (a Recipe named
# "Funeral Sandwiches"; six ingredients in the direct fixture) -- confirm
# against the intended fixtures.

# A single <script> whose top-level JSON object is the Recipe itself.
SAMPLE_LD_JSON_DIRECT = '''
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Recipe",
  "name": "Funeral Sandwiches",
  "recipeIngredient": [
    "12 Hawaiian sweet rolls",
    "1 lb deli ham",
    "8 slices Swiss cheese",
    "1/2 cup butter, melted",
    "1 tbsp Dijon mustard",
    "1 tbsp poppy seeds"
  ],
  "recipeInstructions": [
    {"@type": "HowToStep", "text": "Preheat oven."},
    {"@type": "HowToStep", "text": "Assemble sandwiches."},
    {"@type": "HowToStep", "text": "Bake until golden."}
  ],
  "recipeYield": "12 sandwiches"
}
</script>
'''

# A single <script> whose Recipe lives inside an "@graph" array.
SAMPLE_LD_JSON_GRAPH = '''
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@graph": [
    {"@type": "WebPage", "name": "Grilled Cheese Social"},
    {
      "@type": "Recipe",
      "name": "Funeral Sandwiches",
      "recipeIngredient": ["1 lb deli ham", "8 slices Swiss cheese"],
      "recipeInstructions": "Assemble and bake."
    }
  ]
}
</script>
'''

# A single <script> whose top-level JSON value is a list of nodes.
SAMPLE_LD_JSON_LIST = '''
<script type="application/ld+json">
[
  {"@context": "https://schema.org", "@type": "WebPage", "name": "Grilled Cheese Social"},
  {
    "@context": "https://schema.org",
    "@type": "Recipe",
    "name": "Funeral Sandwiches",
    "recipeIngredient": ["1 lb deli ham", "8 slices Swiss cheese"],
    "recipeInstructions": "Assemble and bake."
  }
]
</script>
'''

# Page text with no ld+json script at all.
SAMPLE_HTML_NO_LD = '''
Ingredients: 1 lb pasta, 2 cups sauce, 1 tbsp olive oil
Instructions: Boil pasta. Add sauce. Serve.
'''

# A full page embedding the direct Recipe script in its <head>.
SAMPLE_HTML_WITH_LD = f'''<html><head>
{SAMPLE_LD_JSON_DIRECT}</head><body></body></html>'''
class TestFlattenLdJson:
    """Tests for ``_flatten_ld_json`` across dict, list and ``@graph`` inputs."""

    def test_direct_dict(self):
        """A single Recipe dict yields exactly one item, the Recipe itself."""
        flattened = _flatten_ld_json({"@type": "Recipe", "name": "Test"})
        assert len(flattened) == 1
        assert flattened[0]["@type"] == "Recipe"

    def test_list(self):
        """A two-element top-level list yields two items."""
        payload = [
            {"@type": "WebPage"},
            {"@type": "Recipe", "name": "Test"},
        ]
        assert len(_flatten_ld_json(payload)) == 2

    def test_graph(self):
        """``@graph`` members are surfaced alongside the parent dict."""
        payload = {
            "@context": "https://schema.org",
            "@graph": [
                {"@type": "WebPage"},
                {"@type": "Recipe", "name": "Test"},
            ],
        }
        flattened = _flatten_ld_json(payload)
        # Expect the parent dict plus the two graph members.
        assert len(flattened) >= 3
        recipes = [node for node in flattened if node.get("@type") == "Recipe"]
        assert len(recipes) == 1

    def test_nested_graph(self):
        """A ``@graph`` nested inside another ``@graph`` is flattened too."""
        inner = {
            "@type": "ItemList",
            "@graph": [
                {"@type": "Recipe", "name": "Nested"},
            ],
        }
        payload = {
            "@context": "https://schema.org",
            "@graph": [{"@type": "WebPage"}, inner],
        }
        # Flattening is expected to recurse into the inner graph.
        assert len(_flatten_ld_json(payload)) >= 3
class TestIsRecipeType:
    """Tests for ``_is_recipe_type`` on the various shapes of ``@type``."""

    def test_string_type(self):
        """A plain "Recipe" string type is recognised."""
        node = {"@type": "Recipe"}
        assert _is_recipe_type(node) is True

    def test_list_type(self):
        """A list of types containing "Recipe" is recognised."""
        node = {"@type": ["Recipe", "CreativeWork"]}
        assert _is_recipe_type(node) is True

    def test_non_recipe(self):
        """Another schema type is rejected."""
        node = {"@type": "WebPage"}
        assert _is_recipe_type(node) is False

    def test_no_type(self):
        """A dict without ``@type`` is rejected."""
        node = {}
        assert _is_recipe_type(node) is False

    def test_non_dict(self):
        """A non-dict argument is rejected rather than raising."""
        not_a_node = "string"
        assert _is_recipe_type(not_a_node) is False
class TestFixJsonNewlines:
    """Tests for ``_fix_json_newlines`` on JSON text with control characters."""

    def test_no_newlines(self):
        """Text without newlines comes back unchanged."""
        raw = '{"key": "value"}'
        assert _fix_json_newlines(raw) == raw

    def test_escaped_newline_preserved(self):
        """An already-escaped newline is still present after the fix-up."""
        raw = '{"key": "line1\\nline2"}'
        fixed = _fix_json_newlines(raw)
        # The two-character escape sequence must survive untouched.
        assert "\\n" in fixed

    def test_unescaped_newline_fixed(self):
        """A literal newline inside a string value becomes parseable JSON."""
        raw = '{"key": "line1\nline2"}'
        parsed = json.loads(_fix_json_newlines(raw))
        assert parsed["key"] == "line1\nline2"

    def test_tab_inside_string(self):
        """A literal tab inside a string value becomes parseable JSON."""
        raw = '{"key": "col1\tcol2"}'
        parsed = json.loads(_fix_json_newlines(raw))
        assert parsed["key"] == "col1\tcol2"
class TestExtractJsonLd:
    """Tests for ``extract_json_ld`` against small HTML documents."""

    def test_direct_recipe_type(self):
        """A page with a top-level Recipe object yields that recipe."""
        recipe = extract_json_ld(SAMPLE_HTML_WITH_LD)
        assert recipe is not None
        assert recipe.get("@type") == "Recipe"
        assert recipe.get("name") == "Funeral Sandwiches"
        ingredients = recipe.get("recipeIngredient", [])
        assert len(ingredients) == 6

    def test_graph_structure(self):
        """The Recipe is found when the sample uses the graph fixture."""
        page = "<html><head>" + SAMPLE_LD_JSON_GRAPH + "</head><body></body></html>"
        recipe = extract_json_ld(page)
        assert recipe is not None
        assert recipe.get("@type") == "Recipe"
        assert recipe.get("name") == "Funeral Sandwiches"

    def test_list_of_scripts(self):
        """The Recipe is found when the sample uses the list fixture."""
        page = "<html><head>" + SAMPLE_LD_JSON_LIST + "</head><body></body></html>"
        recipe = extract_json_ld(page)
        assert recipe is not None
        assert recipe.get("@type") == "Recipe"

    def test_no_ld_json(self):
        """A page without any ld+json script yields ``None``."""
        assert extract_json_ld(SAMPLE_HTML_NO_LD) is None

    def test_broken_json_skipped(self):
        """An unparseable script is skipped and the next valid one is used."""
        page = (
            "\n"
            "<html><head>\n"
            '<script type="application/ld+json">{broken json</script>\n'
            '<script type="application/ld+json">{"@type": "Recipe", "name": "Works"}</script>\n'
            "</head><body></body></html>\n"
        )
        recipe = extract_json_ld(page)
        assert recipe is not None
        assert recipe.get("name") == "Works"

    def test_multiple_types_in_array(self):
        """A node whose ``@type`` is a list containing "Recipe" is accepted."""
        page = (
            "\n"
            "<html><head>\n"
            '<script type="application/ld+json">\n'
            '{"@type": ["Recipe", "CreativeWork"], "name": "Multi-type", '
            '"recipeIngredient": ["eggs"], "recipeInstructions": "Cook."}\n'
            "</script>\n"
            "</head><body></body></html>\n"
        )
        recipe = extract_json_ld(page)
        assert recipe is not None
        types = recipe.get("@type", [])
        assert "Recipe" in types
class TestRecipeFromJsonLd:
    """Tests for ``recipe_from_json_ld`` converting ld+json into ``RecipeResult``."""

    def test_full_recipe(self):
        """Every populated ld+json field is carried onto the result."""
        step_texts = ("Preheat oven.", "Assemble.")
        node = {
            "@type": "Recipe",
            "name": "Funeral Sandwiches",
            "recipeIngredient": [
                "1 lb deli ham",
                "Swiss cheese",
                "Butter",
                "Poppy seeds",
            ],
            "recipeInstructions": [
                {"@type": "HowToStep", "text": text} for text in step_texts
            ],
            "recipeYield": "12 sandwiches",
            "prepTime": "PT15M",
            "cookTime": "PT15M",
            "totalTime": "PT30M",
            "recipeCategory": "Appetizer",
        }
        recipe = recipe_from_json_ld(node, source_url="https://example.com/test")

        assert isinstance(recipe, RecipeResult)
        assert recipe.title == "Funeral Sandwiches"
        # Yield and ISO-8601 durations are expected in their display form.
        assert recipe.servings == "12"
        assert recipe.prep_time == "15 min"
        assert recipe.cook_time == "15 min"
        assert recipe.total_time == "30 min"
        assert len(recipe.ingredients) == 4
        assert len(recipe.instructions) == 2
        assert recipe.extraction_method == ExtractionMethod.JSON_LD
        assert recipe.source_url == "https://example.com/test"

    def test_how_to_section(self):
        """Test that HowToSection items are flattened."""
        prep_section = {
            "@type": "HowToSection",
            "name": "Prep",
            "itemListElement": [
                {"@type": "HowToStep", "text": "Mix flour and water."},
                {"@type": "HowToStep", "text": "Knead for 5 minutes."},
            ],
        }
        cook_section = {
            "@type": "HowToSection",
            "name": "Cook",
            "itemListElement": [
                {"@type": "HowToStep", "text": "Bake at 350°F."},
            ],
        }
        node = {
            "@type": "Recipe",
            "name": "Layered Recipe",
            "recipeIngredient": ["flour", "water"],
            "recipeInstructions": [prep_section, cook_section],
        }
        recipe = recipe_from_json_ld(node)
        # Two steps from "Prep" plus one from "Cook".
        assert len(recipe.instructions) == 3
        assert recipe.instructions[0] == "Mix flour and water."
class TestParseServings:
    """Tests for ``_parse_servings`` on yield strings."""

    def test_simple_number(self):
        """A bare number is returned as-is."""
        servings = _parse_servings("4")
        assert servings == "4"

    def test_serves_prefix(self):
        """A leading "Serves" is dropped."""
        servings = _parse_servings("Serves 6")
        assert servings == "6"

    def test_range(self):
        """A numeric range is kept intact."""
        servings = _parse_servings("4-6")
        assert servings == "4-6"

    def test_serves_range(self):
        """A prefixed range keeps only the range."""
        servings = _parse_servings("Serves 4-6")
        assert servings == "4-6"

    def test_none(self):
        """``None`` in, ``None`` out."""
        servings = _parse_servings(None)
        assert servings is None

    def test_empty(self):
        """An empty string maps to ``None``."""
        servings = _parse_servings("")
        assert servings is None
class TestParseDuration:
    """Tests for ``_parse_duration`` on ISO-8601 and free-form durations."""

    def test_minutes(self):
        """Minutes-only durations render as "N min"."""
        rendered = _parse_duration("PT30M")
        assert rendered == "30 min"

    def test_hours_and_minutes(self):
        """Hours and minutes are both rendered."""
        rendered = _parse_duration("PT1H30M")
        assert rendered == "1 hr 30 min"

    def test_hours_only(self):
        """Hours-only durations render as "N hr"."""
        rendered = _parse_duration("PT2H")
        assert rendered == "2 hr"

    def test_none(self):
        """``None`` in, ``None`` out."""
        rendered = _parse_duration(None)
        assert rendered is None

    def test_non_iso(self):
        """Text that is not ISO-8601 is passed through unchanged."""
        free_form = "30 minutes"
        assert _parse_duration(free_form) == "30 minutes"
class TestCleanIngredients:
    """Tests for ``_clean_ingredients`` list normalisation."""

    def test_basic(self):
        """Already-clean entries come back unchanged."""
        cleaned = _clean_ingredients(["1 cup flour", "2 eggs"])
        assert cleaned == ["1 cup flour", "2 eggs"]

    def test_newlines(self):
        """An embedded newline is replaced by a single space."""
        cleaned = _clean_ingredients(["1 cup\nflour"])
        assert cleaned == ["1 cup flour"]

    def test_empty_entries(self):
        """Empty strings are dropped."""
        cleaned = _clean_ingredients(["eggs", "", "flour"])
        assert cleaned == ["eggs", "flour"]

    def test_none_entries(self):
        """``None`` entries are dropped."""
        cleaned = _clean_ingredients(["eggs", None, "flour"])
        assert cleaned == ["eggs", "flour"]
class TestParseInstructions:
    """Tests for ``_parse_instructions`` on the shapes ``recipeInstructions`` can take."""

    def test_how_to_steps(self):
        """HowToStep dicts are reduced to their text, in order."""
        steps = [
            {"@type": "HowToStep", "text": text} for text in ("Step 1", "Step 2")
        ]
        assert _parse_instructions(steps) == ["Step 1", "Step 2"]

    def test_string_instructions(self):
        """A single instruction string produces at least one step."""
        parsed = _parse_instructions("Mix. Bake. Serve.")
        assert len(parsed) >= 1

    def test_mixed_types(self):
        """Dict steps and plain-string steps may be mixed in one list."""
        steps = [
            {"@type": "HowToStep", "text": "Step 1"},
            "Step 2 as string",
        ]
        parsed = _parse_instructions(steps)
        assert len(parsed) == 2

    def test_empty_list(self):
        """No steps in, no steps out."""
        parsed = _parse_instructions([])
        assert parsed == []
class TestExtractHtmlChunk:
    """Tests for ``_extract_html_chunk``, which reduces a page to recipe text."""

    def test_recipe_container(self):
        """Text inside a recipe container ends up in the extracted chunk."""
        # NOTE(review): the original fixture markup was lost; only the text
        # "2 cups flour" survived. The ``class="recipe"`` wrapper is an
        # assumption about what the extractor treats as a recipe container --
        # confirm against ``_extract_html_chunk``.
        html = (
            '<html><body><div class="recipe">'
            "<p>2 cups flour</p>"
            "</div></body></html>"
        )
        result = _extract_html_chunk(html)
        assert "2 cups flour" in result

    def test_article_fallback(self):
        """Content inside an ``<article>`` element is extracted."""
        html = '<html><body><article><p>Mix everything together</p></article></body></html>'
        result = _extract_html_chunk(html)
        assert "Mix everything together" in result

    def test_script_removed(self):
        """Inline script source is stripped while real content is kept."""
        html = '<html><body><article><script>var x = 1;</script><p>Real content</p></article></body></html>'
        result = _extract_html_chunk(html)
        assert "var x" not in result
        assert "Real content" in result

    def test_truncation(self):
        """Very long content is capped at 15000 characters."""
        html = '<html><body><article>' + '<p>Content</p>' * 10000 + '</article></body></html>'
        result = _extract_html_chunk(html)
        assert len(result) <= 15000
class TestExtractTags:
    """Tests for ``_extract_tags`` tag collection from ld+json fields."""

    def test_from_category(self):
        """``recipeCategory`` contributes a tag."""
        tags = _extract_tags({"recipeCategory": "Dessert", "name": "Cake"})
        assert "Dessert" in tags

    def test_from_cuisine(self):
        """``recipeCuisine`` contributes a tag."""
        tags = _extract_tags({"recipeCuisine": "Italian", "name": "Pasta"})
        assert "Italian" in tags

    def test_from_keywords(self):
        """Comma-separated ``keywords`` are split into individual tags."""
        tags = _extract_tags({"keywords": "easy, quick, dinner", "name": "Pasta"})
        for expected in ("easy", "quick"):
            assert expected in tags

    def test_infer_dinner(self):
        """A "dinner" tag is produced from the recipe name alone."""
        tags = _extract_tags({"name": "Chicken Stir Fry"})
        assert "dinner" in tags

    def test_dedup_limit(self):
        """Nine keywords produce no more than eight tags."""
        nine_keywords = "a,b,c,d,e,f,g,h,i"
        tags = _extract_tags({"keywords": nine_keywords, "name": "Test"})
        assert len(tags) <= 8
class TestExtractJsonLdRealistic:
    """End-to-end style tests: realistic pages through ``extract_json_ld``."""

    def test_grilled_cheese_social_style(self):
        """Simulate the structure of grilledcheesesocial.com ld+json."""
        # NOTE(review): the original inline HTML fixture was lost (the string
        # literal was left unterminated). This payload is rebuilt from the
        # assertions below -- 8 ingredients, 5 steps, the three durations,
        # yield, category and cuisine -- confirm against the real page.
        recipe_ld = {
            "@context": "https://schema.org",
            "@type": "Recipe",
            "name": "Funeral Sandwiches",
            "recipeYield": "12 sandwiches",
            "prepTime": "PT20M",
            "cookTime": "PT15M",
            "totalTime": "PT35M",
            "recipeCategory": "Appetizer",
            "recipeCuisine": "Southern",
            "recipeIngredient": [
                "12 Hawaiian sweet rolls",
                "1 lb thinly sliced deli ham",
                "8 slices Swiss cheese",
                "1/2 cup unsalted butter, melted",
                "1 tbsp Dijon mustard",
                "1 tbsp Worcestershire sauce",
                "1 tbsp poppy seeds",
                "1 tsp dried minced onion",
            ],
            "recipeInstructions": [
                {"@type": "HowToStep", "text": text}
                for text in (
                    "Preheat the oven to 350 degrees F.",
                    "Slice the rolls in half and place the bottoms in a baking dish.",
                    "Layer the ham and Swiss cheese, then replace the tops.",
                    "Whisk the butter, mustard, Worcestershire, poppy seeds and onion; pour over the rolls.",
                    "Cover with foil and bake for 15 minutes.",
                )
            ],
        }
        html = (
            "<html><head>\n"
            '<script type="application/ld+json">\n'
            f"{json.dumps(recipe_ld, indent=2)}\n"
            "</script>\n"
            "</head><body></body></html>"
        )

        result = extract_json_ld(html)
        assert result is not None
        assert result["name"] == "Funeral Sandwiches"
        assert len(result["recipeIngredient"]) == 8
        assert len(result["recipeInstructions"]) == 5

        # Convert to RecipeResult
        recipe = recipe_from_json_ld(
            result,
            source_url="https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/",
        )
        assert recipe.title == "Funeral Sandwiches"
        assert recipe.extraction_method == ExtractionMethod.JSON_LD
        assert recipe.servings == "12"
        assert recipe.prep_time == "20 min"
        assert recipe.cook_time == "15 min"
        assert recipe.total_time == "35 min"
        assert "Appetizer" in recipe.tags
        assert "Southern" in recipe.tags

    def test_multiple_ld_json_blocks(self):
        """Test that we skip non-Recipe blocks and find the Recipe."""
        html = (
            "<html><head>\n"
            '<script type="application/ld+json">\n'
            '{"@context": "https://schema.org", "@type": "BreadcrumbList", '
            '"itemListElement": [{"@type": "ListItem", "position": 1, "name": "Home"}]}\n'
            "</script>\n"
            '<script type="application/ld+json">\n'
            '{"@context": "https://schema.org", "@type": "Recipe", "name": "Test", '
            '"recipeIngredient": ["eggs"], "recipeInstructions": "Cook."}\n'
            "</script>\n"
            "</head><body></body></html>"
        )
        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"

    def test_recipe_in_graph_with_other_items(self):
        """Test @graph containing both Recipe and other types."""
        html = (
            "<html><head>\n"
            '<script type="application/ld+json">\n'
            "{\n"
            '"@context": "https://schema.org",\n'
            '"@graph": [\n'
            '{"@type": "WebSite", "name": "Example", "url": "https://example.com"},\n'
            '{"@type": "BreadcrumbList", "itemListElement": []},\n'
            '{"@type": "Recipe", "name": "Pasta", "recipeIngredient": ["pasta", "sauce"], '
            '"recipeInstructions": "Boil pasta."}\n'
            "]\n"
            "}\n"
            "</script>\n"
            "</head><body></body></html>"
        )
        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"
        assert result["name"] == "Pasta"
class TestRecipeResultToDict:
    """Tests for ``RecipeResult.to_dict`` serialisation."""

    def test_success_dict(self):
        """A successful result reports its title and method, with no error key."""
        recipe = RecipeResult(
            title="Test Recipe",
            ingredients=["eggs", "flour"],
            instructions=["Mix", "Bake"],
            extraction_method=ExtractionMethod.JSON_LD,
            source_url="https://example.com",
        )
        payload = recipe.to_dict()
        assert payload["title"] == "Test Recipe"
        assert payload["extracted_from"] == "json_ld"
        assert "error" not in payload

    def test_error_dict(self):
        """A failed result reports the error text and the "failed" method."""
        recipe = RecipeResult(
            extraction_method=ExtractionMethod.FAILED,
            error="Could not fetch HTML",
            source_url="https://example.com",
        )
        payload = recipe.to_dict()
        assert payload["error"] == "Could not fetch HTML"
        assert payload["extracted_from"] == "failed"
# One class-level mark replaces the identical per-method skipif decorators:
# every test in this class is skipped unless LIVE_TEST is set.
@pytest.mark.skipif(
    not os.environ.get("LIVE_TEST"),
    reason="Set LIVE_TEST=1 to run live extraction tests"
)
class TestLiveExtraction:
    """Network-backed tests that run ``extract_recipe`` against real URLs."""

    def test_grilledcheesesocial_json_ld(self):
        """Live test: verify JSON-LD extraction on grilledcheesesocial.com."""
        recipe = extract_recipe(
            "https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/",
            total_timeout=30,
        )
        assert recipe.error is None, f"Extraction failed: {recipe.error}"

        method = recipe.extraction_method
        assert method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {recipe.extraction_method}"

        title_ok = "Funeral Sandwiches" in recipe.title or "funeral" in recipe.title.lower()
        assert title_ok, f"Unexpected title: {recipe.title}"

        assert len(recipe.ingredients) > 0, "No ingredients found"
        assert len(recipe.instructions) > 0, "No instructions found"

    def test_allrecipes_json_ld(self):
        """Live test: verify JSON-LD on a major recipe site."""
        recipe = extract_recipe(
            "https://www.allrecipes.com/recipe/228285/classic-funeral-sandwiches/",
            total_timeout=30,
        )
        assert recipe.error is None, f"Extraction failed: {recipe.error}"

        method = recipe.extraction_method
        assert method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {recipe.extraction_method}"

        assert len(recipe.ingredients) > 0, "No ingredients found"
# Allow running this file directly: hand this file's path to pytest in
# verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])