# File: test_recipe_extractor.py (23,238 bytes, Apr 27, 2026)

"""Tests for the waterfall recipe extractor.

Test case: https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/
Expected: JSON-LD sniper finds Recipe schema, returns ingredients without LLM call.
"""

import json
import os
import sys
import pytest

# Add parent to path for imports

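# NOTE(review): recipe summary text found inline here; presumably reference
# data for the Funeral Sandwiches fixture - confirm or remove.
# Name: Funeral Sandwiches
# Prep time: 15 min
# Cook time: 15 min
# Total time: 30 min
# Yield: 12 sandwiches
# Cuisine: American
# Ingredients: 1 lb deli ham; 1 lb Swiss cheese, sliced; 1/2 cup butter, softened; 1 tbsp poppy seeds; 1 tsp mustard; 12 slider buns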

# Resolve the parent directory relative to this module's own location.
# The dunder name ``__file__`` is required; a bare ``file`` is undefined in
# Python 3 and raises NameError at import time.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from bs4 import BeautifulSoup

from costco_route.extractors.recipe import (
extract_json_ld,
extract_recipe,
extract_recipe_waterfall,
recipe_from_json_ld,
RecipeResult,
ExtractionMethod,
_flatten_ld_json,
_is_recipe_type,
_fix_json_newlines,
_extract_html_chunk,
_parse_servings,
_parse_duration,
_clean_ingredients,
_parse_instructions,
_extract_tags,
)

# ---------------------------------------------------------------------------
# Sample ld+json data for unit tests
# ---------------------------------------------------------------------------

# NOTE(review): the markup inside these fixtures had been stripped, leaving
# empty strings that cannot satisfy the tests that consume them. The payloads
# below are reconstructed from those tests' assertions (a Recipe named
# "Funeral Sandwiches" with six ingredients, a @graph wrapper, a top-level
# list, and a page with no ld+json at all). Confirm against the real fixtures.

# A single <script> block whose top-level JSON object is the Recipe itself.
SAMPLE_LD_JSON_DIRECT = '''
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Recipe",
  "name": "Funeral Sandwiches",
  "recipeIngredient": [
    "1 lb deli ham",
    "1 lb Swiss cheese, sliced",
    "1/2 cup butter, softened",
    "1 tbsp poppy seeds",
    "1 tsp mustard",
    "12 slider buns"
  ],
  "recipeInstructions": [
    {"@type": "HowToStep", "text": "Preheat the oven."},
    {"@type": "HowToStep", "text": "Assemble the sandwiches."},
    {"@type": "HowToStep", "text": "Bake until golden."}
  ],
  "recipeYield": "12 sandwiches",
  "prepTime": "PT15M",
  "cookTime": "PT15M",
  "totalTime": "PT30M",
  "recipeCuisine": "American"
}
</script>
'''

# A single <script> block whose Recipe is nested inside an "@graph" array.
SAMPLE_LD_JSON_GRAPH = '''
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@graph": [
    {"@type": "WebPage", "name": "Funeral Sandwiches"},
    {
      "@type": "Recipe",
      "name": "Funeral Sandwiches",
      "recipeIngredient": ["1 lb deli ham", "12 slider buns"],
      "recipeInstructions": [
        {"@type": "HowToStep", "text": "Assemble and bake."}
      ]
    }
  ]
}
</script>
'''

# A single <script> block whose top-level JSON value is a list of objects.
# NOTE(review): the consuming test is named "list_of_scripts"; a JSON array is
# assumed here - confirm the original was not several <script> tags instead.
SAMPLE_LD_JSON_LIST = '''
<script type="application/ld+json">
[
  {"@context": "https://schema.org", "@type": "WebSite", "name": "Example"},
  {
    "@context": "https://schema.org",
    "@type": "Recipe",
    "name": "Funeral Sandwiches",
    "recipeIngredient": ["1 lb deli ham", "12 slider buns"],
    "recipeInstructions": [
      {"@type": "HowToStep", "text": "Assemble and bake."}
    ]
  }
]
</script>
'''

# A page with recipe text but no ld+json script at all.
SAMPLE_HTML_NO_LD = '''
<html>
<head></head>
<body>
<h1>Simple Pasta</h1>
<p>Ingredients: 1 lb pasta, 2 cups sauce, 1 tbsp olive oil</p>
<p>Instructions: Boil pasta. Add sauce. Serve.</p>
</body>
</html>
'''

# A full page embedding the direct-Recipe script in its <head>.
SAMPLE_HTML_WITH_LD = f'''
<html>
<head>
{SAMPLE_LD_JSON_DIRECT}
</head>
<body>
<h1>Funeral Sandwiches</h1>
</body>
</html>
'''

# ---------------------------------------------------------------------------
# Test: _flatten_ld_json
# ---------------------------------------------------------------------------

class TestFlattenLdJson:
    """Unit tests for ``_flatten_ld_json``.

    Covers a bare dict, a list of dicts, an ``@graph`` wrapper, and an
    ``@graph`` nested inside another ``@graph``.
    """

    def test_direct_dict(self):
        """A single dict comes back as a one-element list."""
        data = {"@type": "Recipe", "name": "Test"}
        result = _flatten_ld_json(data)
        assert len(result) == 1
        assert result[0]["@type"] == "Recipe"

    def test_list(self):
        """A list of two dicts yields two items."""
        data = [
            {"@type": "WebPage"},
            {"@type": "Recipe", "name": "Test"},
        ]
        result = _flatten_ld_json(data)
        assert len(result) == 2

    def test_graph(self):
        """Items under ``@graph`` are surfaced alongside the parent dict."""
        data = {
            "@context": "https://schema.org",
            "@graph": [
                {"@type": "WebPage"},
                {"@type": "Recipe", "name": "Test"},
            ]
        }
        result = _flatten_ld_json(data)
        # Should have parent dict + 2 graph items
        assert len(result) >= 3
        recipe_items = [r for r in result if r.get("@type") == "Recipe"]
        assert len(recipe_items) == 1

    def test_nested_graph(self):
        """A ``@graph`` inside a ``@graph`` item is flattened as well."""
        data = {
            "@context": "https://schema.org",
            "@graph": [
                {"@type": "WebPage"},
                {
                    "@type": "ItemList",
                    "@graph": [
                        {"@type": "Recipe", "name": "Nested"}
                    ]
                }
            ]
        }
        result = _flatten_ld_json(data)
        # Should recursively flatten
        assert len(result) >= 3

# ---------------------------------------------------------------------------
# Test: _is_recipe_type
# ---------------------------------------------------------------------------

class TestIsRecipeType:
    """Unit tests for ``_is_recipe_type`` across string, list, missing and non-dict inputs."""

    def test_string_type(self):
        """``@type`` given as the plain string "Recipe" is accepted."""
        assert _is_recipe_type({"@type": "Recipe"}) is True

    def test_list_type(self):
        """``@type`` given as a list containing "Recipe" is accepted."""
        assert _is_recipe_type({"@type": ["Recipe", "CreativeWork"]}) is True

    def test_non_recipe(self):
        """A different ``@type`` is rejected."""
        assert _is_recipe_type({"@type": "WebPage"}) is False

    def test_no_type(self):
        """A dict without ``@type`` is rejected."""
        assert _is_recipe_type({}) is False

    def test_non_dict(self):
        """A non-dict value is rejected rather than raising."""
        assert _is_recipe_type("string") is False

# ---------------------------------------------------------------------------
# Test: _fix_json_newlines
# ---------------------------------------------------------------------------

class TestFixJsonNewlines:
    """Unit tests for ``_fix_json_newlines`` on raw control characters inside JSON strings."""

    def test_no_newlines(self):
        """Input without control characters is returned unchanged."""
        assert _fix_json_newlines('{"key": "value"}') == '{"key": "value"}'

    def test_escaped_newline_preserved(self):
        """An already-escaped newline sequence survives the fix."""
        # Already escaped newlines should be preserved
        s = '{"key": "line1\\nline2"}'
        result = _fix_json_newlines(s)
        # The \\n should stay as \\n
        assert "\\n" in result

    def test_unescaped_newline_fixed(self):
        """A literal newline inside a JSON string becomes parseable."""
        # Unescaped newline inside a string should be fixed
        s = '{"key": "line1\nline2"}'
        result = _fix_json_newlines(s)
        # Should now be parseable JSON
        data = json.loads(result)
        assert data["key"] == "line1\nline2"

    def test_tab_inside_string(self):
        """A literal tab inside a JSON string becomes parseable."""
        s = '{"key": "col1\tcol2"}'
        result = _fix_json_newlines(s)
        data = json.loads(result)
        assert data["key"] == "col1\tcol2"

# ---------------------------------------------------------------------------
# Test: extract_json_ld
# ---------------------------------------------------------------------------

class TestExtractJsonLd:
    """Unit tests for ``extract_json_ld`` against the module-level sample fixtures and inline HTML."""

    def test_direct_recipe_type(self):
        """A top-level Recipe object in the page is returned as-is."""
        result = extract_json_ld(SAMPLE_HTML_WITH_LD)
        assert result is not None
        assert result.get("@type") == "Recipe"
        assert result.get("name") == "Funeral Sandwiches"
        assert len(result.get("recipeIngredient", [])) == 6

    def test_graph_structure(self):
        """A Recipe nested under ``@graph`` is found."""
        html = f"<html><head>{SAMPLE_LD_JSON_GRAPH}</head><body></body></html>"
        result = extract_json_ld(html)
        assert result is not None
        assert result.get("@type") == "Recipe"
        assert result.get("name") == "Funeral Sandwiches"

    def test_list_of_scripts(self):
        """A Recipe inside the list fixture is found."""
        html = f"<html><head>{SAMPLE_LD_JSON_LIST}</head><body></body></html>"
        result = extract_json_ld(html)
        assert result is not None
        assert result.get("@type") == "Recipe"

    def test_no_ld_json(self):
        """A page without ld+json yields ``None``."""
        result = extract_json_ld(SAMPLE_HTML_NO_LD)
        assert result is None

    def test_broken_json_skipped(self):
        """An unparseable script block is skipped in favour of a later valid one."""
        html = '''
        <html><head>
        <script type="application/ld+json">{broken json</script>
        <script type="application/ld+json">{"@type": "Recipe", "name": "Works"}</script>
        </head><body></body></html>
        '''
        result = extract_json_ld(html)
        assert result is not None
        assert result.get("name") == "Works"

    def test_multiple_types_in_array(self):
        """``@type`` given as an array that includes "Recipe" still matches."""
        html = '''
        <html><head>
        <script type="application/ld+json">
        {"@type": ["Recipe", "CreativeWork"], "name": "Multi-type", "recipeIngredient": ["eggs"], "recipeInstructions": "Cook."}
        </script>
        </head><body></body></html>
        '''
        result = extract_json_ld(html)
        assert result is not None
        assert "Recipe" in result.get("@type", [])

# ---------------------------------------------------------------------------
# Test: recipe_from_json_ld
# ---------------------------------------------------------------------------

class TestRecipeFromJsonLd:
    """Unit tests for ``recipe_from_json_ld``, which turns a Recipe dict into a ``RecipeResult``."""

    def test_full_recipe(self):
        """All populated fields are mapped onto the result."""
        ld = {
            "@type": "Recipe",
            "name": "Funeral Sandwiches",
            "recipeIngredient": [
                "1 lb deli ham",
                "Swiss cheese",
                "Butter",
                "Poppy seeds",
            ],
            "recipeInstructions": [
                {"@type": "HowToStep", "text": "Preheat oven."},
                {"@type": "HowToStep", "text": "Assemble."},
            ],
            "recipeYield": "12 sandwiches",
            "prepTime": "PT15M",
            "cookTime": "PT15M",
            "totalTime": "PT30M",
            "recipeCategory": "Appetizer",
        }
        result = recipe_from_json_ld(ld, source_url="https://example.com/test")
        assert isinstance(result, RecipeResult)
        assert result.title == "Funeral Sandwiches"
        assert result.servings == "12"
        assert result.prep_time == "15 min"
        assert result.cook_time == "15 min"
        assert result.total_time == "30 min"
        assert len(result.ingredients) == 4
        assert len(result.instructions) == 2
        assert result.extraction_method == ExtractionMethod.JSON_LD
        assert result.source_url == "https://example.com/test"

    def test_how_to_section(self):
        """Test that HowToSection items are flattened."""
        ld = {
            "@type": "Recipe",
            "name": "Layered Recipe",
            "recipeIngredient": ["flour", "water"],
            "recipeInstructions": [
                {
                    "@type": "HowToSection",
                    "name": "Prep",
                    "itemListElement": [
                        {"@type": "HowToStep", "text": "Mix flour and water."},
                        {"@type": "HowToStep", "text": "Knead for 5 minutes."},
                    ]
                },
                {
                    "@type": "HowToSection",
                    "name": "Cook",
                    "itemListElement": [
                        {"@type": "HowToStep", "text": "Bake at 350°F."},
                    ]
                }
            ],
        }
        result = recipe_from_json_ld(ld)
        assert len(result.instructions) == 3
        assert result.instructions[0] == "Mix flour and water."

# ---------------------------------------------------------------------------
# Test: _parse_servings
# ---------------------------------------------------------------------------

class TestParseServings:
    """Unit tests for ``_parse_servings`` on plain numbers, prefixed text, ranges and empty input."""

    def test_simple_number(self):
        """A bare number string is returned unchanged."""
        assert _parse_servings("4") == "4"

    def test_serves_prefix(self):
        """A leading "Serves" is dropped."""
        assert _parse_servings("Serves 6") == "6"

    def test_range(self):
        """A numeric range is kept intact."""
        assert _parse_servings("4-6") == "4-6"

    def test_serves_range(self):
        """A prefixed range keeps only the range."""
        assert _parse_servings("Serves 4-6") == "4-6"

    def test_none(self):
        """``None`` input yields ``None``."""
        assert _parse_servings(None) is None

    def test_empty(self):
        """An empty string yields ``None``."""
        assert _parse_servings("") is None

# ---------------------------------------------------------------------------
# Test: _parse_duration
# ---------------------------------------------------------------------------

class TestParseDuration:
    """Unit tests for ``_parse_duration`` on ISO 8601 durations and pass-through values."""

    def test_minutes(self):
        """Minutes-only duration is rendered as "<n> min"."""
        assert _parse_duration("PT30M") == "30 min"

    def test_hours_and_minutes(self):
        """Hours and minutes are both rendered."""
        assert _parse_duration("PT1H30M") == "1 hr 30 min"

    def test_hours_only(self):
        """Hours-only duration is rendered as "<n> hr"."""
        assert _parse_duration("PT2H") == "2 hr"

    def test_none(self):
        """``None`` input yields ``None``."""
        assert _parse_duration(None) is None

    def test_non_iso(self):
        """Text that is not an ISO duration is returned unchanged."""
        # Pass through if not ISO format
        assert _parse_duration("30 minutes") == "30 minutes"

# ---------------------------------------------------------------------------
# Test: _clean_ingredients
# ---------------------------------------------------------------------------

class TestCleanIngredients:
    """Unit tests for ``_clean_ingredients`` on whitespace, empty and ``None`` entries."""

    def test_basic(self):
        """Clean entries are returned unchanged."""
        assert _clean_ingredients(["1 cup flour", "2 eggs"]) == ["1 cup flour", "2 eggs"]

    def test_newlines(self):
        """An embedded newline is collapsed to a single space."""
        assert _clean_ingredients(["1 cup\nflour"]) == ["1 cup flour"]

    def test_empty_entries(self):
        """Empty strings are dropped."""
        assert _clean_ingredients(["eggs", "", "flour"]) == ["eggs", "flour"]

    def test_none_entries(self):
        """``None`` entries are dropped."""
        assert _clean_ingredients(["eggs", None, "flour"]) == ["eggs", "flour"]

# ---------------------------------------------------------------------------
# Test: _parse_instructions
# ---------------------------------------------------------------------------

class TestParseInstructions:
    """Unit tests for ``_parse_instructions`` on HowToStep lists, plain strings and mixed input."""

    def test_how_to_steps(self):
        """HowToStep dicts are reduced to their ``text`` values, in order."""
        instructions = [
            {"@type": "HowToStep", "text": "Step 1"},
            {"@type": "HowToStep", "text": "Step 2"},
        ]
        result = _parse_instructions(instructions)
        assert result == ["Step 1", "Step 2"]

    def test_string_instructions(self):
        """A single instruction string produces at least one step."""
        result = _parse_instructions("Mix. Bake. Serve.")
        assert len(result) >= 1

    def test_mixed_types(self):
        """Dict and string entries in the same list are both kept."""
        instructions = [
            {"@type": "HowToStep", "text": "Step 1"},
            "Step 2 as string",
        ]
        result = _parse_instructions(instructions)
        assert len(result) == 2

    def test_empty_list(self):
        """An empty list yields an empty list."""
        assert _parse_instructions([]) == []

# ---------------------------------------------------------------------------
# Test: _extract_html_chunk
# ---------------------------------------------------------------------------

class TestExtractHtmlChunk:
    """Unit tests for ``_extract_html_chunk``: container selection, script removal and truncation."""

    def test_recipe_container(self):
        """Text inside a recipe container element is present in the chunk."""
        # NOTE(review): the original markup of this literal was lost (only the
        # text "2 cups flour" survived). The container classes below are an
        # assumption - confirm they match a selector that _extract_html_chunk
        # actually recognises.
        html = '<html><body><div class="recipe wprm-recipe-container"><p>2 cups flour</p></div></body></html>'
        result = _extract_html_chunk(html)
        assert "2 cups flour" in result

    def test_article_fallback(self):
        """Content inside ``<article>`` is picked up."""
        html = '<html><body><article><p>Mix everything together</p></article></body></html>'
        result = _extract_html_chunk(html)
        assert "Mix everything together" in result

    def test_script_removed(self):
        """Inline ``<script>`` content is excluded while real content is kept."""
        html = '<html><body><article><script>var x = 1;</script><p>Real content</p></article></body></html>'
        result = _extract_html_chunk(html)
        assert "var x" not in result
        assert "Real content" in result

    def test_truncation(self):
        """Very long content is capped at 15000 characters."""
        # Very long content should be truncated
        html = '<html><body><article>' + '<p>Content</p>' * 10000 + '</article></body></html>'
        result = _extract_html_chunk(html)
        assert len(result) <= 15000

# ---------------------------------------------------------------------------
# Test: _extract_tags
# ---------------------------------------------------------------------------

class TestExtractTags:
    """Unit tests for ``_extract_tags``: category, cuisine, keywords, name inference and the tag cap."""

    def test_from_category(self):
        """``recipeCategory`` contributes a tag."""
        result = _extract_tags({"recipeCategory": "Dessert", "name": "Cake"})
        assert "Dessert" in result

    def test_from_cuisine(self):
        """``recipeCuisine`` contributes a tag."""
        result = _extract_tags({"recipeCuisine": "Italian", "name": "Pasta"})
        assert "Italian" in result

    def test_from_keywords(self):
        """Comma-separated ``keywords`` are split into individual tags."""
        result = _extract_tags({"keywords": "easy, quick, dinner", "name": "Pasta"})
        assert "easy" in result
        assert "quick" in result

    def test_infer_dinner(self):
        """A "dinner" tag is inferred from the recipe name alone."""
        result = _extract_tags({"name": "Chicken Stir Fry"})
        assert "dinner" in result

    def test_dedup_limit(self):
        """No more than eight tags are returned."""
        result = _extract_tags({"keywords": "a,b,c,d,e,f,g,h,i", "name": "Test"})
        assert len(result) <= 8

# ---------------------------------------------------------------------------
# Integration test: extract_json_ld on real-world-like HTML
# ---------------------------------------------------------------------------

class TestExtractJsonLdRealistic:
    """Integration-style tests running ``extract_json_ld`` on pages shaped like real recipe sites."""

    def test_grilled_cheese_social_style(self):
        """Simulate the structure of grilledcheesesocial.com ld+json."""
        # NOTE(review): the original fixture markup was lost (only the page
        # title text survived). This payload is reconstructed so that it
        # satisfies the assertions below: 8 ingredients, 5 steps, yield 12,
        # PT20M/PT15M/PT35M, category "Appetizer", cuisine "Southern".
        # Confirm against the real page structure.
        html = '''<html><head>
        <script type="application/ld+json">
        {
          "@context": "https://schema.org",
          "@graph": [
            {"@type": "WebSite", "name": "Grilled Cheese Social", "url": "https://grilledcheesesocial.com"},
            {
              "@type": "Recipe",
              "name": "Funeral Sandwiches",
              "recipeYield": "12",
              "prepTime": "PT20M",
              "cookTime": "PT15M",
              "totalTime": "PT35M",
              "recipeCategory": "Appetizer",
              "recipeCuisine": "Southern",
              "recipeIngredient": [
                "12 Hawaiian sweet rolls",
                "1 lb deli ham, thinly sliced",
                "1/2 lb Swiss cheese, sliced",
                "1/2 cup butter, melted",
                "1 tbsp Dijon mustard",
                "1 tbsp Worcestershire sauce",
                "1 tbsp poppy seeds",
                "1 tbsp dried minced onion"
              ],
              "recipeInstructions": [
                {"@type": "HowToStep", "text": "Preheat the oven."},
                {"@type": "HowToStep", "text": "Slice the rolls and layer ham and cheese."},
                {"@type": "HowToStep", "text": "Whisk the butter, mustard, Worcestershire, poppy seeds and onion."},
                {"@type": "HowToStep", "text": "Pour the butter mixture over the rolls."},
                {"@type": "HowToStep", "text": "Cover and bake until the cheese melts."}
              ]
            }
          ]
        }
        </script>
        </head><body><h1>Funeral Sandwiches</h1></body></html>'''

        result = extract_json_ld(html)
        assert result is not None
        assert result["name"] == "Funeral Sandwiches"
        assert len(result["recipeIngredient"]) == 8
        assert len(result["recipeInstructions"]) == 5

        # Convert to RecipeResult
        recipe = recipe_from_json_ld(result, source_url="https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/")
        assert recipe.title == "Funeral Sandwiches"
        assert recipe.extraction_method == ExtractionMethod.JSON_LD
        assert recipe.servings == "12"
        assert recipe.prep_time == "20 min"
        assert recipe.cook_time == "15 min"
        assert recipe.total_time == "35 min"
        assert "Appetizer" in recipe.tags
        assert "Southern" in recipe.tags

    def test_multiple_ld_json_blocks(self):
        """Test that we skip non-Recipe blocks and find the Recipe."""
        html = '''<html><head>
        <script type="application/ld+json">
        {"@context": "https://schema.org", "@type": "BreadcrumbList", "itemListElement": [{"@type": "ListItem", "position": 1, "name": "Home"}]}
        </script>
        <script type="application/ld+json">
        {"@context": "https://schema.org", "@type": "Recipe", "name": "Test", "recipeIngredient": ["eggs"], "recipeInstructions": "Cook."}
        </script>
        </head><body></body></html>'''

        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"

    def test_recipe_in_graph_with_other_items(self):
        """Test @graph containing both Recipe and other types."""
        html = '''<html><head>
        <script type="application/ld+json">
        {
          "@context": "https://schema.org",
          "@graph": [
            {"@type": "WebSite", "name": "Example", "url": "https://example.com"},
            {"@type": "BreadcrumbList", "itemListElement": []},
            {"@type": "Recipe", "name": "Pasta", "recipeIngredient": ["pasta", "sauce"], "recipeInstructions": "Boil pasta."}
          ]
        }
        </script>
        </head><body></body></html>'''

        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"
        assert result["name"] == "Pasta"

# ---------------------------------------------------------------------------
# Test: RecipeResult.to_dict()
# ---------------------------------------------------------------------------

class TestRecipeResultToDict:
    """Unit tests for ``RecipeResult.to_dict`` on successful and failed results."""

    def test_success_dict(self):
        """A successful result serialises its title and method, with no ``error`` key."""
        result = RecipeResult(
            title="Test Recipe",
            ingredients=["eggs", "flour"],
            instructions=["Mix", "Bake"],
            extraction_method=ExtractionMethod.JSON_LD,
            source_url="https://example.com",
        )
        d = result.to_dict()
        assert d["title"] == "Test Recipe"
        assert d["extracted_from"] == "json_ld"
        assert "error" not in d

    def test_error_dict(self):
        """A failed result serialises its error message and the "failed" method."""
        result = RecipeResult(
            extraction_method=ExtractionMethod.FAILED,
            error="Could not fetch HTML",
            source_url="https://example.com",
        )
        d = result.to_dict()
        assert d["error"] == "Could not fetch HTML"
        assert d["extracted_from"] == "failed"

# ---------------------------------------------------------------------------
# Live integration test (skipped unless the LIVE_TEST environment variable is set)
# ---------------------------------------------------------------------------

class TestLiveExtraction:
    """Network-backed tests for ``extract_recipe``.

    Each test is skipped unless the ``LIVE_TEST`` environment variable is set
    to a non-empty value.
    """

    @pytest.mark.skipif(
        not os.environ.get("LIVE_TEST"),
        reason="Set LIVE_TEST=1 to run live extraction tests"
    )
    def test_grilledcheesesocial_json_ld(self):
        """Live test: verify JSON-LD extraction on grilledcheesesocial.com."""
        url = "https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/"
        result = extract_recipe(url, total_timeout=30)

        assert result.error is None, f"Extraction failed: {result.error}"
        assert result.extraction_method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {result.extraction_method}"
        assert "Funeral Sandwiches" in result.title or "funeral" in result.title.lower(), \
            f"Unexpected title: {result.title}"
        assert len(result.ingredients) > 0, "No ingredients found"
        assert len(result.instructions) > 0, "No instructions found"

    @pytest.mark.skipif(
        not os.environ.get("LIVE_TEST"),
        reason="Set LIVE_TEST=1 to run live extraction tests"
    )
    def test_allrecipes_json_ld(self):
        """Live test: verify JSON-LD on a major recipe site."""
        url = "https://www.allrecipes.com/recipe/228285/classic-funeral-sandwiches/"
        result = extract_recipe(url, total_timeout=30)

        assert result.error is None, f"Extraction failed: {result.error}"
        assert result.extraction_method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {result.extraction_method}"
        assert len(result.ingredients) > 0, "No ingredients found"

# Allow running this file directly: hand this module's own path to pytest in
# verbose mode. The dunder names are required; bare ``name`` / ``file`` are
# undefined and raise NameError as soon as the module is imported.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])

# (web-viewer navigation residue: "← Back")