# test_recipe_extractor.py

"""Tests for the waterfall recipe extractor.

Test case: https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/
Expected: JSON-LD sniper finds Recipe schema, returns ingredients without LLM call.
"""

import json
import os
import sys
import pytest

# Add the parent directory to the import path so the package under test
# resolves when this file is run from inside the tests directory.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from bs4 import BeautifulSoup

from costco_route.extractors.recipe import (
extract_json_ld,
extract_recipe,
extract_recipe_waterfall,
recipe_from_json_ld,
RecipeResult,
ExtractionMethod,
_flatten_ld_json,
_is_recipe_type,
_fix_json_newlines,
_extract_html_chunk,
_parse_servings,
_parse_duration,
_clean_ingredients,
_parse_instructions,
_extract_tags,
)

# ---------------------------------------------------------------------------
# Sample ld+json data for unit tests
# ---------------------------------------------------------------------------
#
# NOTE(review): the markup inside these fixtures was lost (the literals were
# empty apart from bare text). The payloads below are reconstructed so that
# they satisfy the assertions in TestExtractJsonLd (Recipe named
# "Funeral Sandwiches" with six ingredients, a @graph variant, and a
# top-level-array variant) -- confirm against the upstream fixtures.

# A single <script> whose top-level object is the Recipe itself.
SAMPLE_LD_JSON_DIRECT = '''
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Recipe",
  "name": "Funeral Sandwiches",
  "recipeYield": "12 sandwiches",
  "prepTime": "PT15M",
  "cookTime": "PT15M",
  "totalTime": "PT30M",
  "recipeCategory": "Appetizer",
  "recipeIngredient": [
    "12 Hawaiian rolls",
    "1 lb deli ham",
    "8 slices Swiss cheese",
    "1/2 cup butter, melted",
    "1 tbsp Dijon mustard",
    "1 tbsp poppy seeds"
  ],
  "recipeInstructions": [
    {"@type": "HowToStep", "text": "Preheat oven."},
    {"@type": "HowToStep", "text": "Assemble the sandwiches."},
    {"@type": "HowToStep", "text": "Brush with butter mixture and bake."}
  ]
}
</script>
'''

# A single <script> whose Recipe is nested inside an "@graph" array.
SAMPLE_LD_JSON_GRAPH = '''
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@graph": [
    {"@type": "WebPage", "name": "Funeral Sandwiches - Example Blog"},
    {
      "@type": "Recipe",
      "name": "Funeral Sandwiches",
      "recipeIngredient": ["1 lb deli ham", "8 slices Swiss cheese"],
      "recipeInstructions": [
        {"@type": "HowToStep", "text": "Assemble and bake."}
      ]
    }
  ]
}
</script>
'''

# A single <script> whose top-level JSON value is an array of objects.
SAMPLE_LD_JSON_LIST = '''
<script type="application/ld+json">
[
  {"@context": "https://schema.org", "@type": "WebSite", "name": "Example Blog"},
  {
    "@context": "https://schema.org",
    "@type": "Recipe",
    "name": "Funeral Sandwiches",
    "recipeIngredient": ["1 lb deli ham", "8 slices Swiss cheese"],
    "recipeInstructions": "Assemble and bake."
  }
]
</script>
'''

# A page with recipe text but no ld+json block at all.
SAMPLE_HTML_NO_LD = '''
<html><body>
<h1>Simple Pasta</h1>
<p>Ingredients: 1 lb pasta, 2 cups sauce, 1 tbsp olive oil</p>
<p>Instructions: Boil pasta. Add sauce. Serve.</p>
</body></html>
'''

# A full page that embeds the direct-Recipe ld+json block in its <head>.
SAMPLE_HTML_WITH_LD = f'''
<html><head>
{SAMPLE_LD_JSON_DIRECT}
</head><body>
<h1>Funeral Sandwiches</h1>
</body></html>
'''

# ---------------------------------------------------------------------------
# Test: _flatten_ld_json
# ---------------------------------------------------------------------------


class TestFlattenLdJson:
    """Checks the item lists ``_flatten_ld_json`` returns for dicts, lists and ``@graph`` wrappers."""

    def test_direct_dict(self):
        """A lone dict comes back as a one-element list."""
        data = {"@type": "Recipe", "name": "Test"}
        result = _flatten_ld_json(data)
        assert len(result) == 1
        assert result[0]["@type"] == "Recipe"

    def test_list(self):
        """A top-level list keeps one entry per element."""
        data = [
            {"@type": "WebPage"},
            {"@type": "Recipe", "name": "Test"},
        ]
        result = _flatten_ld_json(data)
        assert len(result) == 2

    def test_graph(self):
        """Items under ``@graph`` are surfaced alongside the wrapping dict."""
        data = {
            "@context": "https://schema.org",
            "@graph": [
                {"@type": "WebPage"},
                {"@type": "Recipe", "name": "Test"},
            ]
        }
        result = _flatten_ld_json(data)
        # Should have parent dict + 2 graph items
        assert len(result) >= 3
        recipe_items = [r for r in result if r.get("@type") == "Recipe"]
        assert len(recipe_items) == 1

    def test_nested_graph(self):
        """A ``@graph`` inside a ``@graph`` item is expanded as well."""
        data = {
            "@context": "https://schema.org",
            "@graph": [
                {"@type": "WebPage"},
                {
                    "@type": "ItemList",
                    "@graph": [
                        {"@type": "Recipe", "name": "Nested"}
                    ]
                }
            ]
        }
        result = _flatten_ld_json(data)
        # Should recursively flatten
        assert len(result) >= 3

# ---------------------------------------------------------------------------
# Test: _is_recipe_type
# ---------------------------------------------------------------------------


class TestIsRecipeType:
    """Checks which inputs ``_is_recipe_type`` accepts as a Recipe node."""

    def test_string_type(self):
        """``@type`` given as the plain string "Recipe" matches."""
        assert _is_recipe_type({"@type": "Recipe"}) is True

    def test_list_type(self):
        """``@type`` given as a list containing "Recipe" matches."""
        assert _is_recipe_type({"@type": ["Recipe", "CreativeWork"]}) is True

    def test_non_recipe(self):
        """A different ``@type`` does not match."""
        assert _is_recipe_type({"@type": "WebPage"}) is False

    def test_no_type(self):
        """A dict with no ``@type`` key does not match."""
        assert _is_recipe_type({}) is False

    def test_non_dict(self):
        """A non-dict input does not match."""
        assert _is_recipe_type("string") is False

# ---------------------------------------------------------------------------
# Test: _fix_json_newlines
# ---------------------------------------------------------------------------


class TestFixJsonNewlines:
    """Checks that ``_fix_json_newlines`` repairs raw control characters inside JSON strings."""

    def test_no_newlines(self):
        """Input without control characters is returned unchanged."""
        assert _fix_json_newlines('{"key": "value"}') == '{"key": "value"}'

    def test_escaped_newline_preserved(self):
        """An already-escaped newline stays escaped."""
        # Already escaped newlines should be preserved
        s = '{"key": "line1\\nline2"}'
        result = _fix_json_newlines(s)
        # The \\n should stay as \\n
        assert "\\n" in result

    def test_unescaped_newline_fixed(self):
        """A raw newline inside a string value becomes parseable JSON."""
        # Unescaped newline inside a string should be fixed
        s = '{"key": "line1\nline2"}'
        result = _fix_json_newlines(s)
        # Should now be parseable JSON
        data = json.loads(result)
        assert data["key"] == "line1\nline2"

    def test_tab_inside_string(self):
        """A raw tab inside a string value becomes parseable JSON."""
        s = '{"key": "col1\tcol2"}'
        result = _fix_json_newlines(s)
        data = json.loads(result)
        assert data["key"] == "col1\tcol2"

# ---------------------------------------------------------------------------
# Test: extract_json_ld
# ---------------------------------------------------------------------------


class TestExtractJsonLd:
    """Checks that ``extract_json_ld`` locates the Recipe node in several ld+json layouts."""

    def test_direct_recipe_type(self):
        """The Recipe is the top-level object of the script block."""
        result = extract_json_ld(SAMPLE_HTML_WITH_LD)
        assert result is not None
        assert result.get("@type") == "Recipe"
        assert result.get("name") == "Funeral Sandwiches"
        assert len(result.get("recipeIngredient", [])) == 6

    def test_graph_structure(self):
        """The Recipe sits inside an ``@graph`` array."""
        html = f"<html><head>{SAMPLE_LD_JSON_GRAPH}</head><body></body></html>"
        result = extract_json_ld(html)
        assert result is not None
        assert result.get("@type") == "Recipe"
        assert result.get("name") == "Funeral Sandwiches"

    def test_list_of_scripts(self):
        """The Recipe is found when using the list-style sample block."""
        html = f"<html><head>{SAMPLE_LD_JSON_LIST}</head><body></body></html>"
        result = extract_json_ld(html)
        assert result is not None
        assert result.get("@type") == "Recipe"

    def test_no_ld_json(self):
        """A page without any ld+json block yields ``None``."""
        result = extract_json_ld(SAMPLE_HTML_NO_LD)
        assert result is None

    def test_broken_json_skipped(self):
        """An unparseable script block is skipped and a later valid one is used."""
        html = '''
        <html><head>
        <script type="application/ld+json">{broken json</script>
        <script type="application/ld+json">{"@type": "Recipe", "name": "Works"}</script>
        </head><body></body></html>
        '''
        result = extract_json_ld(html)
        assert result is not None
        assert result.get("name") == "Works"

    def test_multiple_types_in_array(self):
        """A node whose ``@type`` is a list containing "Recipe" is returned."""
        html = '''
        <html><head>
        <script type="application/ld+json">
        {"@type": ["Recipe", "CreativeWork"], "name": "Multi-type", "recipeIngredient": ["eggs"], "recipeInstructions": "Cook."}
        </script>
        </head><body></body></html>
        '''
        result = extract_json_ld(html)
        assert result is not None
        assert "Recipe" in result.get("@type", [])

# ---------------------------------------------------------------------------
# Test: recipe_from_json_ld
# ---------------------------------------------------------------------------


class TestRecipeFromJsonLd:
    """Checks the ``RecipeResult`` that ``recipe_from_json_ld`` builds from a Recipe dict."""

    def test_full_recipe(self):
        """Every populated ld+json field lands on the matching result attribute."""
        ld = {
            "@type": "Recipe",
            "name": "Funeral Sandwiches",
            "recipeIngredient": [
                "1 lb deli ham",
                "Swiss cheese",
                "Butter",
                "Poppy seeds",
            ],
            "recipeInstructions": [
                {"@type": "HowToStep", "text": "Preheat oven."},
                {"@type": "HowToStep", "text": "Assemble."},
            ],
            "recipeYield": "12 sandwiches",
            "prepTime": "PT15M",
            "cookTime": "PT15M",
            "totalTime": "PT30M",
            "recipeCategory": "Appetizer",
        }
        result = recipe_from_json_ld(ld, source_url="https://example.com/test")
        assert isinstance(result, RecipeResult)
        assert result.title == "Funeral Sandwiches"
        assert result.servings == "12"
        assert result.prep_time == "15 min"
        assert result.cook_time == "15 min"
        assert result.total_time == "30 min"
        assert len(result.ingredients) == 4
        assert len(result.instructions) == 2
        assert result.extraction_method == ExtractionMethod.JSON_LD
        assert result.source_url == "https://example.com/test"

    def test_how_to_section(self):
        """Test that HowToSection items are flattened."""
        ld = {
            "@type": "Recipe",
            "name": "Layered Recipe",
            "recipeIngredient": ["flour", "water"],
            "recipeInstructions": [
                {
                    "@type": "HowToSection",
                    "name": "Prep",
                    "itemListElement": [
                        {"@type": "HowToStep", "text": "Mix flour and water."},
                        {"@type": "HowToStep", "text": "Knead for 5 minutes."},
                    ]
                },
                {
                    "@type": "HowToSection",
                    "name": "Cook",
                    "itemListElement": [
                        {"@type": "HowToStep", "text": "Bake at 350°F."},
                    ]
                }
            ],
        }
        result = recipe_from_json_ld(ld)
        assert len(result.instructions) == 3
        assert result.instructions[0] == "Mix flour and water."

# ---------------------------------------------------------------------------
# Test: _parse_servings
# ---------------------------------------------------------------------------


class TestParseServings:
    """Checks the serving-count string ``_parse_servings`` derives from a yield value."""

    def test_simple_number(self):
        """A bare number passes through."""
        assert _parse_servings("4") == "4"

    def test_serves_prefix(self):
        """A leading "Serves" is dropped."""
        assert _parse_servings("Serves 6") == "6"

    def test_range(self):
        """A numeric range is kept as a range."""
        assert _parse_servings("4-6") == "4-6"

    def test_serves_range(self):
        """A prefixed range keeps only the range."""
        assert _parse_servings("Serves 4-6") == "4-6"

    def test_none(self):
        """``None`` in gives ``None`` out."""
        assert _parse_servings(None) is None

    def test_empty(self):
        """An empty string gives ``None``."""
        assert _parse_servings("") is None

# ---------------------------------------------------------------------------
# Test: _parse_duration
# ---------------------------------------------------------------------------


class TestParseDuration:
    """Checks the human-readable text ``_parse_duration`` produces for ISO-8601 durations."""

    def test_minutes(self):
        """Minutes only."""
        assert _parse_duration("PT30M") == "30 min"

    def test_hours_and_minutes(self):
        """Hours and minutes together."""
        assert _parse_duration("PT1H30M") == "1 hr 30 min"

    def test_hours_only(self):
        """Hours only."""
        assert _parse_duration("PT2H") == "2 hr"

    def test_none(self):
        """``None`` in gives ``None`` out."""
        assert _parse_duration(None) is None

    def test_non_iso(self):
        """Text that is not an ISO duration is returned as-is."""
        # Pass through if not ISO format
        assert _parse_duration("30 minutes") == "30 minutes"

# ---------------------------------------------------------------------------
# Test: _clean_ingredients
# ---------------------------------------------------------------------------


class TestCleanIngredients:
    """Checks how ``_clean_ingredients`` normalises a raw ingredient list."""

    def test_basic(self):
        """Clean entries are returned unchanged."""
        assert _clean_ingredients(["1 cup flour", "2 eggs"]) == ["1 cup flour", "2 eggs"]

    def test_newlines(self):
        """A newline inside an entry is replaced by a space."""
        assert _clean_ingredients(["1 cup\nflour"]) == ["1 cup flour"]

    def test_empty_entries(self):
        """Empty strings are dropped."""
        assert _clean_ingredients(["eggs", "", "flour"]) == ["eggs", "flour"]

    def test_none_entries(self):
        """``None`` entries are dropped."""
        assert _clean_ingredients(["eggs", None, "flour"]) == ["eggs", "flour"]

# ---------------------------------------------------------------------------
# Test: _parse_instructions
# ---------------------------------------------------------------------------


class TestParseInstructions:
    """Checks the step list ``_parse_instructions`` builds from ``recipeInstructions`` values."""

    def test_how_to_steps(self):
        """HowToStep dicts yield their ``text`` in order."""
        instructions = [
            {"@type": "HowToStep", "text": "Step 1"},
            {"@type": "HowToStep", "text": "Step 2"},
        ]
        result = _parse_instructions(instructions)
        assert result == ["Step 1", "Step 2"]

    def test_string_instructions(self):
        """A single string yields at least one step."""
        result = _parse_instructions("Mix. Bake. Serve.")
        assert len(result) >= 1

    def test_mixed_types(self):
        """Dict steps and plain-string steps can be mixed in one list."""
        instructions = [
            {"@type": "HowToStep", "text": "Step 1"},
            "Step 2 as string",
        ]
        result = _parse_instructions(instructions)
        assert len(result) == 2

    def test_empty_list(self):
        """An empty list yields an empty list."""
        assert _parse_instructions([]) == []

# ---------------------------------------------------------------------------
# Test: _extract_html_chunk
# ---------------------------------------------------------------------------


class TestExtractHtmlChunk:
    """Checks the text chunk ``_extract_html_chunk`` pulls out of a page."""

    def test_recipe_container(self):
        """Text inside a recipe container element is present in the chunk."""
        # NOTE(review): the markup of this fixture was lost (only the text
        # "2 cups flour" survived). The container class below is a guess --
        # confirm it matches a selector _extract_html_chunk recognises.
        html = '<html><body><div class="wprm-recipe-container"><p>2 cups flour</p></div></body></html>'
        result = _extract_html_chunk(html)
        assert "2 cups flour" in result

    def test_article_fallback(self):
        """Text inside an ``<article>`` is present in the chunk."""
        html = '<html><body><article><p>Mix everything together</p></article></body></html>'
        result = _extract_html_chunk(html)
        assert "Mix everything together" in result

    def test_script_removed(self):
        """Inline ``<script>`` content is excluded from the chunk."""
        html = '<html><body><article><script>var x = 1;</script><p>Real content</p></article></body></html>'
        result = _extract_html_chunk(html)
        assert "var x" not in result
        assert "Real content" in result

    def test_truncation(self):
        """The chunk is capped at 15000 characters."""
        # Very long content should be truncated
        html = '<html><body><article>' + '<p>Content</p>' * 10000 + '</article></body></html>'
        result = _extract_html_chunk(html)
        assert len(result) <= 15000

# ---------------------------------------------------------------------------
# Test: _extract_tags
# ---------------------------------------------------------------------------


class TestExtractTags:
    """Checks the tag list ``_extract_tags`` derives from a Recipe dict."""

    def test_from_category(self):
        """``recipeCategory`` contributes a tag."""
        result = _extract_tags({"recipeCategory": "Dessert", "name": "Cake"})
        assert "Dessert" in result

    def test_from_cuisine(self):
        """``recipeCuisine`` contributes a tag."""
        result = _extract_tags({"recipeCuisine": "Italian", "name": "Pasta"})
        assert "Italian" in result

    def test_from_keywords(self):
        """A comma-separated ``keywords`` string contributes one tag per keyword."""
        result = _extract_tags({"keywords": "easy, quick, dinner", "name": "Pasta"})
        assert "easy" in result
        assert "quick" in result

    def test_infer_dinner(self):
        """A "dinner" tag is produced for this name even without explicit metadata."""
        result = _extract_tags({"name": "Chicken Stir Fry"})
        assert "dinner" in result

    def test_dedup_limit(self):
        """No more than eight tags are returned."""
        result = _extract_tags({"keywords": "a,b,c,d,e,f,g,h,i", "name": "Test"})
        assert len(result) <= 8

# ---------------------------------------------------------------------------
# Integration test: extract_json_ld on real-world-like HTML
# ---------------------------------------------------------------------------


class TestExtractJsonLdRealistic:
    """Runs ``extract_json_ld`` (and the conversion step) on fuller, page-like HTML."""

    def test_grilled_cheese_social_style(self):
        """Simulate the structure of grilledcheesesocial.com ld+json."""
        # NOTE(review): the ld+json payload of this fixture was lost (only the
        # heading text survived). It is reconstructed here so that it matches
        # the assertions below (8 ingredients, 5 steps, 12 servings, 20/15/35
        # minute timings, Appetizer/Southern tags) -- confirm against the
        # upstream fixture.
        html = '''<html><head>
        <script type="application/ld+json">
        {
          "@context": "https://schema.org",
          "@type": "Recipe",
          "name": "Funeral Sandwiches",
          "recipeYield": "12",
          "prepTime": "PT20M",
          "cookTime": "PT15M",
          "totalTime": "PT35M",
          "recipeCategory": "Appetizer",
          "recipeCuisine": "Southern",
          "recipeIngredient": [
            "12 Hawaiian rolls",
            "1 lb deli ham",
            "8 slices Swiss cheese",
            "1/2 cup butter, melted",
            "2 tbsp Dijon mustard",
            "1 tbsp Worcestershire sauce",
            "1 tbsp dried minced onion",
            "1 tbsp poppy seeds"
          ],
          "recipeInstructions": [
            {"@type": "HowToStep", "text": "Preheat oven to 350 degrees."},
            {"@type": "HowToStep", "text": "Slice the rolls in half and layer with ham and cheese."},
            {"@type": "HowToStep", "text": "Whisk the butter, mustard, Worcestershire, onion and poppy seeds."},
            {"@type": "HowToStep", "text": "Pour the butter mixture over the rolls."},
            {"@type": "HowToStep", "text": "Cover and bake until the cheese is melted."}
          ]
        }
        </script>
        </head><body>
        <h1>Funeral Sandwiches</h1>
        </body></html>'''

        result = extract_json_ld(html)
        assert result is not None
        assert result["name"] == "Funeral Sandwiches"
        assert len(result["recipeIngredient"]) == 8
        assert len(result["recipeInstructions"]) == 5

        # Convert to RecipeResult
        recipe = recipe_from_json_ld(result, source_url="https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/")
        assert recipe.title == "Funeral Sandwiches"
        assert recipe.extraction_method == ExtractionMethod.JSON_LD
        assert recipe.servings == "12"
        assert recipe.prep_time == "20 min"
        assert recipe.cook_time == "15 min"
        assert recipe.total_time == "35 min"
        assert "Appetizer" in recipe.tags
        assert "Southern" in recipe.tags

    def test_multiple_ld_json_blocks(self):
        """Test that we skip non-Recipe blocks and find the Recipe."""
        html = '''<html><head>
        <script type="application/ld+json">
        {"@context": "https://schema.org", "@type": "BreadcrumbList", "itemListElement": [{"@type": "ListItem", "position": 1, "name": "Home"}]}
        </script>
        <script type="application/ld+json">
        {"@context": "https://schema.org", "@type": "Recipe", "name": "Test", "recipeIngredient": ["eggs"], "recipeInstructions": "Cook."}
        </script>
        </head><body></body></html>'''

        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"

    def test_recipe_in_graph_with_other_items(self):
        """Test @graph containing both Recipe and other types."""
        html = '''<html><head>
        <script type="application/ld+json">
        {
          "@context": "https://schema.org",
          "@graph": [
            {"@type": "WebSite", "name": "Example", "url": "https://example.com"},
            {"@type": "BreadcrumbList", "itemListElement": []},
            {"@type": "Recipe", "name": "Pasta", "recipeIngredient": ["pasta", "sauce"], "recipeInstructions": "Boil pasta."}
          ]
        }
        </script>
        </head><body></body></html>'''

        result = extract_json_ld(html)
        assert result is not None
        assert result["@type"] == "Recipe"
        assert result["name"] == "Pasta"

# ---------------------------------------------------------------------------
# Test: RecipeResult.to_dict()
# ---------------------------------------------------------------------------


class TestRecipeResultToDict:
    """Checks the dict that ``RecipeResult.to_dict`` produces for success and failure results."""

    def test_success_dict(self):
        """A successful result carries its title and method and has no ``error`` key."""
        result = RecipeResult(
            title="Test Recipe",
            ingredients=["eggs", "flour"],
            instructions=["Mix", "Bake"],
            extraction_method=ExtractionMethod.JSON_LD,
            source_url="https://example.com",
        )
        d = result.to_dict()
        assert d["title"] == "Test Recipe"
        assert d["extracted_from"] == "json_ld"
        assert "error" not in d

    def test_error_dict(self):
        """A failed result carries its error message and the "failed" method."""
        result = RecipeResult(
            extraction_method=ExtractionMethod.FAILED,
            error="Could not fetch HTML",
            source_url="https://example.com",
        )
        d = result.to_dict()
        assert d["error"] == "Could not fetch HTML"
        assert d["extracted_from"] == "failed"

# ---------------------------------------------------------------------------
# Live integration tests (skipped unless the LIVE_TEST env var is set)
# ---------------------------------------------------------------------------


# The skip condition is applied once at class level so that every live test
# shares the same gate instead of repeating the decorator per method.
@pytest.mark.skipif(
    not os.environ.get("LIVE_TEST"),
    reason="Set LIVE_TEST=1 to run live extraction tests"
)
class TestLiveExtraction:
    """Calls ``extract_recipe`` on real recipe URLs; skipped unless ``LIVE_TEST`` is set."""

    def test_grilledcheesesocial_json_ld(self):
        """Live test: verify JSON-LD extraction on grilledcheesesocial.com."""
        url = "https://grilledcheesesocial.com/2020/01/19/funeral-sandwiches/"
        result = extract_recipe(url, total_timeout=30)

        assert result.error is None, f"Extraction failed: {result.error}"
        assert result.extraction_method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {result.extraction_method}"
        # A case-insensitive match also covers the exact "Funeral Sandwiches" title.
        assert "funeral" in result.title.lower(), \
            f"Unexpected title: {result.title}"
        assert len(result.ingredients) > 0, "No ingredients found"
        assert len(result.instructions) > 0, "No instructions found"

    def test_allrecipes_json_ld(self):
        """Live test: verify JSON-LD on a major recipe site."""
        url = "https://www.allrecipes.com/recipe/228285/classic-funeral-sandwiches/"
        result = extract_recipe(url, total_timeout=30)

        assert result.error is None, f"Extraction failed: {result.error}"
        assert result.extraction_method == ExtractionMethod.JSON_LD, \
            f"Expected json_ld extraction, got {result.extraction_method}"
        assert len(result.ingredients) > 0, "No ingredients found"

# Allow running this file directly: hand this file's own path to pytest in verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])