"""Sanity checks for LLM classification — prevent hallucinations and drops.
Adapts costco_route/pipeline.py validation logic:
- Item name matching → Document hash matching
- Zone defaults → Member keyword inference
- Costco-specific keywords → Family-specific keywords
Sovereign: Zero imports from costco_route.
"""
import difflib
import re
from typing import Optional
# Family-specific keywords for fallback inference.
# Maps member id -> lowercase keywords searched for in a document's content
# by _infer_member_from_content; members are tried in the order listed here.
# NOTE(review): "work" appears for both "aundrea" and "matt"; because the
# first matching member wins, "work" alone can never select "matt" — confirm
# that this is intended.
MEMBER_KEYWORDS = {
    "sully": ["first grade", "mrs. smith", "sullivan", "sully", "dinosaur", "space", "lego"],
    "harper": ["pre-k", "preschool", "ms. johnson", "harper", "unicorn", "dance", "art"],
    "aundrea": ["hospital", "work", "night shift", "aundrea", "mom"],
    "matt": ["software", "meeting", "work", "matt", "dad"],
}
def validate_classification(
    classified: dict[str, list[dict]],
    original_documents: list[dict]
) -> dict[str, list[dict]]:
    """Remove hallucinated classifications, recover dropped documents.

    The LLM sometimes:
    1. Assigns documents not in the original list (hallucination)
    2. Drops documents from the original list
    3. Emits malformed entries (non-dict items, a missing list) or repeats
       the same document for one member

    Args:
        classified: {member_id: [document_dicts]} from LLM
        original_documents: Original document list; every entry must carry
            a "content_hash" key.

    Returns:
        Validated classification dict. Members left without any valid
        document are omitted. Documents the LLM dropped are appended under
        the member returned by ``_infer_member_from_content``.

    Raises:
        KeyError: If an original document has no "content_hash" key.
    """
    # Build lookup of original docs by content hash
    original_by_hash = {doc["content_hash"]: doc for doc in original_documents}
    matched_hashes = set()

    # Filter out hallucinations and malformed entries
    validated = {}
    for member_id, docs in classified.items():
        kept = []
        seen_for_member = set()
        # `docs or ()` tolerates a member whose value is None instead of a list.
        for doc in docs or ():
            if not isinstance(doc, dict):
                continue  # LLM emitted something that is not a document
            doc_hash = doc.get("content_hash")
            try:
                known = doc_hash in original_by_hash
            except TypeError:
                # Unhashable hash value (e.g. a list) cannot match an original.
                known = False
            if not known or doc_hash in seen_for_member:
                continue
            seen_for_member.add(doc_hash)
            matched_hashes.add(doc_hash)
            kept.append(doc)
        if kept:
            validated[member_id] = kept

    # Recover dropped documents by assigning each one via keyword inference
    for doc_hash, doc in original_by_hash.items():
        if doc_hash not in matched_hashes:
            member = _infer_member_from_content(doc)
            validated.setdefault(member, []).append(doc)
    return validated
def _infer_member_from_content(doc: dict) -> str:
    """Infer family member from document content (keyword fallback).

    Members in ``MEMBER_KEYWORDS`` are tried in order and the first one with
    a matching keyword wins. A keyword matches only as a whole word or
    phrase (optionally followed by a plural "s"/"es"), so "work" no longer
    matches inside "homework" and "art" no longer matches inside "party".

    Args:
        doc: Document dict; its "content" value is searched
            case-insensitively.

    Returns:
        The matching member id, or "family" when no keyword matches or the
        document has no textual content.
    """
    raw = doc.get("content")
    if not isinstance(raw, str) or not raw:
        # Missing, None, or non-text content cannot be matched.
        return "family"
    content = raw.lower()
    for member, keywords in MEMBER_KEYWORDS.items():
        if any(
            # (?<!\w) / (?!\w) rather than \b so that keywords containing
            # punctuation ("mrs. smith", "pre-k") still anchor correctly.
            re.search(rf"(?<!\w){re.escape(kw)}(?:e?s)?(?!\w)", content)
            for kw in keywords
        ):
            return member
    return "family"  # Default — general family document
def validate_confidence(classified: dict[str, list[dict]], threshold: float = 0.7) -> dict[str, list[dict]]:
    """Flag low-confidence classifications for user review.

    Documents are modified in place. A document whose "confidence" is
    missing, None, non-numeric, or NaN is treated as low confidence. Calling
    the function again does not add a second "low_confidence" flag.

    Args:
        classified: {member_id: [document_dicts]}
        threshold: Minimum confidence for auto-acceptance

    Returns:
        The same ``classified`` object, with ``_meta.flags`` containing
        "low_confidence" and ``_meta.suggested_action`` set to "user_review"
        on every low-confidence document.
    """
    for docs in classified.values():
        for doc in docs:
            try:
                score = float(doc.get("confidence", 0.0))
            except (TypeError, ValueError, OverflowError):
                # None / unparsable confidence is as good as no confidence.
                score = 0.0
            # Written as ">=" so that NaN falls through and gets flagged.
            if score >= threshold:
                continue
            meta = doc.setdefault("_meta", {})
            flags = meta.setdefault("flags", [])
            if "low_confidence" not in flags:
                flags.append("low_confidence")
            meta["suggested_action"] = "user_review"
    return classified
def detect_conflicts(classified: dict[str, list[dict]], calendar_events: Optional[list[dict]] = None) -> list[dict]:
    """Detect calendar conflicts for classified documents.

    A conflict is reported whenever a document's "date" equals the date of
    a calendar event. An event's date is ``start["date"]`` when present,
    otherwise the first 10 characters of ``start["dateTime"]``. Events with
    no usable start date are ignored.

    Args:
        classified: {member_id: [document_dicts]}
        calendar_events: List of upcoming calendar events; None or empty
            yields no conflicts.

    Returns:
        List of conflict descriptions, each with the keys "document",
        "member", "conflict_with", "date" and "severity".
    """
    if not calendar_events:
        return []

    # Resolve every event's date once instead of once per document.
    dated_events = []
    for event in calendar_events:
        # `or {}` / `or ""` guard against explicit None values in the payload.
        start = event.get("start") or {}
        event_date = start.get("date") or (start.get("dateTime") or "")[:10]
        if event_date:
            dated_events.append((event_date, event.get("summary", "Unknown event")))

    conflicts = []
    for member_id, docs in classified.items():
        for doc in docs:
            doc_date = doc.get("date")
            if not doc_date:
                continue
            for event_date, summary in dated_events:
                if event_date == doc_date:
                    conflicts.append({
                        "document": doc.get("title", "Untitled"),
                        "member": member_id,
                        "conflict_with": summary,
                        "date": doc_date,
                        "severity": "warning",
                    })
    return conflicts