"""Sanity checks for LLM classification — prevent hallucinations and drops.

Adapts costco_route/pipeline.py validation logic:
- Item name matching → Document hash matching
- Zone defaults → Member keyword inference
- Costco-specific keywords → Family-specific keywords

Sovereign: Zero imports from costco_route.
"""

import difflib
import re
from functools import lru_cache
from typing import Optional

# Family-specific keywords for fallback inference.
# Members are tried in insertion order and the first match wins, so a keyword
# shared by several members (e.g. "work") resolves to the earliest one listed.
MEMBER_KEYWORDS = {
    "sully": ["first grade", "mrs. smith", "sullivan", "sully", "dinosaur", "space", "lego"],
    "harper": ["pre-k", "preschool", "ms. johnson", "harper", "unicorn", "dance", "art"],
    "aundrea": ["hospital", "work", "night shift", "aundrea", "mom"],
    "matt": ["software", "meeting", "work", "matt", "dad"],
}


def validate_classification(
    classified: dict[str, list[dict]],
    original_documents: list[dict],
) -> dict[str, list[dict]]:
    """Remove hallucinated classifications, recover dropped documents.

    The LLM sometimes:
    1. Assigns documents not in original list (hallucination)
    2. Drops documents from original list

    Args:
        classified: {member_id: [document_dicts]} from LLM. Member values that
            are not lists and entries that are not dicts are treated as
            hallucinated output and discarded.
        original_documents: Original document list with content_hash

    Returns:
        Validated classification dict. Members left without any valid
        document are omitted; dropped documents are re-assigned through
        keyword inference (falling back to "family").

    Raises:
        KeyError: If an original document has no "content_hash" key.
    """
    # Build lookup of original docs by content hash
    original_hashes = {doc["content_hash"]: doc for doc in original_documents}
    matched_hashes = set()

    # Filter out hallucinations
    validated: dict[str, list[dict]] = {}
    for member_id, docs in classified.items():
        if not isinstance(docs, (list, tuple)):
            # Malformed LLM output (e.g. null instead of a list): nothing to keep.
            continue
        kept = []
        for doc in docs:
            if not isinstance(doc, dict):
                continue
            doc_hash = doc.get("content_hash")
            if doc_hash in original_hashes:
                matched_hashes.add(doc_hash)
                kept.append(doc)
        if kept:
            validated[member_id] = kept

    # Recover dropped documents by assigning them based on keywords
    for doc_hash, doc in original_hashes.items():
        if doc_hash not in matched_hashes:
            member = _infer_member_from_content(doc)
            validated.setdefault(member, []).append(doc)

    return validated


@lru_cache(maxsize=64)
def _keyword_pattern(keywords: tuple[str, ...]) -> Optional[re.Pattern]:
    """Compile a whole-word, case-insensitive matcher for *keywords*.

    Returns None when there is no non-empty keyword to match.
    """
    alternatives = "|".join(re.escape(kw) for kw in keywords if kw)
    if not alternatives:
        return None
    # Lookarounds instead of plain substring tests so that "art" does not hit
    # "party"/"start" and "mom" does not hit "moment"; the optional trailing
    # "s" keeps simple plurals ("unicorns", "meetings") matching.
    return re.compile(rf"(?<!\w)(?:{alternatives})s?(?!\w)", re.IGNORECASE)


def _infer_member_from_content(doc: dict) -> str:
    """Infer family member from document content (keyword fallback).

    Keywords match case-insensitively as whole words or phrases. Returns the
    first member in MEMBER_KEYWORDS with a matching keyword, or "family" when
    nothing matches or the document has no textual content.
    """
    content = doc.get("content")
    if not isinstance(content, str) or not content:
        return "family"

    for member, keywords in MEMBER_KEYWORDS.items():
        pattern = _keyword_pattern(tuple(keywords))
        if pattern is not None and pattern.search(content):
            return member

    return "family"  # Default — general family document


def validate_confidence(
    classified: dict[str, list[dict]],
    threshold: float = 0.7,
) -> dict[str, list[dict]]:
    """Flag low-confidence classifications for user review.

    Documents are modified in place. A missing, null, non-numeric or NaN
    confidence counts as low confidence. Calling this repeatedly does not
    duplicate the "low_confidence" flag.

    Args:
        classified: {member_id: [document_dicts]}
        threshold: Minimum confidence for auto-acceptance

    Returns:
        Same structure but with _meta.flags added to low-confidence docs
    """
    for docs in classified.values():
        if not isinstance(docs, (list, tuple)):
            continue
        for doc in docs:
            if not isinstance(doc, dict):
                continue

            try:
                confidence = float(doc.get("confidence", 0.0))
            except (TypeError, ValueError):
                confidence = 0.0

            # Written as ">=" so that NaN (which compares False) is flagged too.
            if confidence >= threshold:
                continue

            meta = doc.get("_meta")
            if not isinstance(meta, dict):
                meta = doc["_meta"] = {}
            flags = meta.get("flags")
            if not isinstance(flags, list):
                flags = meta["flags"] = []
            if "low_confidence" not in flags:
                flags.append("low_confidence")
            meta["suggested_action"] = "user_review"

    return classified


def _event_date(event: dict):
    """Return the event's start date, or "" when none is available.

    Prefers start["date"]; otherwise uses the first 10 characters of
    start["dateTime"].
    """
    start = event.get("start") if isinstance(event, dict) else None
    if not isinstance(start, dict):
        return ""
    all_day = start.get("date")
    if all_day:
        return all_day
    date_time = start.get("dateTime")
    return date_time[:10] if isinstance(date_time, str) else ""


def detect_conflicts(
    classified: dict[str, list[dict]],
    calendar_events: Optional[list[dict]] = None,
) -> list[dict]:
    """Detect calendar conflicts for classified documents.

    A conflict is reported for every (document, event) pair whose dates are
    equal. Documents without a "date" and events without a usable start date
    are ignored.

    Args:
        classified: {member_id: [document_dicts]}
        calendar_events: List of upcoming calendar events

    Returns:
        List of conflict descriptions, ordered by member, then document, then
        calendar event order.
    """
    if not calendar_events:
        return []

    # Resolve each event's date once instead of once per document.
    dated_events = []
    for event in calendar_events:
        event_date = _event_date(event)
        if event_date:
            dated_events.append((event_date, event))

    conflicts = []
    for member_id, docs in classified.items():
        if not isinstance(docs, (list, tuple)):
            continue
        for doc in docs:
            if not isinstance(doc, dict):
                continue
            doc_date = doc.get("date")
            if not doc_date:
                continue

            for event_date, event in dated_events:
                if event_date == doc_date:
                    conflicts.append({
                        "document": doc.get("title", "Untitled"),
                        "member": member_id,
                        "conflict_with": event.get("summary", "Unknown event"),
                        "date": doc_date,
                        "severity": "warning",
                    })

    return conflicts