item_memory.py — HoffDoc

📄 item_memory.py 5,905 bytes Apr 19, 2026 📋 Raw

"""ChromaDB-based learned item location store.

Remembers where items actually are after user calibration.
Overrides LLM classification on future trips.

Two-tier matching:
- User calibrations (source=user_calibration): standard threshold (0.85)
- Instacart seed data (source=instacart): higher threshold (0.93) to prevent
false matches from the 50K product database
"""

import chromadb
from costco_route.config import CHROMA_PATH, CHROMA_COLLECTION, EMBED_MODEL
from costco_route.llm_client import get_embedding

def _get_collection():
"""Get or create the ChromaDB collection."""
client = chromadb.PersistentClient(path=CHROMA_PATH)
return client.get_or_create_collection(
name=CHROMA_COLLECTION,
metadata={"hnsw:space": "cosine"},
)

def learn_item(item: str, zone_id: str, notes: str = ""):
"""Save a learned item location. Overwrites previous entry for same item.

Args:
    item: Item name (e.g., "Kirkland Organic Milk")
    zone_id: Zone ID (01-10)
    notes: Optional notes (e.g., "back corner by bakery")
"""
col = _get_collection()
emb = get_embedding(item)
doc_id = item.lower().strip().replace(" ", "_")[:64]
col.upsert(
    ids=[doc_id],
    embeddings=[emb],
    documents=[item],
    metadatas=[{"zone": zone_id, "notes": notes, "source": "user_calibration"}],
)

def lookup_item(item: str, threshold: float = 0.85) -> dict | None:
"""Check if we have a learned location for an item.

Two-tier matching:
- User calibrations (source=user_calibration): use the standard threshold
- Instacart seed data (source=instacart): require higher similarity (0.93+)
  to prevent false matches from the 50K product database

Args:
    item: Item to look up
    threshold: Cosine similarity threshold for user calibrations (0-1).

Returns:
    Dict with zone, notes, item match — or None if no match found.
"""
col = _get_collection()
if col.count() == 0:
    return None

emb = get_embedding(item)
results = col.query(
    query_embeddings=[emb],
    n_results=5,
    include=["documents", "metadatas", "distances"],
)

if not results["ids"] or not results["ids"][0]:
    return None

# ChromaDB cosine distance: 0 = identical, 2 = opposite
# Convert to similarity: 1 - distance/2
#
# Priority: user_calibration > instacart seed
# Instacart matches need higher similarity to avoid false positives
best_user_match = None
best_seed_match = None

for i, doc_id in enumerate(results["ids"][0]):
    distance = results["distances"][0][i]
    similarity = 1.0 - (distance / 2.0)
    source = results["metadatas"][0][i].get("source", "unknown")

    if source == "user_calibration" and similarity >= threshold:
        if best_user_match is None or similarity > best_user_match["similarity"]:
            best_user_match = {
                "item": results["documents"][0][i],
                "zone": results["metadatas"][0][i]["zone"],
                "notes": results["metadatas"][0][i].get("notes", ""),
                "similarity": round(similarity, 3),
                "source": source,
            }
    elif source == "instacart" and similarity >= 0.93:
        if best_seed_match is None or similarity > best_seed_match["similarity"]:
            best_seed_match = {
                "item": results["documents"][0][i],
                "zone": results["metadatas"][0][i]["zone"],
                "notes": results["metadatas"][0][i].get("notes", ""),
                "similarity": round(similarity, 3),
                "source": source,
            }

# User calibrations always win
return best_user_match or best_seed_match

def lookup_items(items: list[str], threshold: float = 0.85) -> dict[str, dict]:
"""Batch lookup for multiple items.

Returns:
    Dict of item → lookup result (only items with matches).
"""
results = {}
for item in items:
    match = lookup_item(item, threshold)
    if match:
        results[item] = match
return results

def lookup_item_exact(item: str) -> dict | None:
"""Exact match lookup — only returns user calibrations, not seed data.

Use for the /learn command to verify existing calibrations.
"""
col = _get_collection()
doc_id = item.lower().strip().replace(" ", "_")[:64]
try:
    result = col.get(ids=[doc_id], include=["documents", "metadatas"])
    if result["ids"]:
        return {
            "item": result["documents"][0],
            "zone": result["metadatas"][0]["zone"],
            "notes": result["metadatas"][0].get("notes", ""),
        }
except Exception:
    pass
return None

def stats() -> dict:
"""Return stats about the learned item database."""
col = _get_collection()
total = col.count()

# Count by source
user_count = 0
seed_count = 0
try:
    user_results = col.get(where={"source": "user_calibration"}, include=[])
    user_count = len(user_results["ids"])
    seed_results = col.get(where={"source": "instacart"}, include=[])
    seed_count = len(seed_results["ids"])
except Exception:
    pass

return {
    "total_items": total,
    "user_calibrations": user_count,
    "instacart_seeded": seed_count,
    "collection": CHROMA_COLLECTION,
    "path": CHROMA_PATH,
}

def purge_old(months: int = 6) -> dict:
"""Remove entries older than N months.

Costco moves products around, so old calibrations may be stale.
"""
# ChromaDB doesn't have native date filtering on metadata easily,
# so for now this is a placeholder. Could add timestamp metadata.
return {"purged": 0, "remaining": stats()["total_items"]}

← Back