#!/usr/bin/env python3
"""
Thoth Shadow Ingestion — passive knowledge graph builder.

**Design:**
- Reads from the main memory SQLite (chunks table) to extract
  person/place/project/fact entities
- Builds an entity-relation graph from co-occurrence patterns and structured data
- Writes to Thoth's SQLite + FAISS index for future query/exploration
- Runs in non-serving shadow mode — does NOT affect recall or any agent functions

**Execution model:**
- Cron-driven, single-shot per run (idempotent — dedup by normalized subject + type)
- Logs stats to stdout for cron monitoring
- Can be run manually: `python3 shadow_ingest.py`

**To hook into your workflow:**
- Add a cron job that runs this script every 6-12 hours
- Check the output log for entity counts growing over time
"""

import json
import logging
import os
import re
import sys
from datetime import datetime, timezone

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("thoth-shadow")

# ──────────────────────────────────────────────
# Paths
# ──────────────────────────────────────────────
THOTH_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, THOTH_DIR)  # graph.py lives next to this script

MEMORY_DB = os.path.expanduser("~/.openclaw/memory/main.sqlite")

from graph import (  # noqa: E402 — import after sys.path setup
    ValidationError,
    _rebuild_index,
    add_entity,
    add_relation,
    get_entity,
    health,
    init_db,
    search_entities,
)

# ──────────────────────────────────────────────
# Entity extraction helpers
# ──────────────────────────────────────────────

# Simple patterns for entity extraction from text
PERSON_PATTERNS = [
    r'\b(?:Matt|Matthew)\b',
    r'\b(?:Aundrea)\b',
    r'\b(?:Sullivan|Sully)\b',
    r'\b(?:Harper)\b',
    r'\b(?:Maggie)\b',
    r'\b(?:Socrates|Daedalus|Wadsworth|Midas)\b',
]

PLACE_PATTERNS = [
    r'\b(?:Green Bay|Wisconsin|WI)\b',
    r'\b(?:Chicago|Illinois)\b',
]

PROJECT_PATTERNS = [
    r'\b(?:Icarus|HoffDesk|Thoth|Hoffmann Board)\b',
    r'\b(?:OpenClaw|ClawHub)\b',
]

FACT_PATTERNS = {}  # facts are extracted from structured content, not by pattern

AGENT_NAMES = {"Socrates", "Daedalus", "Wadsworth", "Midas"}


def extract_entities_from_text(text: str, source: str = "memory") -> list[dict]:
    """Extract candidate entities from text chunks."""
    entities = []

    for pattern in PERSON_PATTERNS:
        match = re.search(pattern, text)
        if match is None:
            continue
        name = match.group(0)
        # Normalize aliases to canonical subjects
        if name in ("Matt", "Matthew"):
            name = "Matt Hoffmann"
        elif name == "Aundrea":
            name = "Aundrea Hoffmann"
        elif name in ("Sullivan", "Sully"):
            name = "Sullivan Hoffmann"
        elif name == "Harper":
            name = "Harper Hoffmann"
        elif name == "Maggie":
            name = "Maggie (dog)"
        # Tag by category — agents are people in the graph but not family
        if name in AGENT_NAMES:
            tags = ["agent"]
        elif name == "Maggie (dog)":
            tags = ["pet"]
        else:
            tags = ["family"]
        entities.append({
            "type": "person",
            "subject": name,
            "description": "",
            "tags": tags,
            "confidence": 0.8,
            "source": source,
        })

    for pattern in PLACE_PATTERNS:
        match = re.search(pattern, text)
        if match is None:
            continue
        name = match.group(0)
        if name == "WI":
            name = "Wisconsin"
        entities.append({
            "type": "place",
            "subject": name,
            "description": "",
            "tags": ["location"],
            "confidence": 0.7,
            "source": source,
        })

    for pattern in PROJECT_PATTERNS:
        match = re.search(pattern, text)
        if match is None:
            continue
        entities.append({
            "type": "project",
            "subject": match.group(0),
            "description": "",
            "tags": ["project"],
            "confidence": 0.7,
            "source": source,
        })

    return entities
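# A minimal usage sketch (illustrative, not executed at import time) — given
# the patterns above, a sentence mentioning two family members and a place
# yields three candidate entities:
#
#   >>> ents = extract_entities_from_text("Matt and Sully drove to Green Bay")
#   >>> sorted(e["subject"] for e in ents)
#   ['Green Bay', 'Matt Hoffmann', 'Sullivan Hoffmann']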
def extract_structured_entities(chunks: list[dict]) -> list[dict]:
    """Extract entities from structured memory chunks that look like key facts.

    Note: `chunks` is currently unused — the seed list below is static,
    sourced from MEMORY.md / IDENTITY.md content.
    """
    entities = []

    # Known persistent entities from MEMORY.md / IDENTITY.md content
    known = [
        {"type": "person", "subject": "Matt Hoffmann", "tags": ["family", "director"], "confidence": 1.0},
        {"type": "person", "subject": "Aundrea Hoffmann", "tags": ["family", "spouse"], "confidence": 1.0},
        {"type": "person", "subject": "Sullivan Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
        {"type": "person", "subject": "Harper Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
        {"type": "person", "subject": "Maggie (dog)", "tags": ["pet"], "confidence": 1.0},
        {"type": "person", "subject": "Socrates", "tags": ["agent", "backend"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Daedalus", "tags": ["agent", "frontend"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Wadsworth", "tags": ["agent", "chief-of-staff"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Midas", "tags": ["agent", "monetization"], "confidence": 1.0, "source": "extraction"},
        {"type": "place", "subject": "Green Bay", "tags": ["location", "home"], "confidence": 1.0},
        {"type": "place", "subject": "Wisconsin", "tags": ["location", "state"], "confidence": 1.0},
        {"type": "project", "subject": "Icarus", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
        {"type": "project", "subject": "HoffDesk", "tags": ["project", "system"], "confidence": 1.0},
        {"type": "project", "subject": "Thoth", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
        {"type": "fact", "subject": "Timezone: America/Chicago (CST)", "tags": ["system", "timezone"], "confidence": 1.0},
        {"type": "fact", "subject": "Location: Green Bay, WI", "tags": ["system", "location"], "confidence": 1.0},
    ]

    for entity in known:
        entity.setdefault("description", "")
        entity.setdefault("source", "extraction")
        entities.append(entity)

    return entities


# ──────────────────────────────────────────────
# Relation inference
# ──────────────────────────────────────────────

KNOWN_RELATIONS = [
    # Family
    ("Matt Hoffmann", "Aundrea Hoffmann", "spouse_of"),
    ("Matt Hoffmann", "Sullivan Hoffmann", "father_of"),
    ("Matt Hoffmann", "Harper Hoffmann", "father_of"),
    ("Aundrea Hoffmann", "Sullivan Hoffmann", "mother_of"),
    ("Aundrea Hoffmann", "Harper Hoffmann", "mother_of"),
    ("Sullivan Hoffmann", "Harper Hoffmann", "sibling_of"),
    # Location
    ("Matt Hoffmann", "Green Bay", "lives_in"),
    ("Aundrea Hoffmann", "Green Bay", "lives_in"),
    ("Sullivan Hoffmann", "Green Bay", "lives_in"),
    ("Harper Hoffmann", "Green Bay", "lives_in"),
    ("Green Bay", "Wisconsin", "located_in"),
    # Agent relations
    ("Wadsworth", "Matt Hoffmann", "reports_to"),
    ("Socrates", "Matt Hoffmann", "reports_to"),
    ("Daedalus", "Matt Hoffmann", "reports_to"),
    ("Midas", "Matt Hoffmann", "reports_to"),
    ("Wadsworth", "Socrates", "manages"),
    ("Wadsworth", "Daedalus", "manages"),
    ("Wadsworth", "Midas", "manages"),
    # Project
    ("Icarus", "Thoth", "builds_on"),
    ("Icarus", "HoffDesk", "builds_on"),
    ("Thoth", "Socrates", "owned_by"),
    ("HoffDesk", "Socrates", "owned_by"),
]
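# Phase 2 of ingest() below reads the main memory DB via
# "SELECT path, text FROM chunks". It therefore assumes a table shaped at
# least like this (a sketch — the real schema may carry more columns):
#
#   CREATE TABLE chunks (
#       path TEXT,   -- source file the chunk was extracted from
#       text TEXT    -- raw chunk content scanned for entity mentions
#   );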
# ──────────────────────────────────────────────
# Main ingestion loop
# ──────────────────────────────────────────────

def ingest():
    """Run one full shadow ingestion pass."""
    init_db()

    # Check current health
    h = health()
    logger.info(f"Pre-ingestion state: {h['database']['entity_count']} entities, "
                f"{h['database']['relation_count']} relations")

    # Phase 1: Inject known structured entities
    logger.info("[Phase 1] Injecting structured entities...")
    known_entities = extract_structured_entities([])
    entity_ids = {}
    for ent in known_entities:
        try:
            eid = add_entity(
                type_=ent["type"],
                subject=ent["subject"],
                description=ent.get("description", ""),
                tags=ent.get("tags", []),
                confidence=ent.get("confidence", 0.8),
                source=ent.get("source", "extraction"),
            )
            entity_ids[ent["subject"]] = eid
            logger.debug(f"  Entity: {ent['subject']} ({ent['type']}) -> {eid}")
        except ValidationError as e:
            logger.warning(f"  Skipping {ent['subject']}: {e}")
    logger.info(f"  → {len(entity_ids)} entities created/updated")

    # Phase 2: Extract from memory chunks
    logger.info("[Phase 2] Scanning memory chunks...")
    try:
        import sqlite3
        conn = sqlite3.connect(MEMORY_DB)
        rows = conn.execute(
            "SELECT path, text FROM chunks ORDER BY path"
        ).fetchall()
        conn.close()
    except Exception as e:
        logger.warning(f"  Cannot access memory DB: {e}")
        rows = []
    logger.info(f"  → {len(rows)} chunks found in memory")

    # Feed each chunk through the pattern extractor; add_entity dedups by
    # normalized subject + type, so repeat mentions across chunks are idempotent.
    mention_count = 0
    for _path, text in rows:
        for ent in extract_entities_from_text(text or ""):
            try:
                eid = add_entity(
                    type_=ent["type"],
                    subject=ent["subject"],
                    description=ent.get("description", ""),
                    tags=ent.get("tags", []),
                    confidence=ent.get("confidence", 0.8),
                    source=ent.get("source", "memory"),
                )
                entity_ids.setdefault(ent["subject"], eid)
                mention_count += 1
            except ValidationError as e:
                logger.debug(f"  Skipping {ent['subject']}: {e}")
    logger.info(f"  → {mention_count} entity mentions ingested from chunks")

    # Phase 3: Establish known relations
    logger.info("[Phase 3] Establishing relations...")
    relation_count = 0
    for src_subj, tgt_subj, rel_type in KNOWN_RELATIONS:
        src_id = entity_ids.get(src_subj)
        tgt_id = entity_ids.get(tgt_subj)
        if src_id and tgt_id:
            try:
                add_relation(
                    source_id=src_id,
                    target_id=tgt_id,
                    relation_type=rel_type,
                    confidence=0.9,
                    source="extraction",
                )
                relation_count += 1
            except Exception as e:  # ValidationError included — same handling
                logger.debug(f"  Relation {src_subj} → {tgt_subj} ({rel_type}): {e}")
    logger.info(f"  → {relation_count} relations created/updated")

    # Rebuild vector index
    _rebuild_index()

    # Report final state
    h2 = health()
    added_entities = h2["database"]["entity_count"] - h["database"]["entity_count"]
    added_relations = h2["database"]["relation_count"] - h["database"]["relation_count"]
    logger.info("─── Ingest complete ───")
    logger.info(f"  Entities:   {h2['database']['entity_count']} ({added_entities:+d})")
    logger.info(f"  Relations:  {h2['database']['relation_count']} ({added_relations:+d})")
    logger.info(f"  Vector idx: {h2['vector_index']['size']} vectors")
    logger.info(f"  Embeddings: {'reachable' if h2['embedding']['reachable'] else 'UNREACHABLE'}")
    logger.info("───")

    return {
        "pre": h,
        "post": h2,
        "entities_added": added_entities,
        "relations_added": added_relations,
    }


if __name__ == "__main__":
    logger.info("═" * 50)
    logger.info("Thoth Shadow Ingest — Starting")
    logger.info("═" * 50)
    ingest()
    sys.exit(0)
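# Example cron wiring (a sketch — the install path and log destination are
# placeholders; the docstring above suggests a 6-12 hour cadence):
#
#   # m h dom mon dow  command
#   0 */6 * * * /usr/bin/python3 /path/to/thoth/shadow_ingest.py >> ~/thoth-shadow.log 2>&1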