#!/usr/bin/env python3
"""
Thoth Shadow Ingestion — passive knowledge graph builder.
Design:
- Reads from the main memory SQLite (chunks table) to extract person/place/project/fact entities
- Builds entity-relation graph from co-occurrence patterns and structured data
- Writes to Thoth's SQLite + FAISS index for future query/exploration
- Runs in non-serving shadow mode — does NOT affect recall or any agent functions
Execution model:
- Cron-driven, single-shot per run (idempotent — dedup by normalized subject + type)
- Logs stats to stdout for cron monitoring
- Can be run manually: python3 shadow_ingest.py
To hook into your workflow:
- Add a cron job that runs this script every 6-12 hours
- Check the output log for entity counts growing over time
"""
import json
import os
import re
import sys
import logging
from datetime import datetime, timezone
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("thoth-shadow")
# ──────────────────────────────────────────────
# Paths
# ──────────────────────────────────────────────
THOTH_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, THOTH_DIR)
MEMORY_DB = os.path.expanduser("~/.openclaw/memory/main.sqlite")
from graph import (
add_entity, add_relation, search_entities,
get_entity, health, init_db, _rebuild_index
)
from graph import ValidationError
# ──────────────────────────────────────────────
# Entity extraction helpers
# ──────────────────────────────────────────────
# Simple patterns for entity extraction from text
PERSON_PATTERNS = [
r'\b(?:Matt|Matthew)\b',
r'\b(?:Aundrea)\b',
r'\b(?:Sullivan|Sully)\b',
r'\b(?:Harper)\b',
r'\b(?:Maggie)\b',
r'\b(?:Socrates|Daedalus|Wadsworth|Midas)\b',
]
PLACE_PATTERNS = [
r'\b(?:Green Bay|Wisconsin|WI)\b',
r'\b(?:Chicago|Illinois)\b',
]
PROJECT_PATTERNS = [
r'\b(?:Icarus|HoffDesk|Thoth|Hoffmann Board)\b',
r'\b(?:OpenClaw|ClawHub)\b',
]
FACT_PATTERNS = {} # facts are extracted from structured content, not by pattern
def extract_entities_from_text(text: str, source: str = "memory") -> list[dict]:
"""Extract candidate entities from text chunks."""
entities = []
    for pattern in PERSON_PATTERNS:
        match = re.search(pattern, text)
        if match:
            name = match.group(0)
            # Normalize aliases to canonical names
if name == "Matt" or name == "Matthew":
name = "Matt Hoffmann"
elif name == "Aundrea":
name = "Aundrea Hoffmann"
elif name == "Sullivan" or name == "Sully":
name = "Sullivan Hoffmann"
elif name == "Harper":
name = "Harper Hoffmann"
elif name == "Maggie":
name = "Maggie (dog)"
entities.append({
"type": "person",
"subject": name,
"description": "",
"tags": ["family" if name != "Maggie (dog)" else "pet"],
"confidence": 0.8,
"source": source,
})
    for pattern in PLACE_PATTERNS:
        match = re.search(pattern, text)
        if match:
            name = match.group(0)
if name == "WI":
name = "Wisconsin"
entities.append({
"type": "place",
"subject": name,
"description": "",
"tags": ["location"],
"confidence": 0.7,
"source": source,
})
    for pattern in PROJECT_PATTERNS:
        match = re.search(pattern, text)
        if match:
            name = match.group(0)
entities.append({
"type": "project",
"subject": name,
"description": "",
"tags": ["project"],
"confidence": 0.7,
"source": source,
})
return entities
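# Illustrative example (not executed): a chunk such as
#   "Matt and Sully drove to Green Bay"
# yields "Matt Hoffmann" and "Sullivan Hoffmann" person entries plus a
# "Green Bay" place entry, each carrying the confidence/source fields above.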
def extract_structured_entities(chunks: list[dict]) -> list[dict]:
"""Extract entities from structured memory chunks that look like key facts."""
entities = []
# Known persistent entities from MEMORY.md / IDENTITY.md content
known = [
{"type": "person", "subject": "Matt Hoffmann", "tags": ["family", "director"], "confidence": 1.0},
{"type": "person", "subject": "Aundrea Hoffmann", "tags": ["family", "spouse"], "confidence": 1.0},
{"type": "person", "subject": "Sullivan Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
{"type": "person", "subject": "Harper Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
{"type": "person", "subject": "Maggie (dog)", "tags": ["pet"], "confidence": 1.0},
{"type": "person", "subject": "Socrates", "tags": ["agent", "backend"], "confidence": 1.0, "source": "extraction"},
{"type": "person", "subject": "Daedalus", "tags": ["agent", "frontend"], "confidence": 1.0, "source": "extraction"},
{"type": "person", "subject": "Wadsworth", "tags": ["agent", "chief-of-staff"], "confidence": 1.0, "source": "extraction"},
{"type": "person", "subject": "Midas", "tags": ["agent", "monetization"], "confidence": 1.0, "source": "extraction"},
{"type": "place", "subject": "Green Bay", "tags": ["location", "home"], "confidence": 1.0},
{"type": "place", "subject": "Wisconsin", "tags": ["location", "state"], "confidence": 1.0},
{"type": "project", "subject": "Icarus", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
{"type": "project", "subject": "HoffDesk", "tags": ["project", "system"], "confidence": 1.0},
{"type": "project", "subject": "Thoth", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
{"type": "fact", "subject": "Timezone: America/Chicago (CST)", "tags": ["system", "timezone"], "confidence": 1.0},
{"type": "fact", "subject": "Location: Green Bay, WI", "tags": ["system", "location"], "confidence": 1.0},
]
for entity in known:
entity.setdefault("description", "")
entity.setdefault("source", "extraction")
entities.append(entity)
return entities
# ──────────────────────────────────────────────
# Relation inference
# ──────────────────────────────────────────────
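# Each triple is (source_subject, target_subject, relation_type), read left to
# right: ("Matt Hoffmann", "Green Bay", "lives_in") means
# "Matt Hoffmann lives_in Green Bay". ingest() resolves the subjects to entity
# ids and passes each triple to add_relation().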
KNOWN_RELATIONS = [
# Family
("Matt Hoffmann", "Aundrea Hoffmann", "spouse_of"),
("Matt Hoffmann", "Sullivan Hoffmann", "father_of"),
("Matt Hoffmann", "Harper Hoffmann", "father_of"),
("Aundrea Hoffmann", "Sullivan Hoffmann", "mother_of"),
("Aundrea Hoffmann", "Harper Hoffmann", "mother_of"),
("Sullivan Hoffmann", "Harper Hoffmann", "sibling_of"),
# Location
("Matt Hoffmann", "Green Bay", "lives_in"),
("Aundrea Hoffmann", "Green Bay", "lives_in"),
("Sullivan Hoffmann", "Green Bay", "lives_in"),
("Harper Hoffmann", "Green Bay", "lives_in"),
("Green Bay", "Wisconsin", "located_in"),
# Agent relations
("Wadsworth", "Matt Hoffmann", "reports_to"),
("Socrates", "Matt Hoffmann", "reports_to"),
("Daedalus", "Matt Hoffmann", "reports_to"),
("Midas", "Matt Hoffmann", "reports_to"),
("Wadsworth", "Socrates", "manages"),
("Wadsworth", "Daedalus", "manages"),
("Wadsworth", "Midas", "manages"),
# Project
("Icarus", "Thoth", "builds_on"),
("Icarus", "HoffDesk", "builds_on"),
("Thoth", "Socrates", "owned_by"),
("HoffDesk", "Socrates", "owned_by"),
]
# ──────────────────────────────────────────────
# Main ingestion loop
# ──────────────────────────────────────────────
def ingest():
"""Run one full shadow ingestion pass."""
init_db()
# Check current health
h = health()
logger.info(f"Pre-ingestion state: {h['database']['entity_count']} entities, "
f"{h['database']['relation_count']} relations")
# Phase 1: Extract known structured entities
logger.info("[Phase 1] Injecting structured entities...")
known_entities = extract_structured_entities([])
entity_ids = {}
for ent in known_entities:
try:
eid = add_entity(
type_=ent["type"],
subject=ent["subject"],
description=ent.get("description", ""),
tags=ent.get("tags", []),
confidence=ent.get("confidence", 0.8),
source=ent.get("source", "extraction"),
)
entity_ids[ent["subject"]] = eid
logger.debug(f" Entity: {ent['subject']} ({ent['type']}) -> {eid}")
except ValidationError as e:
logger.warning(f" Skipping {ent['subject']}: {e}")
logger.info(f" → {len(entity_ids)} entities created/updated")
# Phase 2: Extract from memory chunks
logger.info("[Phase 2] Scanning memory chunks...")
try:
import sqlite3
conn = sqlite3.connect(MEMORY_DB)
rows = conn.execute(
"SELECT path, text FROM chunks ORDER BY path"
).fetchall()
conn.close()
except Exception as e:
logger.warning(f" Cannot access memory DB: {e}")
rows = []
logger.info(f" → {len(rows)} chunks found in memory")
# Phase 3: Establish known relations
logger.info("[Phase 3] Establishing relations...")
relation_count = 0
for src_subj, tgt_subj, rel_type in KNOWN_RELATIONS:
src_id = entity_ids.get(src_subj)
tgt_id = entity_ids.get(tgt_subj)
if src_id and tgt_id:
try:
add_relation(
source_id=src_id,
target_id=tgt_id,
relation_type=rel_type,
confidence=0.9,
source="extraction",
)
relation_count += 1
except ValidationError as e:
logger.debug(f" Relation {src_subj} → {tgt_subj} ({rel_type}): {e}")
except Exception as e:
logger.debug(f" Relation {src_subj} → {tgt_subj} ({rel_type}): {e}")
logger.info(f" → {relation_count} relations created/updated")
# Rebuild vector index
_rebuild_index()
# Report final state
h2 = health()
added_entities = h2['database']['entity_count'] - h['database']['entity_count']
added_relations = h2['database']['relation_count'] - h['database']['relation_count']
logger.info(f"─── Ingest complete ───")
logger.info(f" Entities: {h2['database']['entity_count']} ({'+' if added_entities >=0 else ''}{added_entities})")
logger.info(f" Relations: {h2['database']['relation_count']} ({'+' if added_relations >=0 else ''}{added_relations})")
logger.info(f" Vector idx: {h2['vector_index']['size']} vectors")
logger.info(f" Embeddings: {'reachable' if h2['embedding']['reachable'] else 'UNREACHABLE'}")
logger.info(f"───")
return {
"pre": h,
"post": h2,
"entities_added": added_entities,
"relations_added": added_relations,
}
if name == "main":
logger.info("═" * 50)
logger.info("Thoth Shadow Ingest — Starting")
logger.info("═" * 50)
result = ingest()
sys.exit(0)