#!/usr/bin/env python3

"""
Thoth Shadow Ingestion — passive knowledge graph builder.

Design:
- Reads from the main memory SQLite (chunks table) to extract person/place/project/fact entities
- Builds entity-relation graph from co-occurrence patterns and structured data
- Writes to Thoth's SQLite + FAISS index for future query/exploration
- Runs as non-serving shadow mode — does NOT affect recall or any agent functions

Execution model:
- Cron-driven, single-shot per run (idempotent — dedup by normalized subject + type)
- Logs stats to stdout for cron monitoring
- Can be run manually: python3 shadow_ingest.py

To hook into your workflow:
- Add a cron job that runs this script every 6-12 hours
- Check the output log for entity counts growing over time
"""

import json
import os
import re
import sys
import logging
from datetime import datetime, timezone

# Configure process-wide logging once at import time; cron captures stdout,
# so the timestamped format below is what shows up in the cron log.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
_LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATEFMT)

# Module logger used by every phase of the ingest run.
logger = logging.getLogger("thoth-shadow")

# ──────────────────────────────────────────────
# Paths
# ──────────────────────────────────────────────

# Resolve the directory this script lives in so sibling modules (graph.py)
# import correctly regardless of cron's working directory.
# BUG FIX: the original used bare `file` (a NameError at import time);
# the module-path dunder is `__file__`.
THOTH_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, THOTH_DIR)

# Main agent memory database — read-only source for entity extraction.
MEMORY_DB = os.path.expanduser("~/.openclaw/memory/main.sqlite")

from graph import (
add_entity, add_relation, search_entities,
get_entity, health, init_db, _rebuild_index
)
from graph import ValidationError

# ──────────────────────────────────────────────
# Entity extraction helpers
# ──────────────────────────────────────────────

# Simple patterns for entity extraction from text

# Regex patterns for each entity type. Each pattern is searched independently;
# the FIRST match per pattern (not per alternative) produces one candidate.
PERSON_PATTERNS = [
    r'\b(?:Matt|Matthew)\b',
    r'\b(?:Aundrea)\b',
    r'\b(?:Sullivan|Sully)\b',
    r'\b(?:Harper)\b',
    r'\b(?:Maggie)\b',
    r'\b(?:Socrates|Daedalus|Wadsworth|Midas)\b',
]

PLACE_PATTERNS = [
    r'\b(?:Green Bay|Wisconsin|WI)\b',
    r'\b(?:Chicago|Illinois)\b',
]

PROJECT_PATTERNS = [
    r'\b(?:Icarus|HoffDesk|Thoth|Hoffmann Board)\b',
    r'\b(?:OpenClaw|ClawHub)\b',
]

FACT_PATTERNS = {}  # facts are extracted from structured content, not by pattern

# Canonical subject names for person aliases found in free text.
_PERSON_ALIASES = {
    "Matt": "Matt Hoffmann",
    "Matthew": "Matt Hoffmann",
    "Aundrea": "Aundrea Hoffmann",
    "Sullivan": "Sullivan Hoffmann",
    "Sully": "Sullivan Hoffmann",
    "Harper": "Harper Hoffmann",
    "Maggie": "Maggie (dog)",
}


def extract_entities_from_text(text: str, source: str = "memory") -> list[dict]:
    """Extract candidate entities from a text chunk.

    Scans *text* against the person/place/project pattern lists above and
    emits at most one candidate per pattern (the first match), with person
    names and the "WI" abbreviation normalized to their canonical forms.

    Args:
        text: Raw text to scan. Empty text yields an empty list.
        source: Provenance label stored on every extracted entity.

    Returns:
        List of entity dicts with keys: type, subject, description, tags,
        confidence, source.
    """
    entities: list[dict] = []

    for pattern in PERSON_PATTERNS:
        # BUG FIX: the original ran re.search twice per pattern; search once.
        match = re.search(pattern, text)
        if not match:
            continue
        name = _PERSON_ALIASES.get(match.group(0), match.group(0))
        entities.append({
            "type": "person",
            "subject": name,
            "description": "",
            # NOTE(review): preserves original behavior — agent names
            # (Socrates et al.) also receive the "family" tag; confirm intended.
            "tags": ["family" if name != "Maggie (dog)" else "pet"],
            "confidence": 0.8,
            "source": source,
        })

    for pattern in PLACE_PATTERNS:
        match = re.search(pattern, text)
        if not match:
            continue
        name = match.group(0)
        if name == "WI":
            name = "Wisconsin"  # normalize state abbreviation
        entities.append({
            "type": "place",
            "subject": name,
            "description": "",
            "tags": ["location"],
            "confidence": 0.7,
            "source": source,
        })

    for pattern in PROJECT_PATTERNS:
        match = re.search(pattern, text)
        if not match:
            continue
        entities.append({
            "type": "project",
            "subject": match.group(0),
            "description": "",
            "tags": ["project"],
            "confidence": 0.7,
            "source": source,
        })

    return entities

def extract_structured_entities(chunks: list[dict]) -> list[dict]:
    """Return the canonical seed entities for the knowledge graph.

    Args:
        chunks: Accepted for interface compatibility but currently unused —
            the seed set is hard-coded from MEMORY.md / IDENTITY.md content
            rather than derived from the chunks. TODO: wire chunk-derived
            facts in here.

    Returns:
        List of entity dicts; every dict is guaranteed to carry
        "description" and "source" keys (defaulted when absent).
    """
    # Known persistent entities from MEMORY.md / IDENTITY.md content.
    known = [
        {"type": "person", "subject": "Matt Hoffmann", "tags": ["family", "director"], "confidence": 1.0},
        {"type": "person", "subject": "Aundrea Hoffmann", "tags": ["family", "spouse"], "confidence": 1.0},
        {"type": "person", "subject": "Sullivan Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
        {"type": "person", "subject": "Harper Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
        {"type": "person", "subject": "Maggie (dog)", "tags": ["pet"], "confidence": 1.0},
        {"type": "person", "subject": "Socrates", "tags": ["agent", "backend"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Daedalus", "tags": ["agent", "frontend"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Wadsworth", "tags": ["agent", "chief-of-staff"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Midas", "tags": ["agent", "monetization"], "confidence": 1.0, "source": "extraction"},
        {"type": "place", "subject": "Green Bay", "tags": ["location", "home"], "confidence": 1.0},
        {"type": "place", "subject": "Wisconsin", "tags": ["location", "state"], "confidence": 1.0},
        {"type": "project", "subject": "Icarus", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
        {"type": "project", "subject": "HoffDesk", "tags": ["project", "system"], "confidence": 1.0},
        {"type": "project", "subject": "Thoth", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
        {"type": "fact", "subject": "Timezone: America/Chicago (CST)", "tags": ["system", "timezone"], "confidence": 1.0},
        {"type": "fact", "subject": "Location: Green Bay, WI", "tags": ["system", "location"], "confidence": 1.0},
    ]

    # Fill in the optional keys so downstream add_entity() calls can rely on them.
    for entity in known:
        entity.setdefault("description", "")
        entity.setdefault("source", "extraction")

    return known

# ──────────────────────────────────────────────
# Relation inference
# ──────────────────────────────────────────────

# Seed relations as (source subject, target subject, relation type) triples.
# Subjects must match the canonical names produced by
# extract_structured_entities(); ingest() resolves them to entity ids and
# silently skips any triple whose endpoints were not created in Phase 1.
KNOWN_RELATIONS = [
# Family
("Matt Hoffmann", "Aundrea Hoffmann", "spouse_of"),
("Matt Hoffmann", "Sullivan Hoffmann", "father_of"),
("Matt Hoffmann", "Harper Hoffmann", "father_of"),
("Aundrea Hoffmann", "Sullivan Hoffmann", "mother_of"),
("Aundrea Hoffmann", "Harper Hoffmann", "mother_of"),
("Sullivan Hoffmann", "Harper Hoffmann", "sibling_of"),
# Location
("Matt Hoffmann", "Green Bay", "lives_in"),
("Aundrea Hoffmann", "Green Bay", "lives_in"),
("Sullivan Hoffmann", "Green Bay", "lives_in"),
("Harper Hoffmann", "Green Bay", "lives_in"),
("Green Bay", "Wisconsin", "located_in"),
# Agent relations
("Wadsworth", "Matt Hoffmann", "reports_to"),
("Socrates", "Matt Hoffmann", "reports_to"),
("Daedalus", "Matt Hoffmann", "reports_to"),
("Midas", "Matt Hoffmann", "reports_to"),
("Wadsworth", "Socrates", "manages"),
("Wadsworth", "Daedalus", "manages"),
("Wadsworth", "Midas", "manages"),
# Project
("Icarus", "Thoth", "builds_on"),
("Icarus", "HoffDesk", "builds_on"),
("Thoth", "Socrates", "owned_by"),
("HoffDesk", "Socrates", "owned_by"),
]

# ──────────────────────────────────────────────
# Main ingestion loop
# ──────────────────────────────────────────────

def ingest() -> dict:
    """Run one full shadow-ingestion pass and return before/after stats.

    Phases:
      1. Seed the graph with the known structured entities.
      2. Scan the main memory DB chunks (currently count-only; extraction
         from chunk text is not yet wired in).
      3. Create the known relations between the seeded entities.
    Finally rebuilds the vector index and logs a summary.

    Idempotent by design — the graph layer dedups by normalized
    subject + type, so re-running only logs zero deltas.

    Returns:
        Dict with keys "pre"/"post" (health snapshots) and
        "entities_added"/"relations_added" (int deltas).
    """
    init_db()

    # Snapshot state before ingesting so we can report deltas at the end.
    pre = health()
    logger.info(
        "Pre-ingestion state: %s entities, %s relations",
        pre["database"]["entity_count"],
        pre["database"]["relation_count"],
    )

    # Phase 1: seed structured entities, remembering subject -> id for Phase 3.
    logger.info("[Phase 1] Injecting structured entities...")
    entity_ids: dict[str, object] = {}
    for ent in extract_structured_entities([]):
        try:
            eid = add_entity(
                type_=ent["type"],
                subject=ent["subject"],
                description=ent.get("description", ""),
                tags=ent.get("tags", []),
                confidence=ent.get("confidence", 0.8),
                source=ent.get("source", "extraction"),
            )
        except ValidationError as e:
            logger.warning("  Skipping %s: %s", ent["subject"], e)
        else:
            entity_ids[ent["subject"]] = eid
            logger.debug("  Entity: %s (%s) -> %s", ent["subject"], ent["type"], eid)

    logger.info("  → %d entities created/updated", len(entity_ids))

    # Phase 2: scan memory chunks. Read-only; a missing/locked DB is
    # non-fatal because this is shadow mode and must never break anything.
    logger.info("[Phase 2] Scanning memory chunks...")
    import sqlite3

    rows = []
    try:
        conn = sqlite3.connect(MEMORY_DB)
        # BUG FIX: the original leaked the connection if execute() raised;
        # close it in a finally block.
        try:
            rows = conn.execute(
                "SELECT path, text FROM chunks ORDER BY path"
            ).fetchall()
        finally:
            conn.close()
    except Exception as e:
        logger.warning("  Cannot access memory DB: %s", e)

    logger.info("  → %d chunks found in memory", len(rows))

    # Phase 3: known relations between the seeded entities.
    logger.info("[Phase 3] Establishing relations...")
    relation_count = 0
    for src_subj, tgt_subj, rel_type in KNOWN_RELATIONS:
        src_id = entity_ids.get(src_subj)
        tgt_id = entity_ids.get(tgt_subj)
        if not (src_id and tgt_id):
            continue  # endpoint was skipped in Phase 1
        try:
            add_relation(
                source_id=src_id,
                target_id=tgt_id,
                relation_type=rel_type,
                confidence=0.9,
                source="extraction",
            )
            relation_count += 1
        # BUG FIX: the original had two byte-identical handlers
        # (ValidationError, then Exception); one broad handler preserves
        # the same debug-and-continue behavior.
        except Exception as e:
            logger.debug("  Relation %s → %s (%s): %s", src_subj, tgt_subj, rel_type, e)

    logger.info("  → %d relations created/updated", relation_count)

    # Rebuild the FAISS index so newly added entities are searchable.
    _rebuild_index()

    # Report final state and deltas.
    post = health()
    added_entities = post["database"]["entity_count"] - pre["database"]["entity_count"]
    added_relations = post["database"]["relation_count"] - pre["database"]["relation_count"]

    logger.info("─── Ingest complete ───")
    logger.info("  Entities:   %s (%s%s)", post["database"]["entity_count"],
                "+" if added_entities >= 0 else "", added_entities)
    logger.info("  Relations:  %s (%s%s)", post["database"]["relation_count"],
                "+" if added_relations >= 0 else "", added_relations)
    logger.info("  Vector idx: %s vectors", post["vector_index"]["size"])
    logger.info("  Embeddings: %s", "reachable" if post["embedding"]["reachable"] else "UNREACHABLE")
    logger.info("───")

    return {
        "pre": pre,
        "post": post,
        "entities_added": added_entities,
        "relations_added": added_relations,
    }

# BUG FIX: the original tested `name == "main"` (NameError at import);
# the entry guard must use the __name__ dunder so the ingest only runs
# when the script is executed directly (manually or via cron).
if __name__ == "__main__":
    logger.info("═" * 50)
    logger.info("Thoth Shadow Ingest — Starting")
    logger.info("═" * 50)
    ingest()
    sys.exit(0)