#!/usr/bin/env python3
"""
Thoth Shadow Ingestion — passive knowledge graph builder.

**Design:**
- Reads from the main memory SQLite (chunks table) to extract
  person/place/project/fact entities
- Builds an entity-relation graph from co-occurrence patterns and structured data
- Writes to Thoth's SQLite + FAISS index for future query/exploration
- Runs in non-serving shadow mode — does NOT affect recall or any agent functions

**Execution model:**
- Cron-driven, single-shot per run (idempotent — dedup by normalized subject + type)
- Logs stats to stdout for cron monitoring
- Can be run manually: `python3 shadow_ingest.py`

**To hook into your workflow:**
- Add a cron job that runs this script every 6-12 hours
- Check the output log for entity counts growing over time
"""

import json
import logging
import os
import re
import sys
from datetime import datetime, timezone

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("thoth-shadow")

# ──────────────────────────────────────────────
# Paths
# ──────────────────────────────────────────────
THOTH_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, THOTH_DIR)  # graph.py lives next to this script

MEMORY_DB = os.path.expanduser("~/.openclaw/memory/main.sqlite")

from graph import (  # noqa: E402 — import after sys.path setup
    ValidationError,
    _rebuild_index,
    add_entity,
    add_relation,
    get_entity,
    health,
    init_db,
    search_entities,
)

# ──────────────────────────────────────────────
# Entity extraction helpers
# ──────────────────────────────────────────────

# Simple patterns for entity extraction from text
PERSON_PATTERNS = [
    r'\b(?:Matt|Matthew)\b',
    r'\b(?:Aundrea)\b',
    r'\b(?:Sullivan|Sully)\b',
    r'\b(?:Harper)\b',
    r'\b(?:Maggie)\b',
    r'\b(?:Socrates|Daedalus|Wadsworth|Midas)\b',
]

PLACE_PATTERNS = [
    r'\b(?:Green Bay|Wisconsin|WI)\b',
    r'\b(?:Chicago|Illinois)\b',
]

PROJECT_PATTERNS = [
    r'\b(?:Icarus|HoffDesk|Thoth|Hoffmann Board)\b',
    r'\b(?:OpenClaw|ClawHub)\b',
]

FACT_PATTERNS = {}  # facts are extracted from structured content, not by pattern

AGENT_NAMES = {"Socrates", "Daedalus", "Wadsworth", "Midas"}


def extract_entities_from_text(text: str, source: str = "memory") -> list[dict]:
    """Extract candidate entities from text chunks."""
    entities = []

    for pattern in PERSON_PATTERNS:
        match = re.search(pattern, text)
        if match is None:
            continue
        name = match.group(0)
        # Normalize aliases to canonical subjects
        if name in ("Matt", "Matthew"):
            name = "Matt Hoffmann"
        elif name == "Aundrea":
            name = "Aundrea Hoffmann"
        elif name in ("Sullivan", "Sully"):
            name = "Sullivan Hoffmann"
        elif name == "Harper":
            name = "Harper Hoffmann"
        elif name == "Maggie":
            name = "Maggie (dog)"
        # Tag by category — agents are people in the graph but not family
        if name in AGENT_NAMES:
            tags = ["agent"]
        elif name == "Maggie (dog)":
            tags = ["pet"]
        else:
            tags = ["family"]
        entities.append({
            "type": "person",
            "subject": name,
            "description": "",
            "tags": tags,
            "confidence": 0.8,
            "source": source,
        })

    for pattern in PLACE_PATTERNS:
        match = re.search(pattern, text)
        if match is None:
            continue
        name = match.group(0)
        if name == "WI":
            name = "Wisconsin"
        entities.append({
            "type": "place",
            "subject": name,
            "description": "",
            "tags": ["location"],
            "confidence": 0.7,
            "source": source,
        })

    for pattern in PROJECT_PATTERNS:
        match = re.search(pattern, text)
        if match is None:
            continue
        entities.append({
            "type": "project",
            "subject": match.group(0),
            "description": "",
            "tags": ["project"],
            "confidence": 0.7,
            "source": source,
        })

    return entities
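# A minimal usage sketch (illustrative, not executed at import time) — given
# the patterns above, a sentence mentioning two family members and a place
# yields three candidate entities:
#
#   >>> ents = extract_entities_from_text("Matt and Sully drove to Green Bay")
#   >>> sorted(e["subject"] for e in ents)
#   ['Green Bay', 'Matt Hoffmann', 'Sullivan Hoffmann']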
def extract_structured_entities(chunks: list[dict]) -> list[dict]:
    """Extract entities from structured memory chunks that look like key facts.

    Note: `chunks` is currently unused — the seed list below is static,
    sourced from MEMORY.md / IDENTITY.md content.
    """
    entities = []

    # Known persistent entities from MEMORY.md / IDENTITY.md content
    known = [
        {"type": "person", "subject": "Matt Hoffmann", "tags": ["family", "director"], "confidence": 1.0},
        {"type": "person", "subject": "Aundrea Hoffmann", "tags": ["family", "spouse"], "confidence": 1.0},
        {"type": "person", "subject": "Sullivan Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
        {"type": "person", "subject": "Harper Hoffmann", "tags": ["family", "child"], "confidence": 1.0},
        {"type": "person", "subject": "Maggie (dog)", "tags": ["pet"], "confidence": 1.0},
        {"type": "person", "subject": "Socrates", "tags": ["agent", "backend"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Daedalus", "tags": ["agent", "frontend"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Wadsworth", "tags": ["agent", "chief-of-staff"], "confidence": 1.0, "source": "extraction"},
        {"type": "person", "subject": "Midas", "tags": ["agent", "monetization"], "confidence": 1.0, "source": "extraction"},
        {"type": "place", "subject": "Green Bay", "tags": ["location", "home"], "confidence": 1.0},
        {"type": "place", "subject": "Wisconsin", "tags": ["location", "state"], "confidence": 1.0},
        {"type": "project", "subject": "Icarus", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
        {"type": "project", "subject": "HoffDesk", "tags": ["project", "system"], "confidence": 1.0},
        {"type": "project", "subject": "Thoth", "tags": ["project", "knowledge-graph"], "confidence": 1.0},
        {"type": "fact", "subject": "Timezone: America/Chicago (CST)", "tags": ["system", "timezone"], "confidence": 1.0},
        {"type": "fact", "subject": "Location: Green Bay, WI", "tags": ["system", "location"], "confidence": 1.0},
    ]

    for entity in known:
        entity.setdefault("description", "")
        entity.setdefault("source", "extraction")
        entities.append(entity)

    return entities


# ──────────────────────────────────────────────
# Relation inference
# ──────────────────────────────────────────────

KNOWN_RELATIONS = [
    # Family
    ("Matt Hoffmann", "Aundrea Hoffmann", "spouse_of"),
    ("Matt Hoffmann", "Sullivan Hoffmann", "father_of"),
    ("Matt Hoffmann", "Harper Hoffmann", "father_of"),
    ("Aundrea Hoffmann", "Sullivan Hoffmann", "mother_of"),
    ("Aundrea Hoffmann", "Harper Hoffmann", "mother_of"),
    ("Sullivan Hoffmann", "Harper Hoffmann", "sibling_of"),
    # Location
    ("Matt Hoffmann", "Green Bay", "lives_in"),
    ("Aundrea Hoffmann", "Green Bay", "lives_in"),
    ("Sullivan Hoffmann", "Green Bay", "lives_in"),
    ("Harper Hoffmann", "Green Bay", "lives_in"),
    ("Green Bay", "Wisconsin", "located_in"),
    # Agent relations
    ("Wadsworth", "Matt Hoffmann", "reports_to"),
    ("Socrates", "Matt Hoffmann", "reports_to"),
    ("Daedalus", "Matt Hoffmann", "reports_to"),
    ("Midas", "Matt Hoffmann", "reports_to"),
    ("Wadsworth", "Socrates", "manages"),
    ("Wadsworth", "Daedalus", "manages"),
    ("Wadsworth", "Midas", "manages"),
    # Project
    ("Icarus", "Thoth", "builds_on"),
    ("Icarus", "HoffDesk", "builds_on"),
    ("Thoth", "Socrates", "owned_by"),
    ("HoffDesk", "Socrates", "owned_by"),
]
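# Phase 2 of ingest() below reads the main memory DB via
# "SELECT path, text FROM chunks". It therefore assumes a table shaped at
# least like this (a sketch — the real schema may carry more columns):
#
#   CREATE TABLE chunks (
#       path TEXT,   -- source file the chunk was extracted from
#       text TEXT    -- raw chunk content scanned for entity mentions
#   );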
# ──────────────────────────────────────────────
# Main ingestion loop
# ──────────────────────────────────────────────

def ingest():
    """Run one full shadow ingestion pass."""
    init_db()

    # Check current health
    h = health()
    logger.info(f"Pre-ingestion state: {h['database']['entity_count']} entities, "
                f"{h['database']['relation_count']} relations")

    # Phase 1: Inject known structured entities
    logger.info("[Phase 1] Injecting structured entities...")
    known_entities = extract_structured_entities([])
    entity_ids = {}
    for ent in known_entities:
        try:
            eid = add_entity(
                type_=ent["type"],
                subject=ent["subject"],
                description=ent.get("description", ""),
                tags=ent.get("tags", []),
                confidence=ent.get("confidence", 0.8),
                source=ent.get("source", "extraction"),
            )
            entity_ids[ent["subject"]] = eid
            logger.debug(f"  Entity: {ent['subject']} ({ent['type']}) -> {eid}")
        except ValidationError as e:
            logger.warning(f"  Skipping {ent['subject']}: {e}")
    logger.info(f"  → {len(entity_ids)} entities created/updated")

    # Phase 2: Extract from memory chunks
    logger.info("[Phase 2] Scanning memory chunks...")
    try:
        import sqlite3
        conn = sqlite3.connect(MEMORY_DB)
        rows = conn.execute(
            "SELECT path, text FROM chunks ORDER BY path"
        ).fetchall()
        conn.close()
    except Exception as e:
        logger.warning(f"  Cannot access memory DB: {e}")
        rows = []
    logger.info(f"  → {len(rows)} chunks found in memory")

    # Feed each chunk through the pattern extractor; add_entity dedups by
    # normalized subject + type, so repeat mentions across chunks are idempotent.
    mention_count = 0
    for _path, text in rows:
        for ent in extract_entities_from_text(text or ""):
            try:
                eid = add_entity(
                    type_=ent["type"],
                    subject=ent["subject"],
                    description=ent.get("description", ""),
                    tags=ent.get("tags", []),
                    confidence=ent.get("confidence", 0.8),
                    source=ent.get("source", "memory"),
                )
                entity_ids.setdefault(ent["subject"], eid)
                mention_count += 1
            except ValidationError as e:
                logger.debug(f"  Skipping {ent['subject']}: {e}")
    logger.info(f"  → {mention_count} entity mentions ingested from chunks")

    # Phase 3: Establish known relations
    logger.info("[Phase 3] Establishing relations...")
    relation_count = 0
    for src_subj, tgt_subj, rel_type in KNOWN_RELATIONS:
        src_id = entity_ids.get(src_subj)
        tgt_id = entity_ids.get(tgt_subj)
        if src_id and tgt_id:
            try:
                add_relation(
                    source_id=src_id,
                    target_id=tgt_id,
                    relation_type=rel_type,
                    confidence=0.9,
                    source="extraction",
                )
                relation_count += 1
            except Exception as e:  # ValidationError included — same handling
                logger.debug(f"  Relation {src_subj} → {tgt_subj} ({rel_type}): {e}")
    logger.info(f"  → {relation_count} relations created/updated")

    # Rebuild vector index
    _rebuild_index()

    # Report final state
    h2 = health()
    added_entities = h2["database"]["entity_count"] - h["database"]["entity_count"]
    added_relations = h2["database"]["relation_count"] - h["database"]["relation_count"]
    logger.info("─── Ingest complete ───")
    logger.info(f"  Entities:   {h2['database']['entity_count']} ({added_entities:+d})")
    logger.info(f"  Relations:  {h2['database']['relation_count']} ({added_relations:+d})")
    logger.info(f"  Vector idx: {h2['vector_index']['size']} vectors")
    logger.info(f"  Embeddings: {'reachable' if h2['embedding']['reachable'] else 'UNREACHABLE'}")
    logger.info("───")

    return {
        "pre": h,
        "post": h2,
        "entities_added": added_entities,
        "relations_added": added_relations,
    }


if __name__ == "__main__":
    logger.info("═" * 50)
    logger.info("Thoth Shadow Ingest — Starting")
    logger.info("═" * 50)
    ingest()
    sys.exit(0)
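# Example cron wiring (a sketch — the install path and log destination are
# placeholders; the docstring above suggests a 6-12 hour cadence):
#
#   # m h dom mon dow  command
#   0 */6 * * * /usr/bin/python3 /path/to/thoth/shadow_ingest.py >> ~/thoth-shadow.log 2>&1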