#!/usr/bin/env python3
"""Benchmark qwen2.5-coder:7b vs phi4:14b on conflict resolution and rejection intent.

Tests structured reasoning tasks that the family assistant pipeline routes
through LLM_MODEL. Measures: latency, JSON validity, reasoning quality.

Usage:
    python benchmark_reasoning.py                  # Test default models on the Gaming PC
    python benchmark_reasoning.py --models qwen2.5-coder:7b phi4:14b
    python benchmark_reasoning.py --url http://127.0.0.1:11434/v1/chat/completions --models glm-5.1:cloud gemma4:31b-cloud
"""

import argparse
import json
import re
import time

import requests

# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------

CONFLICT_RESOLVE_PROMPT = """You are a family scheduling assistant. Given a conflict between two events, suggest resolution options.

Family members: Matt, Aundrea, Sullivan (child), Harper (child), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule

Return JSON array of options:
[{"option": N, "action": "SPLIT|REASSIGN|RESCHEDULE", "rationale": "...", "affects_event": 1|2, "affects_who": "name"}]"""

REJECTION_INTENT_PROMPT = """You are a family scheduling assistant. Parse the user's rejection message into a structured rule.

Return JSON:
{"pattern": "exact event summary to match", "reason": "why rejected", "scope": "all|event|newsletter"}

Scope rules:
- "all" = skip forever (user said "always", "forever", "never again")
- "event" = one-time skip, calendar only (default)
- "newsletter" = skip extraction only

Pattern must be the exact event summary, not a generalized keyword."""

TEST_CASES = [
    {
        "name": "conflict_simple",
        "system": CONFLICT_RESOLVE_PROMPT,
        "user": 'Conflict: "Maggie grooming" Sat 8:00-9:00 AM at Golrusk vs "First Communion Practice" Sat 9:00-10:30 AM at church. Both need a parent.',
    },
    {
        "name": "conflict_tough",
        "system": CONFLICT_RESOLVE_PROMPT,
        "user": 'Conflict: "Well Child Visit - Sullivan" Fri 10:00-11:00 AM at Aurora Bay Care vs "Sullivan School Field Trip" Fri 9:00 AM-1:00 PM at Botanical Gardens. Both require a parent to attend.',
    },
    {
        "name": "rejection_simple",
        "system": REJECTION_INTENT_PROMPT,
        "user": "We don't need that mass, ignore it forever",
    },
    {
        "name": "rejection_ambiguous",
        "system": REJECTION_INTENT_PROMPT,
        "user": "Skip the grooming appointment this week, Aundrea can handle it next time",
    },
]
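
# Illustrative responses for two of the cases above (hypothetical model output,
# shown only to document the shapes the scoring functions below reward):
#   conflict_simple  -> [{"option": 1, "action": "REASSIGN",
#                         "rationale": "Matt takes Maggie to Golrusk; Aundrea covers practice",
#                         "affects_event": 1, "affects_who": "Matt"}]
#   rejection_simple -> {"pattern": "Mass", "reason": "family does not attend", "scope": "all"}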

# ---------------------------------------------------------------------------
# Benchmark logic
# ---------------------------------------------------------------------------

def call_llm(url, model, system, user, timeout=120):
    """Call LLM and return (response_text, latency_seconds, error)."""
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "stream": False,
        # Top-level temperature/max_tokens are read by the OpenAI-compatible
        # /v1/chat/completions endpoint; Ollama's native /api/chat reads the
        # "options" block instead, so both are sent.
        "temperature": 0,
        "max_tokens": 2048,
        "options": {"temperature": 0, "num_predict": 2048},
    }
    start = time.time()
    try:
        resp = requests.post(url, json=payload, timeout=timeout)
        elapsed = time.time() - start
        if resp.status_code != 200:
            return None, elapsed, f"HTTP {resp.status_code}: {resp.text[:200]}"
        data = resp.json()
        # Accept both Ollama-native ("message") and OpenAI-style ("choices") response shapes.
        content = (
            data.get("message", {}).get("content", "")
            or data.get("choices", [{}])[0].get("message", {}).get("content", "")
        )
        return content.strip(), elapsed, None
    except requests.exceptions.Timeout:
        elapsed = time.time() - start
        return None, elapsed, "TIMEOUT"
    except Exception as e:
        elapsed = time.time() - start
        return None, elapsed, str(e)[:200]


def validate_json(text):
    """Try to extract valid JSON from LLM output. Returns (json_obj, valid_bool)."""
    if not text:
        return None, False
    # Try direct parse
    try:
        return json.loads(text), True
    except json.JSONDecodeError:
        pass
    # Try extracting from a markdown code block
    m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1).strip()), True
        except json.JSONDecodeError:
            pass
    # Fall back to brace/bracket matching, starting from whichever of "{" or "["
    # appears first so a JSON array is not truncated to its first object.
    pairs = sorted(
        [("{", "}"), ("[", "]")],
        key=lambda p: text.find(p[0]) if text.find(p[0]) >= 0 else len(text),
    )
    for start_char, end_char in pairs:
        idx = text.find(start_char)
        if idx >= 0:
            depth = 0
            for i in range(idx, len(text)):
                if text[i] == start_char:
                    depth += 1
                elif text[i] == end_char:
                    depth -= 1
                    if depth == 0:
                        try:
                            return json.loads(text[idx:i + 1]), True
                        except json.JSONDecodeError:
                            break
    return None, False
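
# A rough sense of what validate_json tolerates (illustrative inputs, not taken
# from any real model run): raw JSON, fenced ```json blocks, and JSON embedded
# in surrounding prose all parse; plain refusals do not.
#   validate_json('{"scope": "all"}')                        -> ({"scope": "all"}, True)
#   validate_json('Sure!\n```json\n{"scope": "all"}\n```')   -> ({"scope": "all"}, True)
#   validate_json("I cannot help with that.")                -> (None, False)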

def score_conflict(parsed):
    """Score conflict resolution quality (0-3)."""
    if not isinstance(parsed, list):
        return 0
    score = 0
    for opt in parsed:
        if not isinstance(opt, dict):
            continue
        if "option" in opt and "action" in opt:
            score += 1  # Has structure
        if opt.get("action") in ("SPLIT", "REASSIGN", "RESCHEDULE"):
            score += 1  # Valid action
        if "rationale" in opt and len(opt["rationale"]) > 10:
            score += 1  # Has reasoning
    return min(score, 3)


def score_rejection(parsed):
    """Score rejection intent quality (0-3)."""
    if not isinstance(parsed, dict):
        return 0
    score = 0
    if "pattern" in parsed and len(parsed["pattern"]) > 3:
        score += 1  # Has specific pattern
    if "reason" in parsed:
        score += 1  # Has reason
    if "scope" in parsed and parsed["scope"] in ("all", "event", "newsletter"):
        score += 1  # Valid scope
    return score


def run_benchmark(url, models, timeout=120):
    results = []
    for model in models:
        print(f"\n{'=' * 60}")
        print(f"Model: {model}")
        print(f"{'=' * 60}")
        for tc in TEST_CASES:
            name = tc["name"]
            print(f"\n  {name}...", end=" ", flush=True)
            text, latency, error = call_llm(url, model, tc["system"], tc["user"], timeout)
            if error:
                print(f"❌ {error} ({latency:.1f}s)")
                results.append({"model": model, "test": name, "latency": latency,
                                "valid": False, "score": 0, "error": error})
                continue
            parsed, valid = validate_json(text)
            if "conflict" in name:
                score = score_conflict(parsed)
            else:
                score = score_rejection(parsed)
            status = "✅" if valid else "⚠️"
            print(f"{status} {latency:.1f}s | score {score}/3 | valid={valid}")
            if valid:
                # Show a snippet of the parsed output
                snippet = json.dumps(parsed, indent=2)[:200]
                print(f"    {snippet}")
            else:
                print(f"    Raw: {text[:150]}")
            results.append({"model": model, "test": name, "latency": round(latency, 1),
                            "valid": valid, "score": score, "error": None})
    return results


def print_summary(results):
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    models = sorted(set(r["model"] for r in results))
    # Header
    header = f"{'Model':<25} {'Valid':>5} {'Score':>5} {'Avg(s)':>7} {'Errors':>6}"
    print(header)
    print("-" * len(header))
    for model in models:
        model_results = [r for r in results if r["model"] == model]
        valid_count = sum(1 for r in model_results if r["valid"])
        total_score = sum(r["score"] for r in model_results)
        avg_latency = sum(r["latency"] for r in model_results) / len(model_results) if model_results else 0
        errors = sum(1 for r in model_results if r["error"])
        valid_col = f"{valid_count}/{len(model_results)}"
        score_col = f"{total_score}/{len(model_results) * 3}"
        print(f"{model:<25} {valid_col:>5} {score_col:>5} {avg_latency:>6.1f}s {errors:>6}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark reasoning models for family assistant")
    parser.add_argument("--url", default="http://100.104.147.116:11434/v1/chat/completions",
                        help="LLM endpoint URL")
    parser.add_argument("--models", nargs="+", default=["qwen2.5-coder:7b", "phi4:14b"],
                        help="Models to benchmark")
    parser.add_argument("--timeout", type=int, default=120, help="Timeout per request (seconds)")
    args = parser.parse_args()

    print(f"Endpoint: {args.url}")
    print(f"Models: {args.models}")
    print(f"Timeout: {args.timeout}s")

    results = run_benchmark(args.url, args.models, args.timeout)
    print_summary(results)

    # Save results
    outfile = "/home/hoffmann_admin/.openclaw/workspace/memory/benchmark-reasoning-results.json"
    with open(outfile, "w") as f:
        json.dump({"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%Z"),
                   "url": args.url, "results": results}, f, indent=2)
    print(f"\nResults saved to {outfile}")
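
# Shape of the saved results file (illustrative values only, not a real run):
# {
#   "timestamp": "2025-01-01T12:00:00CST",
#   "url": "http://100.104.147.116:11434/v1/chat/completions",
#   "results": [
#     {"model": "qwen2.5-coder:7b", "test": "conflict_simple",
#      "latency": 4.2, "valid": true, "score": 3, "error": null}
#   ]
# }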