#!/usr/bin/env python3
"""Benchmark deepseek-r1 vs phi4:14b on reasoning tasks.

Runs four reasoning-heavy prompts against each model on a local Ollama
server (LLM_URL) and writes results to /tmp/deepseek_benchmark.json.
"""
import json
import re
import time

import requests

LLM_URL = "http://100.104.147.116:11434"
REASONING_MODELS = ["phi4:14b", "deepseek-r1:8b", "deepseek-r1:14b"]

# --- Reasoning-heavy prompts (not just JSON extraction) ---

CONFLICT_PROMPT = """You are a family scheduling assistant. Two events conflict:

Event A: "Sullivan well child visit" on April 22, 10:00-10:45 AM at Green Bay Pediatrics
Event B: "Sullivan OT session" on April 22, 10:30-11:15 AM at Therapy Center (45 min away)

Family: Matt (dad), Aundrea (mom), Sullivan (9, needs adult), Harper (7, needs adult), Maggie (dog)
Priority rules: medical > therapy > personal; kids need adult supervision; 45-min drive between locations

Explain the conflict, then recommend a specific resolution (SPLIT, REASSIGN, or RESCHEDULE) with reasoning.
Return JSON: {resolution_type, explanation, actions:[{event, new_time, who_attends}]}
"""

REJECTION_INTENT_PROMPT = """Parse the user's intent about rejecting/cancelling recurring items.

User message: "we don't need that mass, ignore it forever"
Context: The user is referring to "First Communion Mass (Sullivan)" on May 3, 2026.

Determine:
- Is this a rejection rule to save for future emails?
- Is this a one-time cancellation?
- What scope: "all" (skip extraction + calendar), "event" (just skip calendar), or "newsletter" (just skip extraction)?
- What pattern should match future occurrences?

Return JSON with: intent_type, scope, pattern, is_persistent_rule, reasoning
"""

NEWSLETTER_DEDUP_PROMPT = """Determine if these two newsletter items are duplicates:

Item A: {"title": "Summer Camp Registration Opens", "source": "St. Bernard School", "deadline": "May 1", "action_required": "register online"}
Item B: {"title": "Register for Summer Camp Now", "source": "St. Bernard Newsletter", "deadline": "May 1st", "action_required": "signup at signupgenius.com"}

Consider: same event described differently? Same deadline? Same required action?
Return JSON: {is_duplicate (boolean), confidence (0-1), reasoning}
"""

SCHEDULING_PUZZLE_PROMPT = """Matt needs to schedule 3 activities for Sullivan this week:
- Piano lesson: needs 30 min, any weekday 3-6 PM, preferably Tuesday or Thursday
- Speech therapy: needs 45 min, only available Mon/Wed/Fri mornings
- Dentist: needs 60 min, any day except Friday, before 4 PM

Aundrea works Mon-Thu 9-5. Matt is flexible except Thu morning. Find valid slots.
Return JSON: {piano_day_time, speech_day_time, dentist_day_time, reasoning}
"""

TESTS = [
    ("conflict_resolve", CONFLICT_PROMPT),
    ("rejection_intent", REJECTION_INTENT_PROMPT),
    ("newsletter_dedup", NEWSLETTER_DEDUP_PROMPT),
    ("scheduling_puzzle", SCHEDULING_PUZZLE_PROMPT),
]


def call_llm(model, prompt):
    """Send one non-streaming generate request to Ollama and time it."""
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    start = time.time()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    elapsed = time.time() - start
    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    # Ollama reports eval_duration in nanoseconds.
    eval_duration_ns = data.get("eval_duration", 0)
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
    return text, elapsed, tok_per_sec, eval_count


def has_reasoning(text):
    """Crude substring heuristic: does the response show chain-of-thought reasoning?"""
    # "think" also matches "thinking" and deepseek-r1's <think> blocks.
    markers = ["think", "reason", "because", "therefore",
               "first", "second", "finally", "step"]
    return any(m in text.lower() for m in markers)


def is_valid_json(text):
    """Accept bare JSON, fenced JSON, or JSON embedded in surrounding prose."""
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
    try:
        json.loads(cleaned)
        return True
    except json.JSONDecodeError:
        # Fall back to the first bracketed span that parses.
        match = re.search(r"\[.*\]", cleaned, re.DOTALL) or re.search(r"\{.*\}", cleaned, re.DOTALL)
        if match:
            try:
                json.loads(match.group())
                return True
            except json.JSONDecodeError:
                pass
    return False


def warm_up(model):
    """Load the model into memory so the first timed run isn't penalized."""
    print(f"  Warmup {model}...", end=" ", flush=True)
    try:
        call_llm(model, "Hi")
        print("OK")
        return True
    except Exception as e:
        print(f"FAIL: {e}")
        return False


def main():
    results = {}
    for model in REASONING_MODELS:
        print(f"\n{'=' * 60}")
        print(f"  {model}")
        print(f"{'=' * 60}")
        if not warm_up(model):
            continue
        model_results = []
        for test_name, prompt in TESTS:
            print(f"  {test_name:<20}", end=" ", flush=True)
            try:
                text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
                valid_json = is_valid_json(text)
                has_reason = has_reasoning(text)
                model_results.append({
                    "test": test_name,
                    "latency_s": round(elapsed, 2),
                    "tok_per_sec": round(tok_per_sec, 1),
                    "eval_count": eval_count,
                    "json_valid": valid_json,
                    "has_reasoning": has_reason,
                    "response_len": len(text),
                })
                status = "✅" if valid_json else "❌"
                reason_marker = "🧠" if has_reason else " "
                print(f"{status}{reason_marker} {elapsed:5.1f}s | {tok_per_sec:5.0f}/s | {eval_count:5d}tok")
            except Exception as e:
                model_results.append({"test": test_name, "error": str(e)})
                print(f"ERROR: {e}")
        results[model] = model_results

    # Summary table
    print(f"\n{'=' * 75}")
    print("  COMPARISON: Reasoning Tasks")
    print(f"{'=' * 75}")
    print(f"{'Test':<18} | {'phi4:14b':<17} | {'r1:8b':<17} | {'r1:14b':<17}")
    print("-" * 75)
    for i, test_name in enumerate([t[0] for t in TESTS]):
        row = f"{test_name:<18}"
        for model in REASONING_MODELS:
            r = results.get(model, [])
            if i < len(r) and "latency_s" in r[i]:
                d = r[i]
                lat = f"{d['latency_s']:.1f}s"
                qual = "✅" if d["json_valid"] else "❌"
                think = "🧠" if d.get("has_reasoning") else " "
                row += f" | {qual}{think} {lat:>8}"
            else:
                row += f" | {'ERR':<17}"
        print(row)

    # Quality comparison
    print(f"\n{'=' * 75}")
    print("  QUALITY SCORES (JSON + Reasoning)")
    print(f"{'=' * 75}")
    for model in REASONING_MODELS:
        r = results.get(model, [])
        json_ok = sum(1 for x in r if x.get("json_valid"))
        reason_ok = sum(1 for x in r if x.get("has_reasoning"))
        print(f"  {model:<25} JSON: {json_ok}/{len(r)} | Reasoning: {reason_ok}/{len(r)}")

    # Recommendation: reward tests that are both valid JSON and show
    # reasoning, with a small penalty for total latency.
    print(f"\n{'=' * 75}")
    print("  RECOMMENDATION")
    print(f"{'=' * 75}")
    # Start at -inf so a model with a net-negative score (slow but present)
    # can still be selected over having no winner at all.
    best_score = float("-inf")
    best_model = None
    for model in REASONING_MODELS:
        r = results.get(model, [])
        if not r:
            continue
        quality = sum(1 for x in r if x.get("json_valid") and x.get("has_reasoning"))
        latency_penalty = sum(x["latency_s"] for x in r if "latency_s" in x) / 10
        score = quality - latency_penalty
        if score > best_score:
            best_score = score
            best_model = model
    if best_model:
        print(f"  Best for local reasoning: {best_model}")
        print("  (Tradeoff: Keep phi4:14b for speed, switch to deepseek-r1:14b for quality)")
    else:
        print("  No clear winner; phi4:14b remains fastest")

    with open("/tmp/deepseek_benchmark.json", "w") as f:
        json.dump(results, f, indent=2)
    print("\n  Saved to /tmp/deepseek_benchmark.json")


if __name__ == "__main__":
    main()
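
# Shape of the saved results file (a sketch: keys mirror the dicts built in
# main(); the values below are illustrative placeholders, not measurements):
#
#   {
#     "phi4:14b": [
#       {"test": "conflict_resolve", "latency_s": 12.3, "tok_per_sec": 45.0,
#        "eval_count": 512, "json_valid": true, "has_reasoning": true,
#        "response_len": 1800},
#       ...
#     ],
#     ...
#   }
#
# Tests that raised an exception appear as {"test": ..., "error": ...} instead.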