#!/usr/bin/env python3
"""Benchmark qwen2.5-coder:7b vs qwen3:8b — fair comparison with /no_think."""
import json
import re
import time

import requests

LLM_URL = "http://100.104.147.116:11434"
MODELS = ["qwen2.5-coder:7b", "qwen3:8b"]

APPOINTMENT_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

Email:
From: greenbay.pediatrics@example.com
Subject: Upcoming Appointment Reminder
Body:
Hi Matt, This is a reminder that Sullivan has a well child visit scheduled for April 22 at 10:00 AM at Green Bay Pediatrics. Please bring insurance cards and any vaccination records. The appointment should take about 45 minutes. Thanks!

Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type
"""

INTENT_PROMPT = """You are a calendar intent parser. Parse the user's message into a structured intent.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

User message: "cancel next Tuesday's speech therapy session"

Return JSON with: type, summary, instance_date, cancel_scope
"""

RECURRING_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

Email:
From: therapy.center@example.com
Subject: Recurring Session Schedule
Body:
Hi Aundrea, We've got Sullivan's occupational therapy set up. It will be every Tuesday from 4:00 PM to 5:00 PM, starting April 22nd for 10 sessions. Let us know if you need to reschedule any!

Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type, is_recurring (boolean), recurrence (object with frequency, days, interval, count)
"""

CONFLICT_PROMPT = """You are a scheduling conflict resolver. Two events overlap:

Event 1: "Well Child Visit (Sullivan)" on 2026-04-22 10:00-10:45 AM at Green Bay Pediatrics
Event 2: "OT Session (Sullivan)" on 2026-04-22 4:00-5:00 PM at Therapy Center

Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule

Return JSON with: resolution_type (split/reassign/reschedule), explanation, actions (array of objects with event_index and action)
"""

# (name, prompt) pairs; benchmark order also fixes the row order in the tables.
TESTS = [
    ("appointment", APPOINTMENT_PROMPT),
    ("intent", INTENT_PROMPT),
    ("recurring", RECURRING_PROMPT),
    ("conflict", CONFLICT_PROMPT),
]


def call_llm(model, prompt):
    """Send *prompt* to the Ollama /api/generate endpoint.

    Returns (response_text, wall_latency_seconds, tokens_per_second, eval_count).
    Raises requests.HTTPError on a non-2xx response.
    """
    # qwen3 emits "thinking" tokens by default; /no_think suppresses them so
    # both models are timed on comparable output.
    if "qwen3" in model:
        prompt = prompt.rstrip() + "\n\n/no_think"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    start = time.time()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    elapsed = time.time() - start
    # Fail loudly on HTTP errors instead of producing a confusing empty result.
    resp.raise_for_status()
    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    eval_duration_ns = data.get("eval_duration", 0)
    # Guard against a zero duration (e.g. fully cached response).
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
    return text, elapsed, tok_per_sec, eval_count


def is_valid_json(text):
    """Check whether *text* contains parseable JSON.

    Strips Markdown code fences first; if the whole string still fails to
    parse, falls back to the outermost ``[...]`` or ``{...}`` span.
    Returns (is_valid, candidate_text).
    """
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
    try:
        json.loads(cleaned)
        return True, cleaned
    except json.JSONDecodeError:
        # Models sometimes wrap JSON in prose; try the bracketed span alone.
        match = re.search(r"\[.*\]", cleaned, re.DOTALL) or re.search(r"\{.*\}", cleaned, re.DOTALL)
        if match:
            try:
                json.loads(match.group())
                return True, match.group()
            except json.JSONDecodeError:
                pass
    return False, cleaned


def warm_up(model):
    """Load *model* into memory with a trivial prompt; return True on success."""
    print(f"  Warmup {model}...", end=" ", flush=True)
    try:
        # call_llm already appends /no_think for qwen3 models; appending it
        # here as well would send the directive twice.
        call_llm(model, "Hi")
        print("OK")
        return True
    except Exception as e:
        print(f"FAIL: {e}")
        return False


def _run_model(model):
    """Run every test against *model* and return a list of result dicts."""
    model_results = []
    for test_name, prompt in TESTS:
        print(f"  {test_name:<20}", end=" ", flush=True)
        try:
            text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
            valid, cleaned = is_valid_json(text)
            model_results.append({
                "test": test_name,
                "latency_s": round(elapsed, 2),
                "tok_per_sec": round(tok_per_sec, 1),
                "eval_count": eval_count,
                "json_valid": valid,
                "response_len": len(text),
            })
            status = "✅" if valid else "❌"
            print(f"{status} {elapsed:5.1f}s | {tok_per_sec:5.0f} tok/s | {eval_count:5d} tokens")
        except Exception as e:
            # Record the failure so the summary tables stay index-aligned.
            model_results.append({"test": test_name, "error": str(e)})
            print(f"ERROR: {e}")
    return model_results


def _print_comparison(results):
    """Print the per-test latency table comparing the two models."""
    print(f"\n{'='*65}")
    print(" COMPARISON TABLE")
    print(f"{'='*65}")
    print(f"{'Test':<15} | {'qwen2.5:7b latency':>18} | {'qwen3:8b latency':>18} | {'Winner':>10}")
    print("-" * 65)
    for i, test_name in enumerate([t[0] for t in TESTS]):
        r25 = results.get(MODELS[0], [])[i] if i < len(results.get(MODELS[0], [])) else None
        r3 = results.get(MODELS[1], [])[i] if i < len(results.get(MODELS[1], [])) else None
        lat25 = f"{r25['latency_s']:.1f}s" if r25 and "latency_s" in r25 else "ERR"
        lat3 = f"{r3['latency_s']:.1f}s" if r3 and "latency_s" in r3 else "ERR"
        if r25 and r3 and "latency_s" in r25 and "latency_s" in r3:
            winner = "qwen2.5" if r25["latency_s"] <= r3["latency_s"] else "qwen3"
        else:
            winner = "—"
        print(f"{test_name:<15} | {lat25:>18} | {lat3:>18} | {winner:>10}")


def _print_validity(results):
    """Print how many responses per model were valid JSON."""
    print(f"\n{'='*65}")
    print(" JSON VALIDITY")
    print(f"{'='*65}")
    for model in MODELS:
        r = results.get(model, [])
        valid = sum(1 for x in r if x.get("json_valid"))
        print(f"  {model:<25} {valid}/{len(r)} valid")


def main():
    """Benchmark each model, print summary tables, and dump raw results."""
    results = {}
    for model in MODELS:
        print(f"\n{'='*60}")
        print(f"  {model}")
        print(f"{'='*60}")
        if not warm_up(model):
            continue
        results[model] = _run_model(model)

    _print_comparison(results)
    _print_validity(results)

    with open("/tmp/qwen3_benchmark_v2.json", "w") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    main()