#!/usr/bin/env python3
# bench_qwen3.py

"""Benchmark qwen2.5-coder:7b vs qwen3:8b — fair comparison with /no_think."""

import json
import time
import requests

# Base URL of the LLM server; call_llm() POSTs to its /api/generate endpoint.
# NOTE(review): hard-coded private address (presumably an Ollama host) — confirm before running elsewhere.
LLM_URL = "http://100.104.147.116:11434"
# Model tags to benchmark, in order. main() labels MODELS[0] as "qwen2.5" and
# MODELS[1] as "qwen3" in its comparison table, so the order matters.
MODELS = ["qwen2.5-coder:7b", "qwen3:8b"]

# Prompt for the "appointment" test: extract a single appointment from a
# reminder email and return it as a JSON array of objects.
APPOINTMENT_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

Email:
From: greenbay.pediatrics@example.com
Subject: Upcoming Appointment Reminder
Body: Hi Matt, This is a reminder that Sullivan has a well child visit scheduled for April 22 at 10:00 AM at Green Bay Pediatrics. Please bring insurance cards and any vaccination records. The appointment should take about 45 minutes. Thanks!

Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type
"""

# Prompt for the "intent" test: parse a free-form user message into a JSON
# object with type, summary, instance_date and cancel_scope.
INTENT_PROMPT = """You are a calendar intent parser. Parse the user's message into a structured intent.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

User message: "cancel next Tuesday's speech therapy session"

Return JSON with: type, summary, instance_date, cancel_scope
"""

# Prompt for the "recurring" test: extract a recurring appointment, including
# an is_recurring flag and a recurrence object.
# NOTE(review): with today given as 2026-04-17, April 22 falls on a Wednesday,
# while the email says "every Tuesday ... starting April 22nd" — confirm
# whether this mismatch is intentional.
RECURRING_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

Email:
From: therapy.center@example.com
Subject: Recurring Session Schedule
Body: Hi Aundrea, We've got Sullivan's occupational therapy set up. It will be every Tuesday from 4:00 PM to 5:00 PM, starting April 22nd for 10 sessions. Let us know if you need to reschedule any!

Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type, is_recurring (boolean), recurrence (object with frequency, days, interval, count)
"""

# Prompt for the "conflict" test: ask the model for a JSON resolution plan
# for two calendar events.
# NOTE(review): the prompt states that the two events overlap, but as written
# Event 1 is 10:00-10:45 AM and Event 2 is 4:00-5:00 PM on the same day, so
# they do not overlap — confirm whether this is intentional.
CONFLICT_PROMPT = """You are a scheduling conflict resolver. Two events overlap:

Event 1: "Well Child Visit (Sullivan)" on 2026-04-22 10:00-10:45 AM at Green Bay Pediatrics
Event 2: "OT Session (Sullivan)" on 2026-04-22 4:00-5:00 PM at Therapy Center

Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule

Return JSON with: resolution_type (split/reassign/reschedule), explanation, actions (array of objects with event_index and action)
"""

# (test name, prompt) pairs, run in this order for every model. main() pairs
# the two models' results by position in this list when building its table.
TESTS = [
("appointment", APPOINTMENT_PROMPT),
("intent", INTENT_PROMPT),
("recurring", RECURRING_PROMPT),
("conflict", CONFLICT_PROMPT),
]

def call_llm(model, prompt):
    """Send one non-streaming generation request and return the reply with timing stats.

    Args:
        model: Model tag sent as the request's ``"model"`` field.
        prompt: Prompt text. When the tag contains ``"qwen3"``, a trailing
            ``/no_think`` directive is appended unless one is already there.

    Returns:
        A tuple ``(text, elapsed, tok_per_sec, eval_count)`` where ``text`` is
        the ``"response"`` field of the reply (``""`` if absent), ``elapsed``
        is the round-trip time of the HTTP call in seconds, ``tok_per_sec`` is
        ``eval_count`` divided by the reported ``eval_duration`` (converted
        from nanoseconds; 0 when no duration is reported), and ``eval_count``
        is the reply's ``"eval_count"`` field (0 if absent).

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status. Without the status check an error reply would be
            returned as empty text and scored as "invalid JSON" by the caller.
    """
    # qwen3 needs /no_think to skip thinking tokens; do not stack the
    # directive if the caller already added it.
    if "qwen3" in model and not prompt.rstrip().endswith("/no_think"):
        prompt = prompt.rstrip() + "\n\n/no_think"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments during the request.
    start = time.perf_counter()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    elapsed = time.perf_counter() - start
    resp.raise_for_status()

    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    eval_duration_ns = data.get("eval_duration", 0)
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0

    return text, elapsed, tok_per_sec, eval_count

def is_valid_json(text):
    """Check whether a model reply contains parseable JSON.

    The reply is stripped of surrounding whitespace and of a Markdown code
    fence (an opening fence with an optional ``json`` tag, and a closing
    fence). If the remaining text parses as JSON it is accepted as-is.
    Otherwise the widest ``[...]`` span and then the widest ``{...}`` span are
    tried in turn, so JSON embedded in surrounding prose is still recognized.

    Args:
        text: Raw reply text from the model.

    Returns:
        A tuple ``(valid, candidate)``. When ``valid`` is True, ``candidate``
        is the exact substring that parsed. When it is False, ``candidate`` is
        the fence-stripped reply.
    """
    import re

    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
    try:
        json.loads(cleaned)
        return True, cleaned
    except json.JSONDecodeError:
        pass

    # Fall back to the widest bracketed span. Each pattern is tried on its own
    # so that a non-parsing array span (e.g. two arrays inside an object) does
    # not prevent the enclosing object from being recognized.
    for pattern in (r"\[.*\]", r"\{.*\}"):
        match = re.search(pattern, cleaned, re.DOTALL)
        if match is None:
            continue
        try:
            json.loads(match.group())
        except json.JSONDecodeError:
            continue
        return True, match.group()

    return False, cleaned

def warm_up(model):
    """Send a throwaway prompt to a model before it is benchmarked.

    Prints a one-line status. Any exception from the request is caught and
    reported rather than propagated, so the caller can skip the model.

    Args:
        model: Model tag to warm up.

    Returns:
        True if the request completed without raising, False otherwise.
    """
    print(f" Warmup {model}...", end=" ", flush=True)
    try:
        # call_llm appends /no_think for qwen3 models itself; adding it here
        # as well would send the directive twice.
        call_llm(model, "Hi")
        print("OK")
        return True
    except Exception as e:
        print(f"FAIL: {e}")
        return False

def main():
    """Benchmark every model in MODELS on every prompt in TESTS.

    Prints one timing line per test, then a latency comparison table between
    MODELS[0] and MODELS[1], then a JSON-validity tally per model, and finally
    dumps the raw per-model results to /tmp/qwen3_benchmark_v2.json. A model
    whose warm-up fails is skipped and gets no entry in the results.
    """
    all_runs = {}

    for model in MODELS:
        print(f"\n{'='*60}")
        print(f"  {model}")
        print(f"{'='*60}")

        if warm_up(model):
            runs = []
            for test_name, prompt in TESTS:
                print(f"  {test_name:<20}", end=" ", flush=True)
                try:
                    text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
                    valid, _ = is_valid_json(text)
                    record = {
                        "test": test_name,
                        "latency_s": round(elapsed, 2),
                        "tok_per_sec": round(tok_per_sec, 1),
                        "eval_count": eval_count,
                        "json_valid": valid,
                        "response_len": len(text),
                    }
                    runs.append(record)
                    if valid:
                        status = "✅"
                    else:
                        status = "❌"
                    print(f"{status} {elapsed:5.1f}s | {tok_per_sec:5.0f} tok/s | {eval_count:5d} tokens")
                except Exception as e:
                    # A failed call is recorded so the test still occupies its
                    # position in the per-model list.
                    runs.append({"test": test_name, "error": str(e)})
                    print(f"ERROR: {e}")

            all_runs[model] = runs

    # Print comparison table
    print(f"\n{'='*65}")
    print(f"  COMPARISON TABLE")
    print(f"{'='*65}")
    print(f"{'Test':<15} | {'qwen2.5:7b latency':>18} | {'qwen3:8b latency':>18} | {'Winner':>10}")
    print("-" * 65)

    runs_first = all_runs.get(MODELS[0], [])
    runs_second = all_runs.get(MODELS[1], [])
    for idx, (test_name, _prompt) in enumerate(TESTS):
        # Results are paired by position; a skipped model has no rows at all.
        row_first = runs_first[idx] if idx < len(runs_first) else None
        row_second = runs_second[idx] if idx < len(runs_second) else None

        timed_first = bool(row_first) and "latency_s" in row_first
        timed_second = bool(row_second) and "latency_s" in row_second

        lat25 = f"{row_first['latency_s']:.1f}s" if timed_first else "ERR"
        lat3 = f"{row_second['latency_s']:.1f}s" if timed_second else "ERR"

        if not (timed_first and timed_second):
            winner = "—"
        elif row_first["latency_s"] <= row_second["latency_s"]:
            winner = "qwen2.5"
        else:
            winner = "qwen3"

        print(f"{test_name:<15} | {lat25:>18} | {lat3:>18} | {winner:>10}")

    # Quality comparison (JSON validity)
    print(f"\n{'='*65}")
    print(f"  JSON VALIDITY")
    print(f"{'='*65}")
    for model in MODELS:
        r = all_runs.get(model, [])
        valid = len([row for row in r if row.get("json_valid")])
        print(f"  {model:<25} {valid}/{len(r)} valid")

    with open("/tmp/qwen3_benchmark_v2.json", "w") as out:
        json.dump(all_runs, out, indent=2)

# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()