#!/usr/bin/env python3
"""Benchmark qwen2.5-coder:7b vs qwen3:8b — fair comparison with /no_think."""
import json
import time
import requests
# Ollama server endpoint (Tailscale address) and the two models under test.
LLM_URL = "http://100.104.147.116:11434"
MODELS = ["qwen2.5-coder:7b", "qwen3:8b"]

# Single-appointment extraction: expects a JSON array of event objects.
APPOINTMENT_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.
Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
Email:
From: greenbay.pediatrics@example.com
Subject: Upcoming Appointment Reminder
Body: Hi Matt, This is a reminder that Sullivan has a well child visit scheduled for April 22 at 10:00 AM at Green Bay Pediatrics. Please bring insurance cards and any vaccination records. The appointment should take about 45 minutes. Thanks!
Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type
"""

# Natural-language command parsing: expects a single JSON intent object.
INTENT_PROMPT = """You are a calendar intent parser. Parse the user's message into a structured intent.
Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
User message: "cancel next Tuesday's speech therapy session"
Return JSON with: type, summary, instance_date, cancel_scope
"""

# Recurring-event extraction: adds is_recurring/recurrence fields to the schema.
RECURRING_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.
Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
Email:
From: therapy.center@example.com
Subject: Recurring Session Schedule
Body: Hi Aundrea, We've got Sullivan's occupational therapy set up. It will be every Tuesday from 4:00 PM to 5:00 PM, starting April 22nd for 10 sessions. Let us know if you need to reschedule any!
Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type, is_recurring (boolean), recurrence (object with frequency, days, interval, count)
"""

# Conflict resolution: expects a JSON object describing how to resolve overlap.
# NOTE(review): the two sample events (10:00-10:45 AM vs 4:00-5:00 PM) do not
# actually overlap — presumably testing how the model handles that; confirm intent.
CONFLICT_PROMPT = """You are a scheduling conflict resolver. Two events overlap:
Event 1: "Well Child Visit (Sullivan)" on 2026-04-22 10:00-10:45 AM at Green Bay Pediatrics
Event 2: "OT Session (Sullivan)" on 2026-04-22 4:00-5:00 PM at Therapy Center
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule
Return JSON with: resolution_type (split/reassign/reschedule), explanation, actions (array of objects with event_index and action)
"""

# (name, prompt) pairs, run in order against each model.
TESTS = [
    ("appointment", APPOINTMENT_PROMPT),
    ("intent", INTENT_PROMPT),
    ("recurring", RECURRING_PROMPT),
    ("conflict", CONFLICT_PROMPT),
]
def call_llm(model, prompt):
    """Send *prompt* to the Ollama /api/generate endpoint for *model*.

    Returns a tuple (response_text, wall_seconds, tokens_per_second,
    eval_count).  Raises requests.HTTPError on a non-2xx response and
    requests.Timeout after 120 seconds.
    """
    # qwen3 needs /no_think to skip thinking tokens
    if "qwen3" in model:
        prompt = prompt.rstrip() + "\n\n/no_think"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,  # one complete response, so timing covers the whole generation
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    start = time.time()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    elapsed = time.time() - start
    # Fail loudly on HTTP errors instead of trying to parse an error body
    # as a generation result (the original silently called resp.json()).
    resp.raise_for_status()
    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    # Ollama reports eval_duration in nanoseconds; guard against a missing
    # or zero duration to avoid ZeroDivisionError.
    eval_duration_ns = data.get("eval_duration", 0)
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
    return text, elapsed, tok_per_sec, eval_count
def is_valid_json(text):
    """Check whether *text* contains parseable JSON.

    LLMs often wrap JSON output in Markdown code fences (``` or ```json);
    strip those first, then try json.loads on the whole string.  If that
    fails, fall back to extracting the outermost [...] array or {...}
    object substring and parsing that.

    Returns (True, json_text) on success, (False, cleaned_text) otherwise.
    """
    import re
    # Strip a leading ``` / ```json fence and a trailing ``` fence.
    # (The original patterns were missing the backticks, so nothing was
    # ever stripped.)
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```\s*$", "", cleaned).strip()
    try:
        json.loads(cleaned)
        return True, cleaned
    except json.JSONDecodeError:
        # Greedy search keeps nested structures intact: match from the
        # first opening bracket to the last closing one.  The original
        # patterns (r"[.]", r"{.}") matched a literal dot and could never
        # extract JSON.
        match = (re.search(r"\[.*\]", cleaned, re.DOTALL)
                 or re.search(r"\{.*\}", cleaned, re.DOTALL))
        if match:
            try:
                json.loads(match.group())
                return True, match.group()
            except json.JSONDecodeError:
                pass
        return False, cleaned
def warm_up(model):
    """Load *model* into server memory with a trivial prompt.

    Keeps the first timed benchmark request from paying the model-load
    cost.  Returns True on success, False if the request failed.
    """
    print(f" Warmup {model}...", end=" ", flush=True)
    try:
        # call_llm already appends /no_think for qwen3 models; passing it
        # here as well (as the original did) sent the directive twice.
        call_llm(model, "Hi")
        print("OK")
        return True
    except Exception as e:
        print(f"FAIL: {e}")
        return False
def main():
    """Benchmark each model on every test prompt and print a comparison report.

    For each model: warm it up, run the four prompts, record latency,
    throughput, token count, and JSON validity.  Then print a per-test
    latency comparison table and a JSON-validity summary, and dump raw
    results to /tmp/qwen3_benchmark_v2.json.
    """
    results = {}  # model name -> list of per-test result dicts, in TESTS order
    for model in MODELS:
        print(f"\n{'='*60}")
        print(f" {model}")
        print(f"{'='*60}")
        # Skip a model entirely if it cannot be loaded/reached; results.get()
        # below tolerates the missing key.
        if not warm_up(model):
            continue
        model_results = []
        for test_name, prompt in TESTS:
            print(f" {test_name:<20}", end=" ", flush=True)
            try:
                text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
                valid, cleaned = is_valid_json(text)
                model_results.append({
                    "test": test_name,
                    "latency_s": round(elapsed, 2),
                    "tok_per_sec": round(tok_per_sec, 1),
                    "eval_count": eval_count,
                    "json_valid": valid,
                    "response_len": len(text),
                })
                status = "✅" if valid else "❌"
                print(f"{status} {elapsed:5.1f}s | {tok_per_sec:5.0f} tok/s | {eval_count:5d} tokens")
            except Exception as e:
                # Record the error so this test's slot still exists and the
                # comparison table's positional indexing stays aligned.
                model_results.append({"test": test_name, "error": str(e)})
                print(f"ERROR: {e}")
        results[model] = model_results
    # Print comparison table
    print(f"\n{'='*65}")
    print(f" COMPARISON TABLE")
    print(f"{'='*65}")
    print(f"{'Test':<15} | {'qwen2.5:7b latency':>18} | {'qwen3:8b latency':>18} | {'Winner':>10}")
    print("-" * 65)
    for i, test_name in enumerate([t[0] for t in TESTS]):
        # Results are matched by position; a model that failed warmup (or
        # errored mid-run) shows up as ERR.
        r25 = results.get(MODELS[0], [])[i] if i < len(results.get(MODELS[0], [])) else None
        r3 = results.get(MODELS[1], [])[i] if i < len(results.get(MODELS[1], [])) else None
        lat25 = f"{r25['latency_s']:.1f}s" if r25 and "latency_s" in r25 else "ERR"
        lat3 = f"{r3['latency_s']:.1f}s" if r3 and "latency_s" in r3 else "ERR"
        if r25 and r3 and "latency_s" in r25 and "latency_s" in r3:
            # Ties go to qwen2.5 (<=).
            winner = "qwen2.5" if r25["latency_s"] <= r3["latency_s"] else "qwen3"
        else:
            winner = "—"
        print(f"{test_name:<15} | {lat25:>18} | {lat3:>18} | {winner:>10}")
    # Quality comparison (JSON validity)
    print(f"\n{'='*65}")
    print(f" JSON VALIDITY")
    print(f"{'='*65}")
    for model in MODELS:
        r = results.get(model, [])
        valid = sum(1 for x in r if x.get("json_valid"))
        print(f" {model:<25} {valid}/{len(r)} valid")
    # Persist raw results for later analysis.
    with open("/tmp/qwen3_benchmark_v2.json", "w") as f:
        json.dump(results, f, indent=2)
# Original read `if name == "main":`, which raises NameError — the dunder
# module attribute is __name__ and the script value is "__main__".
if __name__ == "__main__":
    main()