#!/usr/bin/env python3
"""Benchmark qwen2.5-coder:7b vs qwen3:8b — fair comparison with /no_think."""
import json
import re
import time

import requests

LLM_URL = "http://100.104.147.116:11434"
MODELS = ["qwen2.5-coder:7b", "qwen3:8b"]

APPOINTMENT_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

Email:
From: greenbay.pediatrics@example.com
Subject: Upcoming Appointment Reminder
Body:
Hi Matt, This is a reminder that Sullivan has a well child visit scheduled for April 22 at 10:00 AM at Green Bay Pediatrics. Please bring insurance cards and any vaccination records. The appointment should take about 45 minutes. Thanks!

Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type
"""

INTENT_PROMPT = """You are a calendar intent parser. Parse the user's message into a structured intent.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

User message: "cancel next Tuesday's speech therapy session"

Return JSON with: type, summary, instance_date, cancel_scope
"""

RECURRING_PROMPT = """You are a calendar assistant. Extract appointment details from the email below.

Today's date: 2026-04-17
Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)

Email:
From: therapy.center@example.com
Subject: Recurring Session Schedule
Body:
Hi Aundrea, We've got Sullivan's occupational therapy set up. It will be every Tuesday from 4:00 PM to 5:00 PM, starting April 22nd for 10 sessions. Let us know if you need to reschedule any!

Return JSON array with objects containing: who, what, when (ISO datetime with timezone America/Chicago), duration_minutes, where, type, is_recurring (boolean), recurrence (object with frequency, days, interval, count)
"""

CONFLICT_PROMPT = """You are a scheduling conflict resolver. Two events overlap:

Event 1: "Well Child Visit (Sullivan)" on 2026-04-22 10:00-10:45 AM at Green Bay Pediatrics
Event 2: "OT Session (Sullivan)" on 2026-04-22 4:00-5:00 PM at Therapy Center

Family members: Sullivan (child), Harper (child), Matt (dad), Aundrea (mom), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule

Return JSON with: resolution_type (split/reassign/reschedule), explanation, actions (array of objects with event_index and action)
"""

# (name, prompt) pairs; benchmark order also fixes the row order in the tables.
TESTS = [
    ("appointment", APPOINTMENT_PROMPT),
    ("intent", INTENT_PROMPT),
    ("recurring", RECURRING_PROMPT),
    ("conflict", CONFLICT_PROMPT),
]


def call_llm(model, prompt):
    """Send *prompt* to the Ollama /api/generate endpoint.

    Returns (response_text, wall_latency_seconds, tokens_per_second, eval_count).
    Raises requests.HTTPError on a non-2xx response.
    """
    # qwen3 emits "thinking" tokens by default; /no_think suppresses them so
    # both models are timed on comparable output.
    if "qwen3" in model:
        prompt = prompt.rstrip() + "\n\n/no_think"
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    start = time.time()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    elapsed = time.time() - start
    # Fail loudly on HTTP errors instead of producing a confusing empty result.
    resp.raise_for_status()
    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    eval_duration_ns = data.get("eval_duration", 0)
    # Guard against a zero duration (e.g. fully cached response).
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
    return text, elapsed, tok_per_sec, eval_count


def is_valid_json(text):
    """Check whether *text* contains parseable JSON.

    Strips Markdown code fences first; if the whole string still fails to
    parse, falls back to the outermost ``[...]`` or ``{...}`` span.
    Returns (is_valid, candidate_text).
    """
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
    try:
        json.loads(cleaned)
        return True, cleaned
    except json.JSONDecodeError:
        # Models sometimes wrap JSON in prose; try the bracketed span alone.
        match = re.search(r"\[.*\]", cleaned, re.DOTALL) or re.search(r"\{.*\}", cleaned, re.DOTALL)
        if match:
            try:
                json.loads(match.group())
                return True, match.group()
            except json.JSONDecodeError:
                pass
    return False, cleaned


def warm_up(model):
    """Load *model* into memory with a trivial prompt; return True on success."""
    print(f"  Warmup {model}...", end=" ", flush=True)
    try:
        # call_llm already appends /no_think for qwen3 models; appending it
        # here as well would send the directive twice.
        call_llm(model, "Hi")
        print("OK")
        return True
    except Exception as e:
        print(f"FAIL: {e}")
        return False


def _run_model(model):
    """Run every test against *model* and return a list of result dicts."""
    model_results = []
    for test_name, prompt in TESTS:
        print(f"  {test_name:<20}", end=" ", flush=True)
        try:
            text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
            valid, cleaned = is_valid_json(text)
            model_results.append({
                "test": test_name,
                "latency_s": round(elapsed, 2),
                "tok_per_sec": round(tok_per_sec, 1),
                "eval_count": eval_count,
                "json_valid": valid,
                "response_len": len(text),
            })
            status = "✅" if valid else "❌"
            print(f"{status} {elapsed:5.1f}s | {tok_per_sec:5.0f} tok/s | {eval_count:5d} tokens")
        except Exception as e:
            # Record the failure so the summary tables stay index-aligned.
            model_results.append({"test": test_name, "error": str(e)})
            print(f"ERROR: {e}")
    return model_results


def _print_comparison(results):
    """Print the per-test latency table comparing the two models."""
    print(f"\n{'='*65}")
    print(" COMPARISON TABLE")
    print(f"{'='*65}")
    print(f"{'Test':<15} | {'qwen2.5:7b latency':>18} | {'qwen3:8b latency':>18} | {'Winner':>10}")
    print("-" * 65)
    for i, test_name in enumerate([t[0] for t in TESTS]):
        r25 = results.get(MODELS[0], [])[i] if i < len(results.get(MODELS[0], [])) else None
        r3 = results.get(MODELS[1], [])[i] if i < len(results.get(MODELS[1], [])) else None
        lat25 = f"{r25['latency_s']:.1f}s" if r25 and "latency_s" in r25 else "ERR"
        lat3 = f"{r3['latency_s']:.1f}s" if r3 and "latency_s" in r3 else "ERR"
        if r25 and r3 and "latency_s" in r25 and "latency_s" in r3:
            winner = "qwen2.5" if r25["latency_s"] <= r3["latency_s"] else "qwen3"
        else:
            winner = "—"
        print(f"{test_name:<15} | {lat25:>18} | {lat3:>18} | {winner:>10}")


def _print_validity(results):
    """Print how many responses per model were valid JSON."""
    print(f"\n{'='*65}")
    print(" JSON VALIDITY")
    print(f"{'='*65}")
    for model in MODELS:
        r = results.get(model, [])
        valid = sum(1 for x in r if x.get("json_valid"))
        print(f"  {model:<25} {valid}/{len(r)} valid")


def main():
    """Benchmark each model, print summary tables, and dump raw results."""
    results = {}
    for model in MODELS:
        print(f"\n{'='*60}")
        print(f"  {model}")
        print(f"{'='*60}")
        if not warm_up(model):
            continue
        results[model] = _run_model(model)

    _print_comparison(results)
    _print_validity(results)

    with open("/tmp/qwen3_benchmark_v2.json", "w") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    main()