#!/usr/bin/env python3
"""Benchmark qwen2.5-coder:7b vs phi4:14b on conflict resolution and rejection intent.
Tests structured reasoning tasks that the family assistant pipeline routes
through LLM_MODEL. Measures: latency, JSON validity, reasoning quality.
Usage:
python benchmark_reasoning.py # Test all models on Gaming PC
python benchmark_reasoning.py --url
python benchmark_reasoning.py --url http://127.0.0.1:11434/v1/chat/completions --models glm-5.1:cloud gemma4:31b-cloud
"""
import argparse
import json
import re
import time

import requests
# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------
CONFLICT_RESOLVE_PROMPT = """You are a family scheduling assistant. Given a conflict between two events, suggest resolution options.

Family members: Matt, Aundrea, Sullivan (child), Harper (child), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule

Return a JSON array of options:
[{"option": N, "action": "SPLIT|REASSIGN|RESCHEDULE", "rationale": "...", "affects_event": 1|2, "affects_who": "name"}]"""
REJECTION_INTENT_PROMPT = """You are a family scheduling assistant. Parse the user's rejection message into a structured rule.

Return JSON:
{"pattern": "exact event summary to match", "reason": "why rejected", "scope": "all|event|newsletter"}

Scope rules:
- "all" = skip forever (user said "always", "forever", "never again")
- "event" = one-time skip, calendar only (default)
- "newsletter" = skip extraction only

Pattern must be the exact event summary, not a generalized keyword."""
TEST_CASES = [
{
"name": "conflict_simple",
"system": CONFLICT_RESOLVE_PROMPT,
"user": "Conflict: \"Maggie grooming\" Sat 8:00-9:00 AM at Golrusk vs \"First Communion Practice\" Sat 9:00-10:30 AM at church. Both need a parent.",
},
{
"name": "conflict_tough",
"system": CONFLICT_RESOLVE_PROMPT,
"user": "Conflict: \"Well Child Visit - Sullivan\" Fri 10:00-11:00 AM at Aurora Bay Care vs \"Sullivan School Field Trip\" Fri 9:00 AM-1:00 PM at Botanical Gardens. Both require a parent to attend.",
},
{
"name": "rejection_simple",
"system": REJECTION_INTENT_PROMPT,
"user": "We don't need that mass, ignore it forever",
},
{
"name": "rejection_ambiguous",
"system": REJECTION_INTENT_PROMPT,
"user": "Skip the grooming appointment this week, Aundrea can handle it next time",
},
]
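
# Illustrative only (hypothetical values, not produced by this script): a
# well-formed answer to "conflict_simple" would look roughly like
# [
#   {"option": 1, "action": "SPLIT", "rationale": "Matt takes Maggie to
#    grooming while Aundrea covers communion practice.",
#    "affects_event": 1, "affects_who": "Matt"},
#   {"option": 2, "action": "RESCHEDULE", "rationale": "Move grooming so one
#    parent can attend both.", "affects_event": 1, "affects_who": "Maggie"}
# ]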
# ---------------------------------------------------------------------------
# Benchmark logic
# ---------------------------------------------------------------------------
def call_llm(url, model, system, user, timeout=120):
"""Call LLM and return (response_text, latency_seconds, error)."""
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "stream": False,
        # OpenAI-compatible endpoints read these top-level sampling fields.
        "temperature": 0,
        "max_tokens": 2048,
        # Ollama's native /api/chat reads "options" instead; Ollama's
        # OpenAI-compatible endpoint ignores it.
        "options": {"temperature": 0, "num_predict": 2048},
    }
start = time.time()
try:
resp = requests.post(url, json=payload, timeout=timeout)
elapsed = time.time() - start
if resp.status_code != 200:
return None, elapsed, f"HTTP {resp.status_code}: {resp.text[:200]}"
        data = resp.json()
        # Handle both response shapes: Ollama-native {"message": {...}} and
        # OpenAI-compatible {"choices": [{"message": {...}}]}.
        content = (
            data.get("message", {}).get("content", "")
            or data.get("choices", [{}])[0].get("message", {}).get("content", "")
        )
        return content.strip(), elapsed, None
except requests.exceptions.Timeout:
elapsed = time.time() - start
return None, elapsed, "TIMEOUT"
except Exception as e:
elapsed = time.time() - start
return None, elapsed, str(e)[:200]
def validate_json(text):
"""Try to extract valid JSON from LLM output. Returns (json_obj, valid_bool)."""
if not text:
return None, False
# Try direct parse
try:
return json.loads(text), True
except json.JSONDecodeError:
pass
    # Try extracting from a fenced markdown code block (```json ... ```)
    m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1).strip()), True
        except json.JSONDecodeError:
            pass
    # Last resort: scan from the first "{" or "[" to its matching closer.
    # (Naive depth counting; braces inside JSON string values can fool it.)
for start_char, end_char in [("{", "}"), ("[", "]")]:
idx = text.find(start_char)
if idx >= 0:
depth = 0
for i in range(idx, len(text)):
if text[i] == start_char:
depth += 1
elif text[i] == end_char:
depth -= 1
if depth == 0:
try:
return json.loads(text[idx:i+1]), True
except json.JSONDecodeError:
break
return None, False
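
# Illustrative behavior of validate_json (hypothetical inputs):
#   validate_json('```json\n[{"option": 1}]\n```')   -> ([{"option": 1}], True)
#   validate_json('Sure! Here: {"scope": "event"}')  -> ({"scope": "event"}, True)
#   validate_json("I cannot help with that.")        -> (None, False)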
def score_conflict(parsed):
"""Score conflict resolution quality (0-3)."""
if not isinstance(parsed, list):
return 0
score = 0
for opt in parsed:
if not isinstance(opt, dict):
continue
if "option" in opt and "action" in opt:
score += 1 # Has structure
if opt.get("action") in ("SPLIT", "REASSIGN", "RESCHEDULE"):
score += 1 # Valid action
if "rationale" in opt and len(opt["rationale"]) > 10:
score += 1 # Has reasoning
return min(score, 3)
def score_rejection(parsed):
"""Score rejection intent quality (0-3)."""
if not isinstance(parsed, dict):
return 0
score = 0
if "pattern" in parsed and len(parsed["pattern"]) > 3:
score += 1 # Has specific pattern
if "reason" in parsed:
score += 1 # Has reason
if "scope" in parsed and parsed["scope"] in ("all", "event", "newsletter"):
score += 1 # Valid scope
return score
def run_benchmark(url, models, timeout=120):
results = []
for model in models:
print(f"\n{'='60}")
print(f"Model: {model}")
print(f"{'='60}")
for tc in TEST_CASES:
name = tc["name"]
print(f"\n {name}...", end=" ", flush=True)
text, latency, error = call_llm(url, model, tc["system"], tc["user"], timeout)
if error:
print(f"❌ {error} ({latency:.1f}s)")
results.append({"model": model, "test": name, "latency": latency, "valid": False, "score": 0, "error": error})
continue
parsed, valid = validate_json(text)
if "conflict" in name:
score = score_conflict(parsed)
else:
score = score_rejection(parsed)
status = "✅" if valid else "⚠️"
print(f"{status} {latency:.1f}s | score {score}/3 | valid={valid}")
if valid:
# Show a snippet of the parsed output
snippet = json.dumps(parsed, indent=2)[:200]
print(f" {snippet}")
else:
print(f" Raw: {text[:150]}")
results.append({"model": model, "test": name, "latency": round(latency, 1), "valid": valid, "score": score, "error": None})
return results
def print_summary(results):
print(f"\n{'='60}")
print("SUMMARY")
print(f"{'='60}")
models = sorted(set(r["model"] for r in results))
# Header
header = f"{'Model':<25} {'Valid':>5} {'Score':>5} {'Avg(s)':>7} {'Errors':>6}"
print(header)
print("-" * len(header))
for model in models:
model_results = [r for r in results if r["model"] == model]
valid_count = sum(1 for r in model_results if r["valid"])
total_score = sum(r["score"] for r in model_results)
avg_latency = sum(r["latency"] for r in model_results) / len(model_results) if model_results else 0
errors = sum(1 for r in model_results if r["error"])
print(f"{model:<25} {valid_count}/{len(model_results):>4} {total_score}/{len(model_results)*3:>4} {avg_latency:>6.1f}s {errors:>5}")
if name == "main":
parser = argparse.ArgumentParser(description="Benchmark reasoning models for family assistant")
parser.add_argument("--url", default="http://100.104.147.116:11434/v1/chat/completions", help="LLM endpoint URL")
parser.add_argument("--models", nargs="+", default=["qwen2.5-coder:7b", "phi4:14b"], help="Models to benchmark")
parser.add_argument("--timeout", type=int, default=120, help="Timeout per request (seconds)")
args = parser.parse_args()
print(f"Endpoint: {args.url}")
print(f"Models: {args.models}")
print(f"Timeout: {args.timeout}s")
results = run_benchmark(args.url, args.models, args.timeout)
print_summary(results)
# Save results
outfile = "/home/hoffmann_admin/.openclaw/workspace/memory/benchmark-reasoning-results.json"
with open(outfile, "w") as f:
json.dump({"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%Z"), "url": args.url, "results": results}, f, indent=2)
print(f"\nResults saved to {outfile}")