#!/usr/bin/env python3
"""Benchmark qwen2.5-coder:7b vs phi4:14b on conflict resolution and rejection intent.

Tests structured reasoning tasks that the family assistant pipeline routes
through LLM_MODEL. Measures: latency, JSON validity, reasoning quality.

Usage:
    python benchmark_reasoning.py                  # Test default models on the Gaming PC
    python benchmark_reasoning.py --models qwen2.5-coder:7b phi4:14b
    python benchmark_reasoning.py --url http://127.0.0.1:11434/v1/chat/completions --models glm-5.1:cloud gemma4:31b-cloud
"""

import argparse
import json
import re
import time

import requests

# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------

CONFLICT_RESOLVE_PROMPT = """You are a family scheduling assistant. Given a conflict between two events, suggest resolution options.

Family members: Matt, Aundrea, Sullivan (child), Harper (child), Maggie (dog)
Priority rules: medical > pet > personal; kids need adults; split before reschedule

Return JSON array of options:
[{"option": N, "action": "SPLIT|REASSIGN|RESCHEDULE", "rationale": "...", "affects_event": 1|2, "affects_who": "name"}]"""

REJECTION_INTENT_PROMPT = """You are a family scheduling assistant. Parse the user's rejection message into a structured rule.

Return JSON:
{"pattern": "exact event summary to match", "reason": "why rejected", "scope": "all|event|newsletter"}

Scope rules:
- "all" = skip forever (user said "always", "forever", "never again")
- "event" = one-time skip, calendar only (default)
- "newsletter" = skip extraction only

Pattern must be the exact event summary, not a generalized keyword."""

TEST_CASES = [
    {
        "name": "conflict_simple",
        "system": CONFLICT_RESOLVE_PROMPT,
        "user": 'Conflict: "Maggie grooming" Sat 8:00-9:00 AM at Golrusk vs "First Communion Practice" Sat 9:00-10:30 AM at church. Both need a parent.',
    },
    {
        "name": "conflict_tough",
        "system": CONFLICT_RESOLVE_PROMPT,
        "user": 'Conflict: "Well Child Visit - Sullivan" Fri 10:00-11:00 AM at Aurora Bay Care vs "Sullivan School Field Trip" Fri 9:00 AM-1:00 PM at Botanical Gardens. Both require a parent to attend.',
    },
    {
        "name": "rejection_simple",
        "system": REJECTION_INTENT_PROMPT,
        "user": "We don't need that mass, ignore it forever",
    },
    {
        "name": "rejection_ambiguous",
        "system": REJECTION_INTENT_PROMPT,
        "user": "Skip the grooming appointment this week, Aundrea can handle it next time",
    },
]
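
# Illustrative responses for two of the cases above (hypothetical model output,
# shown only to document the shapes the scoring functions below reward):
#   conflict_simple  -> [{"option": 1, "action": "REASSIGN",
#                         "rationale": "Matt takes Maggie to Golrusk; Aundrea covers practice",
#                         "affects_event": 1, "affects_who": "Matt"}]
#   rejection_simple -> {"pattern": "Mass", "reason": "family does not attend", "scope": "all"}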

# ---------------------------------------------------------------------------
# Benchmark logic
# ---------------------------------------------------------------------------

def call_llm(url, model, system, user, timeout=120):
    """Call LLM and return (response_text, latency_seconds, error)."""
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "stream": False,
        # Top-level temperature/max_tokens are read by the OpenAI-compatible
        # /v1/chat/completions endpoint; Ollama's native /api/chat reads the
        # "options" block instead, so both are sent.
        "temperature": 0,
        "max_tokens": 2048,
        "options": {"temperature": 0, "num_predict": 2048},
    }
    start = time.time()
    try:
        resp = requests.post(url, json=payload, timeout=timeout)
        elapsed = time.time() - start
        if resp.status_code != 200:
            return None, elapsed, f"HTTP {resp.status_code}: {resp.text[:200]}"
        data = resp.json()
        # Accept both Ollama-native ("message") and OpenAI-style ("choices") response shapes.
        content = (
            data.get("message", {}).get("content", "")
            or data.get("choices", [{}])[0].get("message", {}).get("content", "")
        )
        return content.strip(), elapsed, None
    except requests.exceptions.Timeout:
        elapsed = time.time() - start
        return None, elapsed, "TIMEOUT"
    except Exception as e:
        elapsed = time.time() - start
        return None, elapsed, str(e)[:200]


def validate_json(text):
    """Try to extract valid JSON from LLM output. Returns (json_obj, valid_bool)."""
    if not text:
        return None, False
    # Try direct parse
    try:
        return json.loads(text), True
    except json.JSONDecodeError:
        pass
    # Try extracting from a markdown code block
    m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1).strip()), True
        except json.JSONDecodeError:
            pass
    # Fall back to brace/bracket matching, starting from whichever of "{" or "["
    # appears first so a JSON array is not truncated to its first object.
    pairs = sorted(
        [("{", "}"), ("[", "]")],
        key=lambda p: text.find(p[0]) if text.find(p[0]) >= 0 else len(text),
    )
    for start_char, end_char in pairs:
        idx = text.find(start_char)
        if idx >= 0:
            depth = 0
            for i in range(idx, len(text)):
                if text[i] == start_char:
                    depth += 1
                elif text[i] == end_char:
                    depth -= 1
                    if depth == 0:
                        try:
                            return json.loads(text[idx:i + 1]), True
                        except json.JSONDecodeError:
                            break
    return None, False
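
# A rough sense of what validate_json tolerates (illustrative inputs, not taken
# from any real model run): raw JSON, fenced ```json blocks, and JSON embedded
# in surrounding prose all parse; plain refusals do not.
#   validate_json('{"scope": "all"}')                        -> ({"scope": "all"}, True)
#   validate_json('Sure!\n```json\n{"scope": "all"}\n```')   -> ({"scope": "all"}, True)
#   validate_json("I cannot help with that.")                -> (None, False)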

def score_conflict(parsed):
    """Score conflict resolution quality (0-3)."""
    if not isinstance(parsed, list):
        return 0
    score = 0
    for opt in parsed:
        if not isinstance(opt, dict):
            continue
        if "option" in opt and "action" in opt:
            score += 1  # Has structure
        if opt.get("action") in ("SPLIT", "REASSIGN", "RESCHEDULE"):
            score += 1  # Valid action
        if "rationale" in opt and len(opt["rationale"]) > 10:
            score += 1  # Has reasoning
    return min(score, 3)


def score_rejection(parsed):
    """Score rejection intent quality (0-3)."""
    if not isinstance(parsed, dict):
        return 0
    score = 0
    if "pattern" in parsed and len(parsed["pattern"]) > 3:
        score += 1  # Has specific pattern
    if "reason" in parsed:
        score += 1  # Has reason
    if "scope" in parsed and parsed["scope"] in ("all", "event", "newsletter"):
        score += 1  # Valid scope
    return score


def run_benchmark(url, models, timeout=120):
    results = []
    for model in models:
        print(f"\n{'=' * 60}")
        print(f"Model: {model}")
        print(f"{'=' * 60}")
        for tc in TEST_CASES:
            name = tc["name"]
            print(f"\n  {name}...", end=" ", flush=True)
            text, latency, error = call_llm(url, model, tc["system"], tc["user"], timeout)
            if error:
                print(f"❌ {error} ({latency:.1f}s)")
                results.append({"model": model, "test": name, "latency": latency,
                                "valid": False, "score": 0, "error": error})
                continue
            parsed, valid = validate_json(text)
            if "conflict" in name:
                score = score_conflict(parsed)
            else:
                score = score_rejection(parsed)
            status = "✅" if valid else "⚠️"
            print(f"{status} {latency:.1f}s | score {score}/3 | valid={valid}")
            if valid:
                # Show a snippet of the parsed output
                snippet = json.dumps(parsed, indent=2)[:200]
                print(f"    {snippet}")
            else:
                print(f"    Raw: {text[:150]}")
            results.append({"model": model, "test": name, "latency": round(latency, 1),
                            "valid": valid, "score": score, "error": None})
    return results


def print_summary(results):
    print(f"\n{'=' * 60}")
    print("SUMMARY")
    print(f"{'=' * 60}")
    models = sorted(set(r["model"] for r in results))
    # Header
    header = f"{'Model':<25} {'Valid':>5} {'Score':>5} {'Avg(s)':>7} {'Errors':>6}"
    print(header)
    print("-" * len(header))
    for model in models:
        model_results = [r for r in results if r["model"] == model]
        valid_count = sum(1 for r in model_results if r["valid"])
        total_score = sum(r["score"] for r in model_results)
        avg_latency = sum(r["latency"] for r in model_results) / len(model_results) if model_results else 0
        errors = sum(1 for r in model_results if r["error"])
        valid_col = f"{valid_count}/{len(model_results)}"
        score_col = f"{total_score}/{len(model_results) * 3}"
        print(f"{model:<25} {valid_col:>5} {score_col:>5} {avg_latency:>6.1f}s {errors:>6}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark reasoning models for family assistant")
    parser.add_argument("--url", default="http://100.104.147.116:11434/v1/chat/completions",
                        help="LLM endpoint URL")
    parser.add_argument("--models", nargs="+", default=["qwen2.5-coder:7b", "phi4:14b"],
                        help="Models to benchmark")
    parser.add_argument("--timeout", type=int, default=120, help="Timeout per request (seconds)")
    args = parser.parse_args()

    print(f"Endpoint: {args.url}")
    print(f"Models: {args.models}")
    print(f"Timeout: {args.timeout}s")

    results = run_benchmark(args.url, args.models, args.timeout)
    print_summary(results)

    # Save results
    outfile = "/home/hoffmann_admin/.openclaw/workspace/memory/benchmark-reasoning-results.json"
    with open(outfile, "w") as f:
        json.dump({"timestamp": time.strftime("%Y-%m-%dT%H:%M:%S%Z"),
                   "url": args.url, "results": results}, f, indent=2)
    print(f"\nResults saved to {outfile}")
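
# Shape of the saved results file (illustrative values only, not a real run):
# {
#   "timestamp": "2025-01-01T12:00:00CST",
#   "url": "http://100.104.147.116:11434/v1/chat/completions",
#   "results": [
#     {"model": "qwen2.5-coder:7b", "test": "conflict_simple",
#      "latency": 4.2, "valid": true, "score": 3, "error": null}
#   ]
# }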