#!/usr/bin/env python3
"""Benchmark deepseek-r1 vs phi4:14b on reasoning tasks.

Runs four reasoning-heavy prompts against each model on a local Ollama
server (LLM_URL) and writes results to /tmp/deepseek_benchmark.json.
"""
import json
import re
import time

import requests

LLM_URL = "http://100.104.147.116:11434"
REASONING_MODELS = ["phi4:14b", "deepseek-r1:8b", "deepseek-r1:14b"]

# --- Reasoning-heavy prompts (not just JSON extraction) ---

CONFLICT_PROMPT = """You are a family scheduling assistant. Two events conflict:

Event A: "Sullivan well child visit" on April 22, 10:00-10:45 AM at Green Bay Pediatrics
Event B: "Sullivan OT session" on April 22, 10:30-11:15 AM at Therapy Center (45 min away)

Family: Matt (dad), Aundrea (mom), Sullivan (9, needs adult), Harper (7, needs adult), Maggie (dog)
Priority rules: medical > therapy > personal; kids need adult supervision; 45-min drive between locations

Explain the conflict, then recommend a specific resolution (SPLIT, REASSIGN, or RESCHEDULE) with reasoning.
Return JSON: {resolution_type, explanation, actions:[{event, new_time, who_attends}]}
"""

REJECTION_INTENT_PROMPT = """Parse the user's intent about rejecting/cancelling recurring items.

User message: "we don't need that mass, ignore it forever"
Context: The user is referring to "First Communion Mass (Sullivan)" on May 3, 2026.

Determine:
- Is this a rejection rule to save for future emails?
- Is this a one-time cancellation?
- What scope: "all" (skip extraction + calendar), "event" (just skip calendar), or "newsletter" (just skip extraction)?
- What pattern should match future occurrences?

Return JSON with: intent_type, scope, pattern, is_persistent_rule, reasoning
"""

NEWSLETTER_DEDUP_PROMPT = """Determine if these two newsletter items are duplicates:

Item A: {"title": "Summer Camp Registration Opens", "source": "St. Bernard School", "deadline": "May 1", "action_required": "register online"}
Item B: {"title": "Register for Summer Camp Now", "source": "St. Bernard Newsletter", "deadline": "May 1st", "action_required": "signup at signupgenius.com"}

Consider: same event described differently? Same deadline? Same required action?
Return JSON: {is_duplicate (boolean), confidence (0-1), reasoning}
"""

SCHEDULING_PUZZLE_PROMPT = """Matt needs to schedule 3 activities for Sullivan this week:
- Piano lesson: needs 30 min, any weekday 3-6 PM, preferably Tuesday or Thursday
- Speech therapy: needs 45 min, only available Mon/Wed/Fri mornings
- Dentist: needs 60 min, any day except Friday, before 4 PM

Aundrea works Mon-Thu 9-5. Matt is flexible except Thu morning. Find valid slots.
Return JSON: {piano_day_time, speech_day_time, dentist_day_time, reasoning}
"""

TESTS = [
    ("conflict_resolve", CONFLICT_PROMPT),
    ("rejection_intent", REJECTION_INTENT_PROMPT),
    ("newsletter_dedup", NEWSLETTER_DEDUP_PROMPT),
    ("scheduling_puzzle", SCHEDULING_PUZZLE_PROMPT),
]


def call_llm(model, prompt):
    """Send one non-streaming generate request to Ollama and time it."""
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    start = time.time()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()
    elapsed = time.time() - start
    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    # Ollama reports eval_duration in nanoseconds.
    eval_duration_ns = data.get("eval_duration", 0)
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
    return text, elapsed, tok_per_sec, eval_count


def has_reasoning(text):
    """Crude substring heuristic: does the response show chain-of-thought reasoning?"""
    # "think" also matches "thinking" and deepseek-r1's <think> blocks.
    markers = ["think", "reason", "because", "therefore",
               "first", "second", "finally", "step"]
    return any(m in text.lower() for m in markers)


def is_valid_json(text):
    """Accept bare JSON, fenced JSON, or JSON embedded in surrounding prose."""
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned.strip())
    try:
        json.loads(cleaned)
        return True
    except json.JSONDecodeError:
        # Fall back to the first bracketed span that parses.
        match = re.search(r"\[.*\]", cleaned, re.DOTALL) or re.search(r"\{.*\}", cleaned, re.DOTALL)
        if match:
            try:
                json.loads(match.group())
                return True
            except json.JSONDecodeError:
                pass
    return False


def warm_up(model):
    """Load the model into memory so the first timed run isn't penalized."""
    print(f"  Warmup {model}...", end=" ", flush=True)
    try:
        call_llm(model, "Hi")
        print("OK")
        return True
    except Exception as e:
        print(f"FAIL: {e}")
        return False


def main():
    results = {}
    for model in REASONING_MODELS:
        print(f"\n{'=' * 60}")
        print(f"  {model}")
        print(f"{'=' * 60}")
        if not warm_up(model):
            continue
        model_results = []
        for test_name, prompt in TESTS:
            print(f"  {test_name:<20}", end=" ", flush=True)
            try:
                text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
                valid_json = is_valid_json(text)
                has_reason = has_reasoning(text)
                model_results.append({
                    "test": test_name,
                    "latency_s": round(elapsed, 2),
                    "tok_per_sec": round(tok_per_sec, 1),
                    "eval_count": eval_count,
                    "json_valid": valid_json,
                    "has_reasoning": has_reason,
                    "response_len": len(text),
                })
                status = "✅" if valid_json else "❌"
                reason_marker = "🧠" if has_reason else " "
                print(f"{status}{reason_marker} {elapsed:5.1f}s | {tok_per_sec:5.0f}/s | {eval_count:5d}tok")
            except Exception as e:
                model_results.append({"test": test_name, "error": str(e)})
                print(f"ERROR: {e}")
        results[model] = model_results

    # Summary table
    print(f"\n{'=' * 75}")
    print("  COMPARISON: Reasoning Tasks")
    print(f"{'=' * 75}")
    print(f"{'Test':<18} | {'phi4:14b':<17} | {'r1:8b':<17} | {'r1:14b':<17}")
    print("-" * 75)
    for i, test_name in enumerate([t[0] for t in TESTS]):
        row = f"{test_name:<18}"
        for model in REASONING_MODELS:
            r = results.get(model, [])
            if i < len(r) and "latency_s" in r[i]:
                d = r[i]
                lat = f"{d['latency_s']:.1f}s"
                qual = "✅" if d["json_valid"] else "❌"
                think = "🧠" if d.get("has_reasoning") else " "
                row += f" | {qual}{think} {lat:>8}"
            else:
                row += f" | {'ERR':<17}"
        print(row)

    # Quality comparison
    print(f"\n{'=' * 75}")
    print("  QUALITY SCORES (JSON + Reasoning)")
    print(f"{'=' * 75}")
    for model in REASONING_MODELS:
        r = results.get(model, [])
        json_ok = sum(1 for x in r if x.get("json_valid"))
        reason_ok = sum(1 for x in r if x.get("has_reasoning"))
        print(f"  {model:<25} JSON: {json_ok}/{len(r)} | Reasoning: {reason_ok}/{len(r)}")

    # Recommendation: reward tests that are both valid JSON and show
    # reasoning, with a small penalty for total latency.
    print(f"\n{'=' * 75}")
    print("  RECOMMENDATION")
    print(f"{'=' * 75}")
    # Start at -inf so a model with a net-negative score (slow but present)
    # can still be selected over having no winner at all.
    best_score = float("-inf")
    best_model = None
    for model in REASONING_MODELS:
        r = results.get(model, [])
        if not r:
            continue
        quality = sum(1 for x in r if x.get("json_valid") and x.get("has_reasoning"))
        latency_penalty = sum(x["latency_s"] for x in r if "latency_s" in x) / 10
        score = quality - latency_penalty
        if score > best_score:
            best_score = score
            best_model = model
    if best_model:
        print(f"  Best for local reasoning: {best_model}")
        print("  (Tradeoff: Keep phi4:14b for speed, switch to deepseek-r1:14b for quality)")
    else:
        print("  No clear winner; phi4:14b remains fastest")

    with open("/tmp/deepseek_benchmark.json", "w") as f:
        json.dump(results, f, indent=2)
    print("\n  Saved to /tmp/deepseek_benchmark.json")


if __name__ == "__main__":
    main()
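
# Shape of the saved results file (a sketch: keys mirror the dicts built in
# main(); the values below are illustrative placeholders, not measurements):
#
#   {
#     "phi4:14b": [
#       {"test": "conflict_resolve", "latency_s": 12.3, "tok_per_sec": 45.0,
#        "eval_count": 512, "json_valid": true, "has_reasoning": true,
#        "response_len": 1800},
#       ...
#     ],
#     ...
#   }
#
# Tests that raised an exception appear as {"test": ..., "error": ...} instead.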