#!/usr/bin/env python3
"""Benchmark deepseek-r1 vs phi4:14b on reasoning tasks."""
import json
import re
import time
import requests
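# Ollama host (a 100.x address, likely a Tailscale machine); adjust for your environment.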
LLM_URL = "http://100.104.147.116:11434"
REASONING_MODELS = ["phi4:14b", "deepseek-r1:8b", "deepseek-r1:14b"]
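# phi4:14b is the incumbent; the deepseek-r1 sizes are the challengers. All three must already be pulled on the Ollama host.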
# --- Reasoning-heavy prompts (not just JSON extraction) ---
CONFLICT_PROMPT = """You are a family scheduling assistant. Two events conflict:
Event A: "Sullivan well child visit" on April 22, 10:00-10:45 AM at Green Bay Pediatrics
Event B: "Sullivan OT session" on April 22, 10:30-11:15 AM at Therapy Center (45 min away)
Family: Matt (dad), Aundrea (mom), Sullivan (9, needs adult), Harper (7, needs adult), Maggie (dog)
Priority rules: medical > therapy > personal; kids need adult supervision; 45-min drive between locations
Explain the conflict, then recommend a specific resolution (SPLIT, REASSIGN, or RESCHEDULE) with reasoning.
Return JSON: {resolution_type, explanation, actions:[{event, new_time, who_attends}]}
"""
REJECTION_INTENT_PROMPT = """Parse the user's intent about rejecting/cancelling recurring items.
User message: "we don't need that mass, ignore it forever"
Context: The user is referring to "First Communion Mass (Sullivan)" on May 3, 2026.
Determine:
- Is this a rejection rule to save for future emails?
- Is this a one-time cancellation?
- What scope: "all" (skip extraction + calendar), "event" (just skip calendar), or "newsletter" (just skip extraction)?
- What pattern should match future occurrences?
Return JSON with: intent_type, scope, pattern, is_persistent_rule, reasoning
"""
NEWSLETTER_DEDUP_PROMPT = """Determine if these two newsletter items are duplicates:
Item A: {"title": "Summer Camp Registration Opens", "source": "St. Bernard School", "deadline": "May 1", "action_required": "register online"}
Item B: {"title": "Register for Summer Camp Now", "source": "St. Bernard Newsletter", "deadline": "May 1st", "action_required": "signup at signupgenius.com"}
Consider: same event described differently? Same deadline? Same required action?
Return JSON: {is_duplicate (boolean), confidence (0-1), reasoning}
"""
SCHEDULING_PUZZLE_PROMPT = """Matt needs to schedule 3 activities for Sullivan this week:
- Piano lesson: needs 30 min, any weekday 3-6 PM, preferably Tuesday or Thursday
- Speech therapy: needs 45 min, only available Mon/Wed/Fri mornings
- Dentist: needs 60 min, any day except Friday, before 4 PM
Aundrea works Mon-Thu 9-5. Matt is flexible except Thu morning.
Find valid slots. Return JSON: {piano_day_time, speech_day_time, dentist_day_time, reasoning}
"""
TESTS = [
("conflict_resolve", CONFLICT_PROMPT),
("rejection_intent", REJECTION_INTENT_PROMPT),
("newsletter_dedup", NEWSLETTER_DEDUP_PROMPT),
("scheduling_puzzle", SCHEDULING_PUZZLE_PROMPT),
]
def call_llm(model, prompt):
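    """Send one non-streaming /api/generate request; return (text, wall_seconds, tok_per_sec, eval_count)."""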
payload = {
"model": model,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.1, "num_predict": 2048},
}
start = time.time()
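    # 120 s can be tight for a 14b reasoning model emitting up to 2048 tokens; raise if requests time out.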
resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
elapsed = time.time() - start
    resp.raise_for_status()
    data = resp.json()
text = data.get("response", "")
eval_count = data.get("eval_count", 0)
eval_duration_ns = data.get("eval_duration", 0)
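    # Ollama reports eval_duration in nanoseconds; convert to tokens per second.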
tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0
return text, elapsed, tok_per_sec, eval_count
def has_reasoning(text):
"""Check if response shows chain-of-thought reasoning."""
    markers = ["think", "reason", "because", "therefore", "first", "second", "finally", "step"]
return any(m in text.lower() for m in markers)
def is_valid_json(text):
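    """Return True if the response parses as JSON, tolerating markdown fences and surrounding prose."""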
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"\s*```$", "", cleaned)
try:
json.loads(cleaned)
return True
except json.JSONDecodeError:
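        # Fall back to extracting the outermost JSON array or object embedded in surrounding prose.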
        match = re.search(r"\[.*\]", cleaned, re.DOTALL) or re.search(r"\{.*\}", cleaned, re.DOTALL)
if match:
try:
json.loads(match.group())
return True
except json.JSONDecodeError:
pass
return False
def warm_up(model):
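    """Load the model with a trivial prompt so cold-start time doesn't skew the timed runs."""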
print(f" Warmup {model}...", end=" ", flush=True)
try:
call_llm(model, "Hi")
print("OK")
return True
except Exception as e:
print(f"FAIL: {e}")
return False
def main():
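    """Run every reasoning prompt against each model, print comparison tables, and save raw results."""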
results = {}
for model in REASONING_MODELS:
print(f"\n{'='*60}")
print(f" {model}")
print(f"{'='*60}")
if not warm_up(model):
continue
model_results = []
for test_name, prompt in TESTS:
print(f" {test_name:<20}", end=" ", flush=True)
try:
text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
valid_json = is_valid_json(text)
has_reason = has_reasoning(text)
model_results.append({
"test": test_name,
"latency_s": round(elapsed, 2),
"tok_per_sec": round(tok_per_sec, 1),
"eval_count": eval_count,
"json_valid": valid_json,
"has_reasoning": has_reason,
"response_len": len(text),
})
status = "✅" if valid_json else "❌"
reason_marker = "🧠" if has_reason else " "
print(f"{status}{reason_marker} {elapsed:5.1f}s | {tok_per_sec:5.0f}/s | {eval_count:5d}tok")
except Exception as e:
model_results.append({"test": test_name, "error": str(e)})
print(f"ERROR: {e}")
results[model] = model_results
# Summary table
print(f"\n{'='*75}")
print(f" COMPARISON: Reasoning Tasks")
print(f"{'='*75}")
print(f"{'Test':<18} | {'phi4:14b':<17} | {'r1:8b':<17} | {'r1:14b':<17}")
print("-" * 75)
for i, test_name in enumerate([t[0] for t in TESTS]):
row = f"{test_name:<18}"
for model in REASONING_MODELS:
r = results.get(model, [])
if i < len(r) and "latency_s" in r[i]:
d = r[i]
lat = f"{d['latency_s']:.1f}s"
qual = "✅" if d["json_valid"] else "❌"
think = "🧠" if d.get("has_reasoning") else " "
row += f" | {qual}{think} {lat:>8}"
else:
row += f" | {'ERR':<17}"
print(row)
# Quality comparison
print(f"\n{'='*75}")
print(f" QUALITY SCORES (JSON + Reasoning)")
print(f"{'='*75}")
for model in REASONING_MODELS:
r = results.get(model, [])
json_ok = sum(1 for x in r if x.get("json_valid"))
reason_ok = sum(1 for x in r if x.get("has_reasoning"))
print(f" {model:<25} JSON: {json_ok}/{len(r)} | Reasoning: {reason_ok}/{len(r)}")
# Recommendation
print(f"\n{'='*75}")
print(f" RECOMMENDATION")
print(f"{'='*75}")
# Find best model
best_score = -1
best_model = None
for model in REASONING_MODELS:
r = results.get(model, [])
if not r:
continue
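        # Score = tests passing both checks (valid JSON and visible reasoning) minus a latency penalty (total seconds / 10).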
score = sum(1 for x in r if x.get("json_valid") and x.get("has_reasoning")) - (sum(x["latency_s"] for x in r if "latency_s" in x) / 10)
if score > best_score:
best_score = score
best_model = model
if best_model:
print(f" Best for local reasoning: {best_model}")
print(f" (Tradeoff: Keep phi4:14b for speed, switch to deepseek-r1:14b for quality)")
else:
print(" No clear winner — phi4:14b remains fastest")
with open("/tmp/deepseek_benchmark.json", "w") as f:
json.dump(results, f, indent=2)
print(f"\n Saved to /tmp/deepseek_benchmark.json")
if __name__ == "__main__":
main()