# bench_deepseek.py

#!/usr/bin/env python3

"""Benchmark deepseek-r1 vs phi4:14b on reasoning tasks."""

import json
import time
import requests

LLM_URL = "http://100.104.147.116:11434"
REASONING_MODELS = ["phi4:14b", "deepseek-r1:8b", "deepseek-r1:14b"]

# --- Reasoning-heavy prompts (not just JSON extraction) ---

# Conflict resolution: explain a scheduling clash and choose a concrete fix.
CONFLICT_PROMPT = """You are a family scheduling assistant. Two events conflict:

Event A: "Sullivan well child visit" on April 22, 10:00-10:45 AM at Green Bay Pediatrics
Event B: "Sullivan OT session" on April 22, 10:30-11:15 AM at Therapy Center (45 min away)

Family: Matt (dad), Aundrea (mom), Sullivan (9, needs adult), Harper (7, needs adult), Maggie (dog)

Priority rules: medical > therapy > personal; kids need adult supervision; 45-min drive between locations

Explain the conflict, then recommend a specific resolution (SPLIT, REASSIGN, or RESCHEDULE) with reasoning.

Return JSON: {resolution_type, explanation, actions:[{event, new_time, who_attends}]}
"""

# Intent parsing: classify a cancellation request and its persistence/scope.
REJECTION_INTENT_PROMPT = """Parse the user's intent about rejecting/cancelling recurring items.

User message: "we don't need that mass, ignore it forever"

Context: The user is referring to "First Communion Mass (Sullivan)" on May 3, 2026.

Determine:
- Is this a rejection rule to save for future emails?
- Is this a one-time cancellation?
- What scope: "all" (skip extraction + calendar), "event" (just skip calendar), or "newsletter" (just skip extraction)?
- What pattern should match future occurrences?

Return JSON with: intent_type, scope, pattern, is_persistent_rule, reasoning
"""

# Deduplication: decide whether two differently-worded items are the same event.
NEWSLETTER_DEDUP_PROMPT = """Determine if these two newsletter items are duplicates:

Item A: {"title": "Summer Camp Registration Opens", "source": "St. Bernard School", "deadline": "May 1", "action_required": "register online"}
Item B: {"title": "Register for Summer Camp Now", "source": "St. Bernard Newsletter", "deadline": "May 1st", "action_required": "signup at signupgenius.com"}

Consider: same event described differently? Same deadline? Same required action?

Return JSON: {is_duplicate (boolean), confidence (0-1), reasoning}
"""

# Constraint satisfaction: fit three activities into overlapping availability windows.
SCHEDULING_PUZZLE_PROMPT = """Matt needs to schedule 3 activities for Sullivan this week:
- Piano lesson: needs 30 min, any weekday 3-6 PM, preferably Tuesday or Thursday
- Speech therapy: needs 45 min, only available Mon/Wed/Fri mornings
- Dentist: needs 60 min, any day except Friday, before 4 PM

Aundrea works Mon-Thu 9-5. Matt is flexible except Thu morning.

Find valid slots. Return JSON: {piano_day_time, speech_day_time, dentist_day_time, reasoning}
"""

# (test_name, prompt) pairs — every model in REASONING_MODELS runs all of these.
TESTS = [
("conflict_resolve", CONFLICT_PROMPT),
("rejection_intent", REJECTION_INTENT_PROMPT),
("newsletter_dedup", NEWSLETTER_DEDUP_PROMPT),
("scheduling_puzzle", SCHEDULING_PUZZLE_PROMPT),
]

def call_llm(model, prompt):
    """Send one non-streaming generate request to the Ollama server.

    Args:
        model: Ollama model tag (e.g. "phi4:14b").
        prompt: Prompt text to send.

    Returns:
        Tuple of (response_text, wall_clock_seconds, tokens_per_second,
        eval_token_count).

    Raises:
        requests.RequestException: on connection failure or timeout.
        requests.HTTPError: if the server responds with an error status.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        # Low temperature for reproducible benchmark runs; cap output length.
        "options": {"temperature": 0.1, "num_predict": 2048},
    }
    start = time.time()
    resp = requests.post(f"{LLM_URL}/api/generate", json=payload, timeout=120)
    elapsed = time.time() - start
    # Fail loudly on HTTP errors instead of silently parsing an error body
    # as if it were a model response.
    resp.raise_for_status()

    data = resp.json()
    text = data.get("response", "")
    eval_count = data.get("eval_count", 0)
    eval_duration_ns = data.get("eval_duration", 0)  # Ollama reports nanoseconds
    tok_per_sec = eval_count / (eval_duration_ns / 1e9) if eval_duration_ns > 0 else 0

    return text, elapsed, tok_per_sec, eval_count

def has_reasoning(text):
    """Check if response shows chain-of-thought reasoning."""
    # Heuristic: any of these substrings suggests step-by-step reasoning.
    markers = (
        "think", "thinking", "reason", "because", "therefore",
        "first", "second", "finally", "step",
    )
    lowered = text.lower()
    for marker in markers:
        if marker in lowered:
            return True
    return False

def is_valid_json(text):
    """Return True if *text* contains parseable JSON.

    Handles three progressively messier model outputs:
    1. The whole response is bare JSON.
    2. The JSON is wrapped in a markdown code fence (```json ... ```).
    3. A JSON object/array is embedded inside surrounding prose.
    """
    import re
    # Strip a leading ``` or ```json fence marker and a trailing ``` fence.
    cleaned = re.sub(r"^```(?:json)?\s*", "", text.strip())
    cleaned = re.sub(r"```\s*$", "", cleaned.strip())
    try:
        json.loads(cleaned)
        return True
    except json.JSONDecodeError:
        # Fall back to the outermost {...} object or [...] array in the text.
        match = re.search(r"\{.*\}", cleaned, re.DOTALL) or re.search(r"\[.*\]", cleaned, re.DOTALL)
        if match:
            try:
                json.loads(match.group())
                return True
            except json.JSONDecodeError:
                pass
    return False

def warm_up(model):
    """Load *model* into server memory with a trivial prompt.

    Returns True when the model answered, False when the request failed
    (server down, model not pulled, timeout). The broad exception catch is
    deliberate: warmup is best-effort and the caller simply skips the model.
    """
    print(f" Warmup {model}...", end=" ", flush=True)
    try:
        call_llm(model, "Hi")
    except Exception as e:
        print(f"FAIL: {e}")
        return False
    print("OK")
    return True

def main():
    """Run every reasoning test against every model and print comparisons.

    For each model in REASONING_MODELS: warm it up (skipping models that
    fail to load), run all TESTS, and record latency, throughput, and two
    quality flags (valid JSON, visible reasoning). Then print a per-test
    comparison table, per-model quality scores, and a recommendation.
    Raw results are saved to /tmp/deepseek_benchmark.json.
    """
    results = {}

    for model in REASONING_MODELS:
        print(f"\n{'='*60}")
        print(f"  {model}")
        print(f"{'='*60}")

        # Skip unloadable models rather than aborting the whole benchmark.
        if not warm_up(model):
            continue

        model_results = []
        for test_name, prompt in TESTS:
            print(f"  {test_name:<20}", end=" ", flush=True)
            try:
                text, elapsed, tok_per_sec, eval_count = call_llm(model, prompt)
                valid_json = is_valid_json(text)
                has_reason = has_reasoning(text)
                model_results.append({
                    "test": test_name,
                    "latency_s": round(elapsed, 2),
                    "tok_per_sec": round(tok_per_sec, 1),
                    "eval_count": eval_count,
                    "json_valid": valid_json,
                    "has_reasoning": has_reason,
                    "response_len": len(text),
                })
                status = "✅" if valid_json else "❌"
                reason_marker = "🧠" if has_reason else "  "
                print(f"{status}{reason_marker} {elapsed:5.1f}s | {tok_per_sec:5.0f}/s | {eval_count:5d}tok")
            except Exception as e:
                # Record the failure so summary rows stay index-aligned per test.
                model_results.append({"test": test_name, "error": str(e)})
                print(f"ERROR: {e}")

        results[model] = model_results

    # Summary table
    print(f"\n{'='*75}")
    print(f"  COMPARISON: Reasoning Tasks")
    print(f"{'='*75}")
    print(f"{'Test':<18} | {'phi4:14b':<17} | {'r1:8b':<17} | {'r1:14b':<17}")
    print("-" * 75)

    for i, test_name in enumerate([t[0] for t in TESTS]):
        row = f"{test_name:<18}"
        for model in REASONING_MODELS:
            r = results.get(model, [])
            # A result entry without "latency_s" is an error record.
            if i < len(r) and "latency_s" in r[i]:
                d = r[i]
                lat = f"{d['latency_s']:.1f}s"
                qual = "✅" if d["json_valid"] else "❌"
                think = "🧠" if d.get("has_reasoning") else "  "
                row += f" | {qual}{think} {lat:>8}"
            else:
                row += f" | {'ERR':<17}"
        print(row)

    # Quality comparison
    print(f"\n{'='*75}")
    print(f"  QUALITY SCORES (JSON + Reasoning)")
    print(f"{'='*75}")
    for model in REASONING_MODELS:
        r = results.get(model, [])
        json_ok = sum(1 for x in r if x.get("json_valid"))
        reason_ok = sum(1 for x in r if x.get("has_reasoning"))
        print(f"  {model:<25} JSON: {json_ok}/{len(r)} | Reasoning: {reason_ok}/{len(r)}")

    # Recommendation
    print(f"\n{'='*75}")
    print(f"  RECOMMENDATION")
    print(f"{'='*75}")

    # Find best model: quality (valid JSON *and* visible reasoning) minus a
    # latency penalty of 1 point per 10 seconds of total wall-clock time.
    best_score = -1
    best_model = None
    for model in REASONING_MODELS:
        r = results.get(model, [])
        if not r:
            continue
        score = sum(1 for x in r if x.get("json_valid") and x.get("has_reasoning")) - (sum(x["latency_s"] for x in r if "latency_s" in x) / 10)
        if score > best_score:
            best_score = score
            best_model = model

    if best_model:
        print(f"  Best for local reasoning: {best_model}")
        print(f"  (Tradeoff: Keep phi4:14b for speed, switch to deepseek-r1:14b for quality)")
    else:
        print("  No clear winner — phi4:14b remains fastest")

    with open("/tmp/deepseek_benchmark.json", "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n  Saved to /tmp/deepseek_benchmark.json")

# Standard script entry guard (the dunders were mangled to name/main in the
# original paste, which would raise NameError at import time).
if __name__ == "__main__":
    main()