#!/usr/bin/env python3
"""
Multi-model comparison test for blog content generation.
Tests accuracy, compliance, and performance across local models.
"""
import httpx
import json
import time
from datetime import datetime
from typing import Dict, List, Optional
# Ollama endpoints on the gaming PC, reached over Tailscale.
OLLAMA_URL = "http://matt-pc.tail864e81.ts.net:11434/api/generate"
GAMING_PC_OLLAMA = "http://matt-pc.tail864e81.ts.net:11434"

# Models to test
MODELS = [
    "phi4:14b",
    "qwen2.5-coder:14b",
    "gemma4:latest",
    "llama3.1:8b"
]

# Real names to NEVER use (family privacy) and their generic stand-ins.
FORBIDDEN_NAMES = ["Aundrea", "Sullivan", "Harper", "Maggie"]
SAFE_REPLACEMENTS = {
    "Aundrea": "my wife",
    "Sullivan": "our son",
    "Harper": "our daughter",
    "Maggie": "the dog"
}

# Banned words list (AI-cliche vocabulary the system prompt forbids)
BANNED_WORDS = [
    "delve", "tapestry", "moreover", "leveraging", "holistic",
    "paradigm", "synergy", "unlock", "potential", "seamless",
    "robust", "streamline", "utilize", "embark", "journey",
    "transformative", "cutting-edge", "innovative", "groundbreaking",
    "leverage", "delving", "navigate", "landscape", "ecosystem",
    "empower", "facilitate", "optimize", "enhance", "foster"
]

# Grounding facts about our actual setup (for verification)
GROUNDING_FACTS = {
    "calendar_server": "Radicale",
    "server_ip": "0.0.0.0:5232",
    "auth_method": "htpasswd with bcrypt",
    "migration_method": "Python scripts using icalendar + caldav libraries",
    "webhook_parser": "Cloudflare Email Worker",
    "email_domain": "assistant@hoffdesk.com",
    "local_llm": "qwen2.5-coder:7b on Gaming PC via Tailscale",
    "infrastructure": "Beelink (titanium-butler) + Gaming PC (3080 Ti) via Tailscale",
    "family_assistant_location": "~/.openclaw/services/family_assistant/",
    "blog_api_path": "/api/blog"
}

# Shared system prompt prepended to every generation request; bakes the
# banned-word and forbidden-name lists directly into the instructions.
SYSTEM_PROMPT = f"""You are a technical writer with a sovereign, pragmatic voice.
Style rules:
- Start with the human moment, not the solution
- Admit wrong turns before revealing fixes
- Use specific versions, error messages, timestamps
- End with actionable takeaways
- NEVER use these words: {', '.join(BANNED_WORDS)}
- NEVER use real names: {', '.join(FORBIDDEN_NAMES)}
- Use generic references: "my wife", "our son", "our daughter", "the dog"
- Write like a competent engineer talking to another engineer
- Be specific about tools, versions, and configurations
"""
def generate(model: str, prompt: str, max_tokens: int = 2000, temperature: float = 0.7) -> tuple:
    """Generate content via Ollama's /api/generate endpoint.

    Args:
        model: Ollama model tag, e.g. "phi4:14b".
        prompt: User prompt; SYSTEM_PROMPT is prepended before sending.
        max_tokens: Token cap, passed as Ollama's "num_predict" option.
        temperature: Sampling temperature.

    Returns:
        A (response_text, elapsed_seconds) tuple. On any failure the text
        is "ERROR: <exception>" instead of raising, so a comparison run
        can log the failure and continue with the next model.
    """
    # perf_counter is monotonic, so the elapsed figure survives
    # NTP/wall-clock adjustments during a long generation.
    start = time.perf_counter()
    try:
        response = httpx.post(
            OLLAMA_URL,
            json={
                "model": model,
                "prompt": f"{SYSTEM_PROMPT}\n\n{prompt}",
                "stream": False,  # single JSON response, not chunked
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens,
                    "top_p": 0.9,
                },
            },
            timeout=600.0,  # 14B models can take minutes per request
        )
        response.raise_for_status()
        return response.json()["response"], time.perf_counter() - start
    except Exception as e:
        # Deliberately broad: network, HTTP-status, and JSON-shape errors
        # all degrade to an ERROR string so the caller keeps going.
        return f"ERROR: {e}", time.perf_counter() - start
def check_compliance(text: str) -> dict:
    """Check generated content for compliance issues.

    Scans for forbidden family names, banned vocabulary, and a few known
    hallucination patterns. Name/word matching is case-insensitive
    substring matching, so "potential" also flags "potentially".

    Returns:
        dict with keys: compliant (bool), issues (list[str]),
        banned_words_found (list[str]), has_code_blocks (bool),
        word_count (int), char_count (int).
    """
    issues: list = []
    # Hoisted: avoid re-lowering the whole text once per keyword.
    lowered = text.lower()

    # Check for forbidden names
    for name in FORBIDDEN_NAMES:
        if name.lower() in lowered:
            issues.append(f"FORBIDDEN_NAME: '{name}' found")

    # Check for banned words
    banned_found = [word for word in BANNED_WORDS if word.lower() in lowered]
    if banned_found:
        issues.append(f"BANNED_WORDS: {', '.join(banned_found)}")

    # Check for hallucinated details (simple pattern matching, case-sensitive
    # on purpose: these are proper nouns / exact phrases).
    if "Google Calendar API" in text or "google-api-python-client" in text:
        issues.append("HALLUCINATION: References Google Calendar API (we used webhook/email parsing)")
    if "NextCloud" in text and "Radicale" not in text:
        issues.append("HALLUCINATION: Mentions NextCloud without acknowledging we used Radicale")
    if "March" in text or "7:45 PM" in text or "Sunday morning" in text:
        issues.append("HALLUCINATION: Includes specific fictional timestamps")

    return {
        "compliant": not issues,
        "issues": issues,
        "banned_words_found": banned_found,
        # Fenced code blocks are a good sign for a technical post.
        "has_code_blocks": "```" in text,
        "word_count": len(text.split()),
        "char_count": len(text),
    }
def check_grounding(text: str) -> dict:
    """Score how well *text* aligns with known facts about our setup.

    Each named check is a substring test (case-insensitive where any
    capitalization is acceptable, exact-case for proper nouns). The score
    is the fraction of checks that pass, 0.0-1.0.

    Returns:
        dict with keys: grounding_score (float, 2 decimals),
        checks (dict[str, bool]), well_grounded (bool, score >= 0.5).
    """
    lowered = text.lower()
    # Redundant sibling conditions from the original collapsed: a
    # lowercase test already covers the exact-case variant, and
    # "Cloudflare" covers "Cloudflare Email Worker".
    grounding_checks = {
        "radicale_mentioned": "Radicale" in text,
        "caldav_mentioned": "caldav" in lowered,
        "cloudflare_mentioned": "Cloudflare" in text,
        "tailscale_mentioned": "tailscale" in lowered,
        "beelink_mentioned": "Beelink" in text or "titanium-butler" in text,
        "python_mentioned": "python" in lowered,
    }
    score = sum(grounding_checks.values()) / len(grounding_checks)
    return {
        "grounding_score": round(score, 2),
        "checks": grounding_checks,
        "well_grounded": score >= 0.5,
    }
def test_model(model: str) -> dict:
    """Run the full three-part benchmark (titles, draft, SEO) on one model.

    Args:
        model: Ollama model tag to test.

    Returns:
        dict with "model", "timestamp", and a "tests" dict holding the
        per-test elapsed times, raw output previews, and for the draft
        test the compliance and grounding reports.
    """
    print(f"\n{'='*60}")
    print(f"Testing: {model}")
    print('='*60)

    results = {
        "model": model,
        "timestamp": datetime.now().isoformat(),
        "tests": {}
    }

    # Test 1: Title Generation
    print("\n[1/3] Title Generation...")
    title_prompt = """Topic: Migrating from Google Calendar to a self-hosted CalDAV server
Generate 3 title options. Return ONLY a JSON array of strings.
Rules: No real names. No banned words."""
    titles_text, elapsed = generate(model, title_prompt, max_tokens=500)
    results["tests"]["titles"] = {
        "elapsed": round(elapsed, 1),
        "raw_output": titles_text[:500]
    }

    # Extract titles: prefer the first JSON array in the output; fall back
    # to scraping bullet/quoted lines when the model didn't emit clean JSON.
    titles = []
    try:
        start = titles_text.find("[")
        end = titles_text.rfind("]") + 1
        if start >= 0 and end > start:
            titles = json.loads(titles_text[start:end])
    except json.JSONDecodeError:
        # NOTE(review): original strip set contained a mojibake bullet;
        # restored as "•" — confirm against intended characters.
        lines = [l.strip("- •\"' ") for l in titles_text.split("\n") if l.strip()]
        titles = [l for l in lines if l and not l.startswith("{")][:3]
    results["tests"]["titles"]["extracted"] = titles
    print(f" Elapsed: {elapsed:.1f}s | Titles: {len(titles)}")

    # Test 2: Draft Generation — uses the first extracted title, or a
    # fixed fallback so the draft test still runs when extraction failed.
    print("\n[2/3] Draft Generation...")
    title = titles[0] if titles else "The Night I Broke DNS: A CalDAV Migration Story"
    draft_prompt = f"""Title: {title}
Write a 600-800 word blog post about migrating from Google Calendar to Radicale (self-hosted CalDAV).
Include:
- Why we did it (family privacy, data sovereignty)
- The stack: Radicale on Beelink, Cloudflare Email Worker, Python scripts
- Technical specifics: Tailscale, icalendar + caldav libs
- The moment it worked
- What you'd do differently
Rules:
- Use "my wife", "our kids" — NEVER real names
- Include Python code blocks
- Mention specific versions where relevant
- Be specific, not vague
"""
    draft, elapsed = generate(model, draft_prompt, max_tokens=3000)
    compliance = check_compliance(draft)
    grounding = check_grounding(draft)
    results["tests"]["draft"] = {
        "elapsed": round(elapsed, 1),
        "compliance": compliance,
        "grounding": grounding,
        "preview": draft[:600] + "..." if len(draft) > 600 else draft
    }
    print(f" Elapsed: {elapsed:.1f}s | Words: {compliance['word_count']} | Compliant: {compliance['compliant']}")
    print(f" Grounding: {grounding['grounding_score']*100:.0f}%")
    if compliance["issues"]:
        for issue in compliance["issues"]:
            print(f" WARNING: {issue}")

    # Test 3: SEO Generation — feeds the draft's opening back to the model.
    print("\n[3/3] SEO Metadata...")
    seo_prompt = f"""Generate SEO metadata for this blog post:
{draft[:500]}
Return ONLY JSON:
{{"excerpt": "1-2 sentences", "tags": "tag1, tag2, tag3", "meta_description": "under 160 chars"}}
No real names. No banned words."""
    seo_text, elapsed = generate(model, seo_prompt, max_tokens=500)
    results["tests"]["seo"] = {
        "elapsed": round(elapsed, 1),
        "raw_output": seo_text[:400]
    }
    print(f" Elapsed: {elapsed:.1f}s")
    return results
def run_comparison():
    """Run the benchmark across all MODELS, print a summary, save JSON.

    Each model is tested independently; a failure in one model is
    recorded as an error entry and the run continues. Results are
    written to a timestamped JSON file in the workspace directory.
    """
    print("\n" + "="*60)
    print("MULTI-MODEL BLOG GENERATION COMPARISON")
    print("="*60)
    print(f"Models: {', '.join(MODELS)}")
    print(f"Time: {datetime.now().isoformat()}")
    print("="*60)

    all_results = []
    for model in MODELS:
        try:
            result = test_model(model)
            all_results.append(result)
        except Exception as e:
            # One broken model must not kill the whole comparison.
            print(f"FAILED: {model}: {e}")
            all_results.append({
                "model": model,
                "error": str(e),
                "timestamp": datetime.now().isoformat()
            })

    # Summary
    print("\n" + "="*60)
    print("COMPARISON SUMMARY")
    print("="*60)
    for r in all_results:
        if "error" in r:
            print(f"\n{r['model']}: FAILED ({r['error']})")
            continue
        draft_test = r["tests"].get("draft", {})
        compliance = draft_test.get("compliance", {})
        grounding = draft_test.get("grounding", {})
        total_time = sum(
            t.get("elapsed", 0)
            for t in r["tests"].values()
        )
        print(f"\n{r['model']}:")
        print(f" Total time: {total_time:.1f}s")
        print(f" Words: {compliance.get('word_count', 0)}")
        # Original had an f-string broken across a physical newline here
        # (syntax error); restored as a PASS/FAIL flag.
        print(f" Compliant: {'PASS' if compliance.get('compliant') else 'FAIL'}")
        print(f" Grounding: {grounding.get('grounding_score', 0)*100:.0f}%")
        if compliance.get("issues"):
            print(f" Issues: {len(compliance['issues'])}")

    # Save results.
    # TODO(review): path is hardcoded to one user's home — consider
    # deriving it from an env var or expanduser.
    output_file = f"/home/hoffmann_admin/.openclaw/workspace-socrates/model_comparison_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(output_file, 'w') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nSaved full results: {output_file}")
    print("="*60)
# Script entry point: run the comparison only when executed directly
# (original used bare `name == "main"`, which raises NameError).
if __name__ == "__main__":
    run_comparison()