"""
Test real-incident injection into V2 pipeline.
This script tests the theory that phi4:14b produces authentic content
when given real incident data vs. hallucinated briefs.
Real incident: Cloudflare Tunnel Error 1033 (2026-04-23)
"""
import json
import sys
sys.path.insert(0, '/home/hoffmann_admin/.openclaw/workspace-socrates/hoffdesk-api')
import asyncio
from content.pipeline import (
generate_ollama, extract_json,
stage_draft, stage_seo, stage_compliance
)
from content.compliance_filter import ComplianceFilter
from blog.generation.prompts import build_struggle_first_prompt
# ===============================================================
# REAL INCIDENT DATA (from memory/2026-04-23.md + cloudflared-fix.md)
# ===============================================================
# Ground-truth incident brief transcribed from real session logs
# (memory/2026-04-23.md + cloudflared-fix.md). Fed directly to
# build_struggle_first_prompt() in test_real_incident_pipeline() below,
# bypassing pipeline Stages 1+2. Values are verbatim operator notes —
# the wording is intentionally left untouched (authenticity is the point).
REAL_INCIDENT = {
    "struggle_angle": "Error 1033: Cloudflare can't reach origin on notes.hoffdesk.com",
    "origin_story": "I was testing the blog admin panel after deploying Daedalus' templates. Clicked the preview button. Instead of the admin dashboard, Cloudflare served Error 1033. The tunnel was up โ I could see it in cloudflared tunnel list โ but requests died somewhere between Cloudflare's edge and my Beelink.",
    # Each entry records one debugging attempt and why it did not resolve
    # the error; ordering mirrors the actual debugging session.
    "attempts": [
        {
            "attempt": "Checked if uvicorn was running. curl http://127.0.0.1:8000/health returned healthy. Service was fine locally.",
            "why_failed": "Uvicorn was bound to 127.0.0.1, not 0.0.0.0. Cloudflared on the same machine couldn't reach it. Classic localhost-only binding mistake."
        },
        {
            "attempt": "Added a systemd drop-in to bind uvicorn to 0.0.0.0:8000. Restarted service. Local health check still passed.",
            "why_failed": "Cloudflared was running with --token (managed tunnel mode) instead of reading local config.yml. The managed config didn't have an ingress rule for notes.hoffdesk.com."
        },
        {
            "attempt": "Checked cloudflared service file. Found correct command with --config /home/hoffmann_admin/.cloudflared/config.yml run. Should work.",
            "why_failed": "/etc/systemd/system/cloudflared.service.d/override.conf existed from an earlier fix attempt. It had the OLD command syntax (--config after run). Systemd drop-ins override the main unit. The override was winning."
        }
    ],
    "the_moment": "Realized there were THREE layers of config fighting: (1) uvicorn binding, (2) cloudflared managed vs local mode, (3) a stale systemd override from a previous debug session. Each layer looked correct in isolation. The override.conf was the invisible hand breaking everything.",
    "the_fix": "Removed the stale override.conf, reloaded systemd, restarted cloudflared with local config. Notes.hoffdesk.com came back immediately. Caveat: I still need to migrate the tunnel fully to local config โ right now it's a hybrid that works but isn't clean.",
    "reflection": "I should have checked for existing systemd overrides before editing the main service file. The systemctl cat command shows drop-ins โ I knew this, but forgot under pressure. Also: Error 1033 is a generic 'origin unreachable' message. It doesn't tell you WHICH layer is broken. You have to peel them like an onion. Next time I'll start with systemctl cat and work backwards.",
    # Target word count for the generated draft — presumably consumed by
    # the prompt builder; confirm against build_struggle_first_prompt().
    "target_length": 1200,
    "timestamp": "2026-04-23T01:15:00Z",
    "location": "titanium-butler (Beelink)",
    "systems": ["uvicorn", "cloudflared", "systemd", "Cloudflare Tunnel"],
    "error_message": "Error 1033: Cloudflare can't reach origin"
}
# ===============================================================
# STYLE EXAMPLE: Real Matt writing (simulated from known voice)
# ===============================================================
# Voice/style reference spliced into the draft prompt by
# test_real_incident_pipeline() (inserted right after the
# "=== STYLE EXAMPLE ===" marker). This text is prompt content, not a
# comment — editing it changes what the model is asked to imitate.
STYLE_EXAMPLE = """Style Reference: Sarcastically Serious Lab Notes
Opening pattern: Timestamp + location + dry understatement of crisis
- "It was 03:14 CST. titanium-butler had been running for 47 days. I was about to find out what happens when you forget about a systemd override."
Voice rules:
- Deadpan delivery of technical facts
- Self-deprecating admission of mistakes
- Specific version numbers, error codes, config file paths
- Internal monologue in quotes: "I thought..."
- Cost accounting: time, sleep, relationship friction (but generic)
- No exclamation marks. Periods only. Let the absurdity speak for itself.
Example tone:
"The tunnel was up. The service was healthy. The DNS resolved. Everything looked correct. Nothing worked. This is the kind of problem that makes you question whether computers are actually deterministic or just gaslighting you in groups."
"""
# ===============================================================
# BYPASS STAGES 1+2: Feed real incident directly to Stage 3 (Draft)
# ===============================================================
async def test_real_incident_pipeline():
    """Test pipeline with real incident data injected directly.

    Bypasses Stages 1+2 by feeding REAL_INCIDENT straight into the Stage 3
    draft prompt, calls phi4:14b via generate_ollama(), then runs the draft
    through ComplianceFilter and saves it to /tmp for manual review.

    Returns:
        dict: on success, {"success": True, "word_count", "elapsed_seconds",
        "compliant", "banned_found", "names_found", "dates_found", "content"};
        on any failure, {"success": False, "error": str}.
    """
    print("=" * 60)
    print("TEST: Real Incident โ phi4:14b Draft Generation")
    print("Incident: Cloudflare Error 1033")
    print("=" * 60)
    print()
    # Build prompt with REAL data. style_reference is deliberately None:
    # the style block is spliced in manually below so this test controls
    # exactly what the model sees.
    prompt = build_struggle_first_prompt(REAL_INCIDENT, style_reference=None)
    # Inject style example at the prompt's existing marker.
    prompt = prompt.replace(
        "=== STYLE EXAMPLE ===",
        f"=== STYLE EXAMPLE ===\n{STYLE_EXAMPLE}"
    )
    print("PROMPT PREVIEW (first 500 chars):")
    print("-" * 60)
    print(prompt[:500])
    print("...")
    print()
    # Call phi4:14b for draft.
    print("Calling phi4:14b on Gaming PC...")
    print("-" * 60)
    # get_running_loop() is the documented API inside a coroutine;
    # asyncio.get_event_loop() is discouraged/deprecated in this context.
    start_time = asyncio.get_running_loop().time()
    try:
        draft_text = await generate_ollama(
            model="phi4:14b",
            prompt=prompt,
            temperature=0.7,
            max_tokens=3000,
            timeout=180.0
        )
        elapsed = asyncio.get_running_loop().time() - start_time
        word_count = len(draft_text.split())
        print(f"โ Generation complete in {elapsed:.1f}s")
        print(f"โ Output: {word_count} words")
        print()
        # Run compliance filter over the generated draft.
        print("Running compliance filter...")
        print("-" * 60)
        compliance = ComplianceFilter().process(draft_text)
        print(f" Banned words found: {compliance.banned_found or 'None'}")
        print(f" Real names found: {compliance.names_found or 'None'}")
        print(f" Hallucinated dates: {compliance.dates_found or 'None'}")
        print(f" Compliant: {'โ YES' if compliance.is_compliant else 'โ NO'}")
        print()
        # Print the full output for eyeball review.
        print("=" * 60)
        print("GENERATED CONTENT:")
        print("=" * 60)
        print()
        print(draft_text)
        print()
        print("=" * 60)
        print("END OF OUTPUT")
        print("=" * 60)
        # Save to file for review. Explicit encoding so non-ASCII model
        # output cannot crash the write on a non-UTF-8 default locale.
        output_path = "/tmp/real_incident_test_output.md"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(draft_text)
        print(f"\nSaved to: {output_path}")
        return {
            "success": True,
            "word_count": word_count,
            "elapsed_seconds": elapsed,
            "compliant": compliance.is_compliant,
            "banned_found": compliance.banned_found,
            "names_found": compliance.names_found,
            "dates_found": compliance.dates_found,
            # Preview only — full text was printed and saved above.
            "content": draft_text[:500] + "..." if len(draft_text) > 500 else draft_text
        }
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic harness and any
        # failure (network, model, filter) should be reported, not raised.
        print(f"โ FAILED: {e}")
        return {"success": False, "error": str(e)}
# ===============================================================
# COMPARISON: Run same pipeline with HALLUCINATED brief
# ===============================================================
# Counterfactual brief: the hallucinated Stage-1 output that previously
# produced the bad post ("The Day Everything Went Downhill"). Uses a subset
# of REAL_INCIDENT's fields; consumed by test_hallucinated_pipeline() for a
# side-by-side comparison against the real incident data.
HALLUCINATED_INCIDENT = {
    "struggle_angle": "The day everything went downhill at home",
    "origin_story": "My spouse and I were gearing up for the day, ready to tackle our usual tasks, when we hit an unexpected snag: notes.hoffdesk.com returned Error 1033.",
    "attempts": [
        {
            "attempt": "Adjusting the Cloudflare Worker configuration files on our Gaming PC",
            "why_failed": "Hours went by as I meticulously reviewed and tweaked settings, but nothing changed"
        },
        {
            "attempt": "Isolating Tailscale and Radicale separately",
            "why_failed": "This exercise only reinforced my suspicion: Cloudflare Workers were at the heart of the issue"
        }
    ],
    "the_moment": "Reviewing logs late into the evening, I noticed something crucial: the error coincided with memory issues in our Local LLMs due to an outdated model",
    "the_fix": "Updating the Local LLM models finally resolved the issue",
    "reflection": "I should have prioritized model updates and regular performance checks for our Local LLMs",
    "target_length": 1200
}
async def test_hallucinated_pipeline():
    """Test with the same hallucinated data that produced the bad output.

    Mirrors test_real_incident_pipeline() but feeds HALLUCINATED_INCIDENT
    into the draft prompt, for a side-by-side authenticity comparison.
    Returns a summary dict; {"success": False, "error": ...} on failure.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60
    print("\n" + heavy_rule)
    print("TEST: Hallucinated Brief โ phi4:14b Draft Generation")
    print("(Same data that produced 'The Day Everything Went Downhill')")
    print(heavy_rule)
    print()
    prompt = build_struggle_first_prompt(HALLUCINATED_INCIDENT, style_reference=None)
    print("Calling phi4:14b...")
    print(light_rule)
    try:
        text = await generate_ollama(
            model="phi4:14b",
            prompt=prompt,
            temperature=0.7,
            max_tokens=3000,
            timeout=180.0,
        )
        n_words = len(text.split())
        print(f"โ Output: {n_words} words")
        print()
        print("FIRST 500 CHARS:")
        print(text[:500])
        print()
        # Compliance check on the hallucinated draft.
        verdict = ComplianceFilter().process(text)
        print(f"Compliant: {'โ YES' if verdict.is_compliant else 'โ NO'}")
        return {
            "success": True,
            "word_count": n_words,
            "compliant": verdict.is_compliant,
            "content_preview": text[:300],
        }
    except Exception as exc:
        # Report failures instead of raising — this is a diagnostic harness.
        print(f"โ FAILED: {exc}")
        return {"success": False, "error": str(exc)}
# ===============================================================
# MAIN
# ===============================================================
async def main():
    """Run both tests and compare."""
    heavy_rule = "=" * 60
    print("\n" + "๐งช " * 20)
    print("REAL INCIDENT vs HALLUCINATED PIPELINE TEST")
    print("๐งช " * 20)
    print()
    # Test 1: real incident data.
    real_result = await test_real_incident_pipeline()
    # Test 2: hallucinated brief — left disabled (skip if time constrained).
    # hallucinated_result = await test_hallucinated_pipeline()
    print("\n" + heavy_rule)
    print("SUMMARY")
    print(heavy_rule)
    print()
    # Guard clause: report failure and stop early.
    if not real_result.get("success"):
        print("โ Test failed:", real_result.get("error"))
        return
    print("โ Real incident pipeline: SUCCESS")
    print(f" Words: {real_result['word_count']}")
    print(f" Time: {real_result['elapsed_seconds']:.1f}s")
    print(f" Compliant: {real_result['compliant']}")
    print()
    print("HYPOTHESIS:")
    print(" If real data produces authentic, grounded output,")
    print(" then the issue is NOT phi4:14b quality.")
    print(" The issue is hallucinated Stage 1 input.")
    print()
    print("RECOMMENDATION:")
    print(" 1. Replace Stage 1 (Strategy) with incident logging/capture")
    print(" 2. Feed real incidents directly to Stage 3 (Draft)")
    print(" 3. Keep phi4:14b โ it's not the bottleneck")
# Script entry point. The original guard compared the bare names
# `name`/"main" (a NameError at runtime); the correct idiom is the
# dunder __name__ == "__main__" check so the tests run only when the
# file is executed directly, not on import.
if __name__ == "__main__":
    asyncio.run(main())