"""
Content Compliance Filter — Post-processing pipeline for blog generation.
Replaces banned words, flags real names, catches hallucinated timestamps.
Lightweight. No prompt engineering. Just enforcement.
"""

import re
from dataclasses import dataclass
from typing import List, Dict, Set, Optional

# Banned corporate-speak with suggested replacements.
# Keys are lowercase; ComplianceFilter.replace_banned_words matches them
# case-insensitively on word boundaries and re-applies the matched text's
# capitalization (ALL CAPS or leading capital) to the replacement.
# Only the exact forms listed here are matched, which is why inflections such
# as "delving"/"leveraging" have their own entries.
BANNED_WORDS = {
    "delve": "explore",
    "delving": "exploring",
    "tapestry": "mix",
    "moreover": "also",
    "leveraging": "using",
    "leverage": "use",
    "holistic": "complete",
    "paradigm": "approach",
    "synergy": "collaboration",
    "unlock": "enable",
    "potential": "capability",
    "seamless": "smooth",
    "robust": "reliable",
    "streamline": "simplify",
    "utilize": "use",
    "embark": "start",
    "journey": "process",
    "transformative": "significant",
    "cutting-edge": "advanced",
    "innovative": "new",
    "groundbreaking": "important",
    "navigate": "handle",
    "landscape": "environment",
    "ecosystem": "system",
    "empower": "enable",
    "facilitate": "help",
    "optimize": "improve",
    "enhance": "improve",
    "foster": "encourage",
    "revolutionary": "significant",
    "disruptive": "unusual",
    "synergistic": "cooperative",
    "mission-critical": "essential",
    "best-in-class": "excellent",
    "world-class": "excellent",
    "next-generation": "new",
    "game-changing": "important"
}

# Real names that must NEVER appear in published content.
# Stored lowercase; ComplianceFilter matches them case-insensitively as whole
# words, and any hit makes the text non-compliant in both strict and lenient mode.
FORBIDDEN_NAMES = {
    "aundrea", "sullivan", "harper", "maggie", "hoffmann"
}

# Date patterns to flag as potentially hallucinated.
# ComplianceFilter compiles every pattern with re.IGNORECASE, so the lowercase
# day/month names below also match capitalized text.
# The patterns overlap on purpose-agnostic grounds: a full date such as
# "April 23, 2026" is reported by the month+day pattern, the full-date pattern
# and the standalone-year pattern, i.e. as three separate hits.
DATE_PATTERNS = [
    # Specific day of week + time of day
    r"\b(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\s+(?:morning|afternoon|evening|night)\b",
    # Month + day (with optional ordinal).
    # NOTE: "may" followed by a 1-2 digit number in ordinary prose also matches.
    r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}(?:st|nd|rd|th)?\b",
    # Full date with year (April 23, 2026)
    r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},?\s+\d{4}\b",
    # Time with AM/PM
    r"\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)\b",
    # Abbreviated month + day + year (the year is required for this form)
    r"\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b",
    # Year standing alone (2026); matches any standalone number 2000-2099
    r"\b20\d{2}\b",
]

# --- Sovereign Stack Context ---
# Prompt fragment listing the tools the project actually runs, intended to be
# injected into drafting prompts to prevent cloud-tool hallucinations.
# NOTE(review): nothing in this module reads this constant; presumably the
# drafting stage imports it — confirm against the caller.
SOVEREIGN_STACK_CONTEXT = """
Our infrastructure uses these specific tools. Do NOT mention Google, AWS, Azure, or cloud services unless explicitly part of the story.

LOCAL STACK:
- Radicale (self-hosted CalDAV server, port 5232)
- Tailscale (mesh VPN, 100.x.x.x addresses)
- Beelink mini-PC (titanium-butler, Ubuntu 24.04)
- Gaming PC (RTX 3080 Ti, local LLM inference via Ollama)
- Cloudflare (DNS + Email Workers only — not compute)
- OpenClaw (agent orchestration framework)

If you reference a tool, use the real name above. Never hallucinate services we don't use.
"""


@dataclass
class ComplianceReport:
    """Results of a compliance check, as produced by ComplianceFilter.process."""
    # Input text after banned-word replacement and whitespace cleanup.
    clean_text: str
    # One entry (the banned-word key) per replacement performed, so a word
    # replaced twice appears twice.
    banned_found: List[str]
    # Forbidden names found in the text, lowercased, one entry per occurrence.
    names_found: List[str]
    # Substrings matched by the date/time patterns, grouped by pattern.
    dates_found: List[str]
    # Number of banned-word replacements; equals len(banned_found).
    replacements_made: int
    # False when a forbidden name was found, or (strict mode only) when a
    # date/time was flagged.
    is_compliant: bool
    # True when any banned word was replaced or any date/time was flagged.
    has_warnings: bool


class ComplianceFilter:
    """
    Lightweight post-processing filter for blog content.

    Replaces banned words, detects forbidden real names and flags date/time
    references that may have been hallucinated.

    Forbidden names make a text non-compliant in every mode.
    STRICT MODE (default): flagged dates/times are blocking as well.
    LENIENT MODE: flagged dates/times are reported as warnings only.
    """

    def __init__(
        self,
        banned_words: Optional[Dict[str, str]] = None,
        forbidden_names: Optional[Set[str]] = None,
        strict_mode: bool = True
    ):
        """
        Build a filter and pre-compile its name and date patterns.

        Args:
            banned_words: Mapping of banned word to replacement. ``None``
                selects BANNED_WORDS; an empty dict disables replacement.
            forbidden_names: Names that must never appear. ``None`` selects
                FORBIDDEN_NAMES; an empty set disables the name check.
            strict_mode: When True, flagged dates/times also fail compliance.
        """
        # Compare against None (not truthiness) so an explicitly empty
        # collection is honoured instead of silently falling back to defaults.
        self.banned = BANNED_WORDS if banned_words is None else banned_words
        self.forbidden = FORBIDDEN_NAMES if forbidden_names is None else forbidden_names
        self.strict = strict_mode
        self._compile_patterns()

    @staticmethod
    def _build_name_pattern(names):
        """Return a case-insensitive whole-word regex matching any of *names*."""
        usable = [n for n in names if n]
        if not usable:
            # An empty alternation would match the empty string at every word
            # boundary; "(?!)" is a pattern that can never match.
            return re.compile(r'(?!)')
        # Longest first (then alphabetical) so a longer name wins over a name
        # that is its prefix, and the pattern does not depend on set ordering.
        ordered = sorted(usable, key=lambda n: (-len(n), n))
        return re.compile(
            r'\b(' + '|'.join(re.escape(n) for n in ordered) + r')\b',
            re.IGNORECASE
        )

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse redundant whitespace while keeping line and paragraph breaks."""
        # Strip trailing whitespace per line; this also empties whitespace-only
        # lines so they count as blank lines below (and turns CRLF into LF).
        text = '\n'.join(line.rstrip() for line in text.split('\n'))
        # Collapse runs of whitespace *within* a line. Newlines are excluded on
        # purpose: collapsing them would merge paragraphs into one line.
        text = re.sub(r'[^\S\n]{2,}', ' ', text)
        # Allow at most one blank line between paragraphs.
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

    def _compile_patterns(self):
        """Pre-compile regex patterns for performance."""
        # Name detection
        self.name_pattern = self._build_name_pattern(self.forbidden)
        # Date/time patterns
        self.date_patterns = [re.compile(p, re.IGNORECASE) for p in DATE_PATTERNS]

    def replace_banned_words(self, text: str) -> tuple:
        """
        Replace banned words with simpler alternatives.

        Matching is case-insensitive and restricted to whole words. The
        replacement mirrors the matched text's casing: ALL CAPS stays all caps,
        a leading capital is kept, anything else uses the replacement as given.

        Returns (cleaned_text, list_of_found_words); the list holds the banned
        key once per replacement performed.
        """
        found = []
        cleaned = text

        # Sort by length (longest first) to avoid partial replacements
        for word, replacement in sorted(self.banned.items(), key=lambda x: -len(x[0])):
            if not word:
                # An empty key would match the empty string everywhere.
                continue
            pattern = re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)

            # Bind the loop variables as defaults so the callback never
            # depends on late-binding closure semantics.
            def replace_match(match, word=word, replacement=replacement):
                found.append(word)
                # Preserve capitalization
                original = match.group(0)
                if original.isupper():
                    return replacement.upper()
                if original[0].isupper():
                    return replacement.capitalize()
                return replacement

            cleaned = pattern.sub(replace_match, cleaned)

        return cleaned, found

    def check_names(self, text: str) -> List[str]:
        """Return every forbidden name found in *text*, lowercased, one per hit."""
        return [m.lower() for m in self.name_pattern.findall(text)]

    def check_dates(self, text: str) -> List[str]:
        """
        Check for potentially hallucinated timestamps.

        Returns the matched substrings grouped by pattern (not by position in
        the text). The patterns overlap, so a single date may be reported more
        than once, e.g. as month+day, as a full date and as a bare year.
        """
        found = []
        for pattern in self.date_patterns:
            found.extend(pattern.findall(text))
        return found

    def process(self, text: str) -> "ComplianceReport":
        """
        Full compliance check and cleanup.

        STRICT MODE:
        - Names are BLOCKING (not just warnings)
        - Dates are BLOCKING (not just warnings)
        - Banned words are auto-replaced (warnings only)

        LENIENT MODE:
        - Names are BLOCKING
        - Dates and banned words are warnings only

        Pipeline:
        1. Replace banned words with alternatives
        2. Check for real names
        3. Flag date/time references
        4. Clean up whitespace (paragraph breaks are preserved)
        """
        # Step 1: Replace banned words
        cleaned, banned_found = self.replace_banned_words(text)

        # Steps 2 and 3 run on the text after replacement.
        names_found = self.check_names(cleaned)
        dates_found = self.check_dates(cleaned)

        # Step 4: Clean up whitespace
        cleaned = self._normalize_whitespace(cleaned)

        if self.strict:
            # STRICT MODE: Names and dates are BLOCKING
            is_compliant = not names_found and not dates_found
        else:
            # LENIENT MODE: Only names are blocking, dates are warnings
            is_compliant = not names_found

        has_warnings = bool(banned_found) or bool(dates_found)

        return ComplianceReport(
            clean_text=cleaned,
            banned_found=banned_found,
            names_found=names_found,
            dates_found=dates_found,
            replacements_made=len(banned_found),
            is_compliant=is_compliant,
            has_warnings=has_warnings
        )

    def quick_check(self, text: str) -> bool:
        """Return True if *text* is compliant (runs the full process pipeline)."""
        return self.process(text).is_compliant


# Convenience functions for pipeline integration
def filter_content(text: str, strict: bool = True) -> ComplianceReport:
    """Run a throwaway ComplianceFilter over *text* and return its report.

    The filter uses the module-default banned words and forbidden names;
    *strict* is forwarded as the filter's ``strict_mode``.
    """
    one_shot = ComplianceFilter(strict_mode=strict)
    result = one_shot.process(text)
    return result


def is_compliant(text: str, strict: bool = True) -> bool:
    """Tell whether *text* passes the compliance filter.

    Builds a ComplianceFilter with the module defaults (``strict_mode`` taken
    from *strict*) and returns the verdict of its ``quick_check``.
    """
    checker = ComplianceFilter(strict_mode=strict)
    verdict = checker.quick_check(text)
    return verdict


# Example/test: manual smoke run, executed only when the module is run as a script.
if __name__ == "__main__":
    # Sample deliberately contains banned words (journey, embark, leverage, ...),
    # a forbidden name (Aundrea) and date/time references (Tuesday morning,
    # March 15th, 7:45 PM), so every check has something to report.
    test_text = """
    # My Journey to Self-Hosting
    
    Last Tuesday morning, I decided to embark on a transformative project.
    We needed to leverage our holistic infrastructure paradigm.
    
    My wife Aundrea suggested we streamline our calendar setup.
    This was on March 15th at 7:45 PM.
    
    The solution was robust and seamless.
    """
    
    # Test strict mode: names and dates are both blocking
    print("STRICT MODE:")
    report = filter_content(test_text, strict=True)
    
    print(f"Compliant: {report.is_compliant}")
    print(f"Has warnings: {report.has_warnings}")
    print(f"Banned words replaced: {report.banned_found}")
    print(f"Real names: {report.names_found}")
    print(f"Dates/times: {report.dates_found}")
    print(f"Replacements: {report.replacements_made}")
    print()
    
    # Test lenient mode: only names are blocking, dates are still listed
    print("LENIENT MODE:")
    report_lenient = filter_content(test_text, strict=False)
    print(f"Compliant: {report_lenient.is_compliant} (names only)")
    print(f"Dates flagged (warning only): {report_lenient.dates_found}")
    print()
    
    # Cleaned text comes from the strict-mode report
    print("CLEANED TEXT:")
    print(report.clean_text)