"""
Content Compliance Filter — Post-processing pipeline for blog generation.
Replaces banned words, flags real names, catches hallucinated timestamps.

Lightweight. No prompt engineering. Just enforcement.
"""

import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Set, Tuple

# Banned corporate-speak with suggested replacements
BANNED_WORDS = {
    "delve": "explore",
    "delving": "exploring",
    "tapestry": "mix",
    "moreover": "also",
    "leveraging": "using",
    "leverage": "use",
    "holistic": "complete",
    "paradigm": "approach",
    "synergy": "collaboration",
    "unlock": "enable",
    "potential": "capability",
    "seamless": "smooth",
    "robust": "reliable",
    "streamline": "simplify",
    "utilize": "use",
    "embark": "start",
    "journey": "process",
    "transformative": "significant",
    "cutting-edge": "advanced",
    "innovative": "new",
    "groundbreaking": "important",
    "navigate": "handle",
    "landscape": "environment",
    "ecosystem": "system",
    "empower": "enable",
    "facilitate": "help",
    "optimize": "improve",
    "enhance": "improve",
    "foster": "encourage",
    "revolutionary": "significant",
    "disruptive": "unusual",
    "synergistic": "cooperative",
    "mission-critical": "essential",
    "best-in-class": "excellent",
    "world-class": "excellent",
    "next-generation": "new",
    "game-changing": "important"
}

# Real names that must NEVER appear
FORBIDDEN_NAMES = {
    "aundrea",
    "sullivan",
    "harper",
    "maggie",
    "hoffmann"
}

# Date patterns to flag as potentially hallucinated.
# All groups are non-capturing so that findall() returns the full matched text.
DATE_PATTERNS = [
    # Specific day of week + time of day
    r"\b(?:monday|tuesday|wednesday|thursday|friday|saturday|sunday)\s+(?:morning|afternoon|evening|night)\b",
    # Month + day (with optional ordinal)
    r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2}(?:st|nd|rd|th)?\b",
    # Full date with year (April 23, 2026)
    r"\b(?:january|february|march|april|may|june|july|august|september|october|november|december)\s+\d{1,2},?\s+\d{4}\b",
    # Time with AM/PM
    r"\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)\b",
    # Abbreviated month + day + year
    r"\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b",
    # Year standing alone (2026)
    r"\b20\d{2}\b",
]

# --- Sovereign Stack Context ---
# Injected into drafting prompts to prevent cloud-tool hallucinations
SOVEREIGN_STACK_CONTEXT = """
Our infrastructure uses these specific tools. Do NOT mention Google, AWS, Azure, or cloud services unless explicitly part of the story.

LOCAL STACK:
- Radicale (self-hosted CalDAV server, port 5232)
- Tailscale (mesh VPN, 100.x.x.x addresses)
- Beelink mini-PC (titanium-butler, Ubuntu 24.04)
- Gaming PC (RTX 3080 Ti, local LLM inference via Ollama)
- Cloudflare (DNS + Email Workers only — not compute)
- OpenClaw (agent orchestration framework)

If you reference a tool, use the real name above. Never hallucinate services we don't use.
"""

# Whitespace normalisation patterns used by ComplianceFilter.process().
# "Horizontal" whitespace is any whitespace character except CR/LF, so that
# paragraph breaks survive the cleanup.
_TRAILING_SPACE_RE = re.compile(r'[^\S\r\n]+(?=\r?\n)')
_HORIZONTAL_RUN_RE = re.compile(r'[^\S\r\n]{2,}')
_BLANK_LINE_RUN_RE = re.compile(r'\n{3,}')

# Pattern that can never match; used when there are no forbidden names.
_NEVER_MATCH_RE = re.compile(r'(?!)')


def _match_case(original: str, replacement: str) -> str:
    """Return *replacement* cased like *original* (ALL CAPS, Capitalised, or as-is)."""
    if original.isupper():
        return replacement.upper()
    if original[:1].isupper():
        # Upper-case only the first character; str.capitalize() would also
        # lower-case the rest and mangle replacements such as "use the API".
        return replacement[:1].upper() + replacement[1:]
    return replacement


@dataclass
class ComplianceReport:
    """Results of compliance check."""
    clean_text: str          # text after banned-word replacement and whitespace cleanup
    banned_found: List[str]  # banned word (dictionary key) for every replacement made
    names_found: List[str]   # forbidden names found, lower-cased
    dates_found: List[str]   # date/time snippets matched by DATE_PATTERNS
    replacements_made: int   # len(banned_found)
    is_compliant: bool       # False when a blocking issue was found
    has_warnings: bool       # True when banned words or dates were found


class ComplianceFilter:
    """
    Lightweight post-processing filter for blog content.

    Forbidden names always make a text non-compliant. In strict mode
    (the default) date/time matches are blocking as well; in lenient
    mode they are only reported as warnings.
    """

    def __init__(
        self,
        banned_words: Optional[Dict[str, str]] = None,
        forbidden_names: Optional[Set[str]] = None,
        strict_mode: bool = True
    ):
        """
        Args:
            banned_words: Mapping of banned word -> replacement. ``None``
                selects BANNED_WORDS; an empty mapping disables replacement.
            forbidden_names: Names that must not appear. ``None`` selects
                FORBIDDEN_NAMES; an empty set disables the name check.
            strict_mode: When True, date/time matches are blocking too.
        """
        # Test against None (not truthiness) so an explicitly empty
        # collection is honoured instead of silently replaced by the default.
        self.banned = BANNED_WORDS if banned_words is None else banned_words
        self.forbidden = FORBIDDEN_NAMES if forbidden_names is None else forbidden_names
        self.strict = strict_mode
        self._compile_patterns()

    def _compile_patterns(self):
        """
        Pre-compile regex patterns for performance.

        Call again after mutating ``self.banned`` or ``self.forbidden``;
        the compiled patterns are snapshots of those collections.
        """
        # Name detection. Empty entries are skipped: an empty alternative
        # would match the empty string at every word boundary.
        names = sorted((n for n in self.forbidden if n), key=len, reverse=True)
        if names:
            self.name_pattern = re.compile(
                r'\b(' + '|'.join(re.escape(n) for n in names) + r')\b',
                re.IGNORECASE
            )
        else:
            self.name_pattern = _NEVER_MATCH_RE

        # Date/time patterns
        self.date_patterns = [re.compile(p, re.IGNORECASE) for p in DATE_PATTERNS]

        # Banned words, longest first to avoid partial replacements
        # (stable sort: equal lengths keep the mapping's own order).
        ordered = sorted(self.banned.items(), key=lambda item: -len(item[0]))
        self._banned_patterns = [
            (
                word,
                replacement,
                re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE),
            )
            for word, replacement in ordered
            if word
        ]

    def replace_banned_words(self, text: str) -> Tuple[str, List[str]]:
        """
        Replace banned words with simpler alternatives.

        Matching is case-insensitive and whole-word; the replacement takes
        on the capitalisation of the matched text.

        Returns (cleaned_text, list_of_found_words). The list holds one
        entry (the banned-word key) per replacement, grouped by word in
        longest-first order.
        """
        found: List[str] = []
        cleaned = text

        for word, replacement, pattern in self._banned_patterns:
            # Bind the loop variables as defaults so the callback does not
            # depend on late-binding closure semantics.
            def substitute(match, word=word, replacement=replacement):
                found.append(word)
                return _match_case(match.group(0), replacement)

            cleaned = pattern.sub(substitute, cleaned)

        return cleaned, found

    def check_names(self, text: str) -> List[str]:
        """Check for forbidden real names; returns each hit lower-cased."""
        return [hit.lower() for hit in self.name_pattern.findall(text)]

    def check_dates(self, text: str) -> List[str]:
        """
        Check for potentially hallucinated timestamps.

        Results are grouped by pattern (in DATE_PATTERNS order). Patterns
        overlap, so one date in the text can be reported more than once.
        """
        found: List[str] = []
        for pattern in self.date_patterns:
            found.extend(pattern.findall(text))
        return found

    def process(self, text: str) -> ComplianceReport:
        """
        Full compliance check and cleanup.

        Blocking rules:
        - Names are BLOCKING in both modes
        - Dates are BLOCKING in strict mode, warnings in lenient mode
        - Banned words are auto-replaced (warnings only)

        Pipeline:
        1. Replace banned words with alternatives
        2. Check for real names
        3. Flag fictional timestamps
        4. Clean up whitespace (paragraph breaks are preserved)
        """
        # Step 1: Replace banned words
        cleaned, banned_found = self.replace_banned_words(text)

        # Step 2: Check names (on cleaned text to catch any that slipped through)
        names_found = self.check_names(cleaned)

        # Step 3: Check dates (on cleaned text)
        dates_found = self.check_dates(cleaned)

        # Step 4: Clean up whitespace. Only horizontal whitespace is
        # collapsed to a single space; collapsing r'\s{2,}' would also
        # swallow blank lines and flatten paragraphs into one line.
        cleaned = _TRAILING_SPACE_RE.sub('', cleaned)
        cleaned = _HORIZONTAL_RUN_RE.sub(' ', cleaned)
        cleaned = _BLANK_LINE_RUN_RE.sub('\n\n', cleaned)
        cleaned = cleaned.strip()

        # Names always block; dates block only in strict mode.
        dates_block = bool(self.strict and dates_found)
        is_compliant = not names_found and not dates_block

        has_warnings = bool(banned_found or dates_found)

        return ComplianceReport(
            clean_text=cleaned,
            banned_found=banned_found,
            names_found=names_found,
            dates_found=dates_found,
            replacements_made=len(banned_found),
            is_compliant=is_compliant,
            has_warnings=has_warnings
        )

    def quick_check(self, text: str) -> bool:
        """Fast compliance check — returns True if clean."""
        return self.process(text).is_compliant


# Convenience functions for pipeline integration

def filter_content(text: str, strict: bool = True) -> ComplianceReport:
    """One-shot compliance filter using the default word and name lists."""
    return ComplianceFilter(strict_mode=strict).process(text)


def is_compliant(text: str, strict: bool = True) -> bool:
    """Quick check if text passes all filters (default word and name lists)."""
    return ComplianceFilter(strict_mode=strict).quick_check(text)


# Example/test
if __name__ == "__main__":
    test_text = """
# My Journey to Self-Hosting

Last Tuesday morning, I decided to embark on a transformative project.
We needed to leverage our holistic infrastructure paradigm.
My wife Aundrea suggested we streamline our calendar setup.
This was on March 15th at 7:45 PM.

The solution was robust and seamless.
"""

    # Test strict mode
    print("STRICT MODE:")
    report = filter_content(test_text, strict=True)
    print(f"Compliant: {report.is_compliant}")
    print(f"Has warnings: {report.has_warnings}")
    print(f"Banned words replaced: {report.banned_found}")
    print(f"Real names: {report.names_found}")
    print(f"Dates/times: {report.dates_found}")
    print(f"Replacements: {report.replacements_made}")
    print()

    # Test lenient mode
    print("LENIENT MODE:")
    report_lenient = filter_content(test_text, strict=False)
    print(f"Compliant: {report_lenient.is_compliant} (names only)")
    print(f"Dates flagged (warning only): {report_lenient.dates_found}")
    print()

    print("CLEANED TEXT:")
    print(report.clean_text)