"""
Anti-Hallucination Date Filter — Post-processing for generated content.

Strips specific dates, times, and days of the week from generated text.
Each pattern in DATE_PATTERNS carries a replacement string; all of them are
currently empty, so matches are removed rather than reworded.

Usage:
    from anti_hallucination import strip_dates
    clean_text = strip_dates(generated_text)
"""

import re
from typing import List, Optional, Tuple

# Shared regex fragments used to build DATE_PATTERNS.
_MONTH = (
    r'(?:January|February|March|April|May|June|July|August|September'
    r'|October|November|December)'
)
_WEEKDAY = r'(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'
_DAYPART = r'(?:\s+(?:morning|afternoon|evening|night))?'

# Commas and horizontal whitespace adjacent to a removed date.  Newlines are
# deliberately excluded so paragraph breaks survive the removal.
_PAD = r'(?:[^\S\n]|,)*'

# (pattern, replacement) pairs, applied in order with re.IGNORECASE.
# Order matters: longer / more specific patterns must run before the shorter
# ones they contain (full date before month+day, "last Tuesday" before
# "Tuesday"), otherwise the shorter pattern eats part of the match and leaves
# an orphaned fragment behind.
DATE_PATTERNS: List[Tuple[str, str]] = [
    # Full dates: April 23, 2026
    (r'\b' + _MONTH + r'\s+\d{1,2},?\s+\d{4}\b', ''),
    # Month + day (no year): April 23
    (r'\b' + _MONTH + r'\s+\d{1,2}(?:st|nd|rd|th)?\b', ''),
    # Relative weekday: last Wednesday, this Friday evening, next Monday
    (r'\b(?:last|this|next)\s+' + _WEEKDAY + _DAYPART + r'\b', ''),
    # Day of week: Tuesday, Wednesday afternoon
    (r'\b' + _WEEKDAY + _DAYPART + r'\b', ''),
    # Time: 3:45 PM, 14:30.  Whitespace is only consumed together with an
    # AM/PM marker so a bare "14:30" does not swallow a following newline.
    (r'\b\d{1,2}:\d{2}(?:\s*(?:AM|PM|am|pm))?\b', ''),
    # Year alone: 2026
    (r'\b20\d{2}\b', ''),
    # Relative days
    (r'\byesterday\b', ''),
    (r'\btomorrow\b', ''),
]

# Vague phrasings available as substitutes for specific dates/times.
# Not currently consulted by the functions in this module.
REPLACEMENTS = {
    'opening_timestamp': 'It was late.',
    'around_time': 'around that time',
    'vague_date': 'that day',
    'vague_period': 'a while',
}

# A run of one or more known real names ("Aundrea", "Aundrea Sullivan"),
# matched as a unit so a full name yields a single replacement.
_REAL_NAME = r'(?:Aundrea|Sullivan|Harper|Maggie|Hoffmann)'
_REAL_NAME_RUN = r'\b' + _REAL_NAME + r'(?:[^\S\n]+' + _REAL_NAME + r')*\b'


def _tidy_whitespace(text: str) -> str:
    """Repair spacing/punctuation artifacts left behind by removals.

    Only horizontal whitespace is collapsed; newlines are kept so paragraph
    structure is preserved (runs of 3+ newlines collapse to a blank line).
    """
    text = re.sub(r'[^\S\n]+', ' ', text)         # Collapse spaces/tabs/CR
    text = re.sub(r' ?\n ?', '\n', text)          # Trim spaces around newlines
    text = re.sub(r'\n{3,}', '\n\n', text)        # At most one blank line
    text = re.sub(r' +([.,;:!?)])', r'\1', text)  # No space before punctuation
    text = re.sub(r'\( +', '(', text)             # No space after opening paren
    text = re.sub(r'^[,\s]+', '', text)           # Leading commas/whitespace
    return text.strip()


def strip_dates(text: str, aggressive: bool = True) -> str:
    """
    Remove all specific dates/times from generated text.

    Args:
        text: Generated content.
        aggressive: If True, each match is removed together with adjacent
            commas and horizontal whitespace. If False, each match is
            substituted with its replacement string from DATE_PATTERNS
            (all currently empty) and surrounding punctuation is left for
            the whitespace cleanup pass.

    Returns:
        Cleaned text with the matched dates/times removed and spacing
        normalized. Line breaks are preserved.
    """
    cleaned = text
    for pattern, replacement in DATE_PATTERNS:
        if aggressive:
            # Wrap in a group so the padding binds to the whole pattern even
            # if a pattern ever contains a top-level alternation.
            cleaned = re.sub(
                _PAD + '(?:' + pattern + ')' + _PAD,
                ' ',
                cleaned,
                flags=re.IGNORECASE,
            )
        else:
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    return _tidy_whitespace(cleaned)


def extract_dates(text: str) -> List[str]:
    """Extract all dates found in text (for logging/debugging).

    Patterns are applied in DATE_PATTERNS order and every matched span is
    masked before the next pattern runs, so a single date such as
    "April 23, 2026" is reported once rather than again as "April 23" and
    "2026". Results are grouped by pattern, not by position in the text.
    """
    found: List[str] = []
    remaining = text
    for pattern, _ in DATE_PATTERNS:
        regex = re.compile(pattern, re.IGNORECASE)
        found.extend(match.group(0) for match in regex.finditer(remaining))
        # NUL is neither a word character nor whitespace, so masked spans
        # cannot combine with neighbouring text into a new match.
        remaining = regex.sub('\x00', remaining)
    return found


def clean_generated_content(text: str, incident_date: Optional[str] = None,
                            strip_names: bool = True) -> str:
    """
    Full cleanup pipeline for generated content.

    Args:
        text: Raw generated text.
        incident_date: Real incident date, if known. Currently unused; kept
            so existing callers that pass it keep working.
        strip_names: If True, replace each known real name (or run of
            adjacent names, e.g. first + last) with "my spouse".

    Returns:
        Clean text ready for publishing.
    """
    # Step 1: Strip all dates
    cleaned = strip_dates(text, aggressive=True)

    # Step 2: Strip real names if requested
    if strip_names:
        cleaned = re.sub(_REAL_NAME_RUN, 'my spouse', cleaned,
                         flags=re.IGNORECASE)

    # Step 3: Ensure no orphaned punctuation (horizontal whitespace only, so
    # line breaks preserved by strip_dates stay intact)
    cleaned = re.sub(r'[^\S\n]+([.,;:!?)])', r'\1', cleaned)
    cleaned = re.sub(r'\([^\S\n]+', '(', cleaned)

    return cleaned


# Test
if __name__ == "__main__":
    test = """
    It was April 23, 2026, around 3:45 PM on a Tuesday afternoon.
    Last Wednesday, I decided to fix the server.
    The year 2026 has been rough.
    """

    print("ORIGINAL:")
    print(test)
    print()
    print("CLEANED:")
    print(strip_dates(test))
    print()
    print("EXTRACTED DATES:")
    print(extract_dates(test))