"""
Anti-Hallucination Date Filter — Post-processing for generated content.
Strips ALL specific dates, times, days of week from generated text.
Replaces with vague references or removes entirely.
Usage:
    from anti_hallucination import strip_dates
    clean_text = strip_dates(generated_text)
"""
import re
from typing import List, Tuple
# Patterns to match and strip
# Ordered (regex, replacement) pairs. Consumers apply the rules top to bottom,
# so a more specific rule must sit ABOVE any broader rule that would otherwise
# consume part of its text first. All patterns are written to be used with
# re.IGNORECASE. Every replacement is currently the empty string.
DATE_PATTERNS: List[Tuple[str, str]] = [
    # Full dates: April 23, 2026 / April 23rd, 2026
    (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b', ''),
    # Month + day (no year): April 23
    (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?\b', ''),
    # Relative weekdays: last Wednesday, this Friday evening.
    # These must come before the bare weekday rule; otherwise the weekday is
    # removed first and the leading "last"/"this" is left stranded.
    (r'\blast\s+(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+(?:morning|afternoon|evening|night))?\b', ''),
    (r'\bthis\s+(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+(?:morning|afternoon|evening|night))?\b', ''),
    # Day of week: Tuesday, Wednesday afternoon
    (r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+(?:morning|afternoon|evening|night))?\b', ''),
    # Time: 3:45 PM, 14:30 (whitespace is only consumed when AM/PM follows)
    (r'\b\d{1,2}:\d{2}(?:\s*(?:AM|PM|am|pm))?\b', ''),
    # Year alone: 2026
    (r'\b20\d{2}\b', ''),
    # Relative days
    (r'\byesterday\b', ''),
    (r'\btomorrow\b', ''),
]
# Replacement strategies
# Canned vague phrases, keyed by the kind of reference each one stands in for.
# NOTE(review): none of the functions visible in this part of the file read
# this mapping; confirm whether another module consumes it.
REPLACEMENTS = dict(
    opening_timestamp='It was late.',
    around_time='around that time',
    vague_date='that day',
    vague_period='a while',
)
def strip_dates(text: str, aggressive: bool = True) -> str:
    """
    Remove date and time references from generated text.

    Every regex in DATE_PATTERNS is applied in list order, case-insensitively.
    The result is then tidied so removals do not leave doubled spaces or
    punctuation stranded after a gap. Line breaks are preserved; runs of three
    or more newlines are reduced to a single blank line.

    Args:
        text: Generated content.
        aggressive: If True, each match is removed together with the commas,
            spaces and tabs touching it, leaving a single space. If False,
            each match is swapped for the replacement string paired with its
            pattern in DATE_PATTERNS and neighbouring characters are kept.

    Returns:
        The cleaned text, with leading and trailing whitespace stripped.
    """
    cleaned = text
    for pattern, replacement in DATE_PATTERNS:
        if aggressive:
            # The pattern is wrapped in a group so the padding classes bind to
            # the whole expression even if a pattern has a top-level '|'.
            # The padding deliberately excludes '\n' so that removing a date
            # never glues two lines or paragraphs together.
            cleaned = re.sub(
                r'[, \t]*(?:' + pattern + r')[, \t]*',
                ' ',
                cleaned,
                flags=re.IGNORECASE,
            )
        else:
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    # Clean up artifacts from date removal
    cleaned = re.sub(r'[^\S\n]+', ' ', cleaned)  # Collapse horizontal whitespace only
    cleaned = re.sub(r' ?\n ?', '\n', cleaned)  # No spaces hugging a line break
    cleaned = re.sub(r'\n\n\n+', '\n\n', cleaned)  # At most one blank line in a row
    # Pull closing punctuation back onto the preceding word; this may cross a
    # line break, because a line that starts with such punctuation is a
    # leftover of a removed date.
    cleaned = re.sub(r'\s+([.,;:!?)])', r'\1', cleaned)
    cleaned = re.sub(r'([(])\s+', r'\1', cleaned)  # Fix opening paren
    cleaned = re.sub(r'^[,\s]+', '', cleaned)  # Remove leading punctuation
    return cleaned.strip()
def extract_dates(text: str) -> List[str]:
    """
    Collect every substring of ``text`` matched by a DATE_PATTERNS regex.

    Each pattern is searched independently against the unmodified text, so
    hits are ordered by pattern (list order), not by position in the text,
    and the same stretch of text can be reported by more than one pattern.
    Intended for logging/debugging.
    """
    return [
        hit
        for regex, _replacement in DATE_PATTERNS
        for hit in re.findall(regex, text, re.IGNORECASE)
    ]
def clean_generated_content(text: str, incident_date: str = None, strip_names: bool = True) -> str:
    """
    Full cleanup pipeline for generated content.

    Args:
        text: Raw generated text.
        incident_date: Accepted for interface compatibility; this function
            does not read it.
        strip_names: If True, replace the hard-coded real names
            (Aundrea, Sullivan, Harper, Maggie, Hoffmann; case-insensitive)
            with 'my spouse'.

    Returns:
        The text after date stripping, optional name replacement and a final
        punctuation-spacing pass.
    """
    # Step 1: Strip all dates
    cleaned = strip_dates(text, aggressive=True)

    # Step 2: Strip real names if requested
    if strip_names:
        name = r'(?:Aundrea|Sullivan|Harper|Maggie|Hoffmann)'
        # A run of listed names joined by spaces/tabs or a hyphen (for example
        # a first name followed by a surname) is replaced ONCE, so a full name
        # does not turn into "my spouse my spouse".
        cleaned = re.sub(
            r'\b' + name + r'(?:(?:[ \t]+|-)' + name + r')*\b',
            'my spouse',
            cleaned,
            flags=re.IGNORECASE,
        )

    # Step 3: Ensure no orphaned punctuation
    cleaned = re.sub(r'\s+([.,;:!?)])', r'\1', cleaned)
    cleaned = re.sub(r'([(])\s+', r'\1', cleaned)
    return cleaned
# Test
# Manual demo: runs only when the file is executed as a script, never on import.
if __name__ == "__main__":
    sample = """
It was April 23, 2026, around 3:45 PM on a Tuesday afternoon.
Last Wednesday, I decided to fix the server.
The year 2026 has been rough.
"""
    print("ORIGINAL:")
    print(sample)
    print()
    print("CLEANED:")
    print(strip_dates(sample))
    print()
    print("EXTRACTED DATES:")
    print(extract_dates(sample))