# anti_hallucination.py

"""
Anti-Hallucination Date Filter — Post-processing for generated content.

Strips ALL specific dates, times, days of week from generated text.
Replaces with vague references or removes entirely.

Usage:
    from anti_hallucination import strip_dates
    clean_text = strip_dates(generated_text)
"""

import re
from typing import List, Optional, Tuple

# Patterns to match and strip

# Shared alternations, factored out so each list of names is spelled once.
_MONTHS = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)'
_WEEKDAYS = r'(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)'

# Ordered table of (regex, replacement) pairs. Consumers apply the entries
# top to bottom, so a longer/more specific form must come before any shorter
# form it contains (e.g. the full date before the bare year).
DATE_PATTERNS: List[Tuple[str, str]] = [
    # Full dates: April 23, 2026
    (r'\b' + _MONTHS + r'\s+\d{1,2},?\s+\d{4}\b', ''),

    # Month + day (no year): April 23, April 23rd
    (r'\b' + _MONTHS + r'\s+\d{1,2}(?:st|nd|rd|th)?\b', ''),

    # Day of week, with optional relative prefix and part of day:
    # Tuesday, Wednesday afternoon, last Wednesday, this Friday evening.
    # The prefix lives in this single rule on purpose: as separate entries
    # placed after the bare weekday rule, "last/this <weekday>" could never
    # match (the weekday was already gone) and left an orphaned "last".
    (r'\b(?:(?:last|this|next)\s+)?' + _WEEKDAYS
     + r'(?:\s+(?:morning|afternoon|evening|night))?\b', ''),

    # Time: 3:45 PM, 14:30
    (r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)?\b', ''),

    # Year alone: 2026 (any 20xx number)
    (r'\b20\d{2}\b', ''),

    # Relative single-word dates
    (r'\byesterday\b', ''),
    (r'\btomorrow\b', ''),
]

# Replacement strategies

# Named stand-in phrases for vague time references.
# NOTE(review): no function visible in this module reads this table yet —
# confirm whether it is consumed elsewhere before removing or renaming keys.
REPLACEMENTS = dict(
    opening_timestamp='It was late.',
    around_time='around that time',
    vague_date='that day',
    vague_period='a while',
)

def strip_dates(
    text: str,
    aggressive: bool = True,
    *,
    patterns: Optional[List[Tuple[str, str]]] = None,
) -> str:
    """
    Remove specific dates/times from generated text and tidy what is left.

    Each regex in the pattern table is applied case-insensitively, in table
    order. Line breaks in the input are preserved (runs of three or more are
    reduced to a single blank line); only horizontal whitespace is collapsed.

    Args:
        text: Generated content.
        aggressive: If True, every match is dropped together with any commas
            and horizontal whitespace directly around it. If False, every
            match is replaced by the replacement string paired with its
            pattern, and neighbouring punctuation is left for the cleanup
            pass.
        patterns: Optional list of ``(regex, replacement)`` pairs to use
            instead of the module-level ``DATE_PATTERNS`` table.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    if patterns is None:
        patterns = DATE_PATTERNS

    # Commas and whitespace *other than newlines* hugging a match. Newlines
    # are excluded so that removing a date never glues two lines together.
    gap = r'(?:,|[^\S\n])*'

    def _fill(match):
        # A pattern's own \s may span line breaks; put those back. Otherwise
        # leave one space so the neighbouring words do not run together.
        return '\n' * match.group(0).count('\n') or ' '

    cleaned = text
    for pattern, replacement in patterns:
        if aggressive:
            # Group the pattern so a top-level "|" inside it cannot detach
            # the surrounding gap expressions.
            cleaned = re.sub(gap + '(?:' + pattern + ')' + gap, _fill,
                             cleaned, flags=re.IGNORECASE)
        else:
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    # Clean up artifacts from date removal
    cleaned = re.sub(r'[^\S\n]+', ' ', cleaned)         # Collapse horizontal whitespace only
    cleaned = re.sub(r' ?\n ?', '\n', cleaned)          # No stray spaces around line breaks
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)        # At most one blank line in a row
    cleaned = re.sub(r' +([.,;:!?)])', r'\1', cleaned)  # No space before closing punctuation
    cleaned = re.sub(r'([(]) +', r'\1', cleaned)        # No space after an opening paren
    cleaned = re.sub(r'^[,\s]+', '', cleaned)           # Drop leading commas/whitespace
    return cleaned.strip()

def extract_dates(text: str) -> List[str]:
    """
    Collect every DATE_PATTERNS match found in ``text`` (for logging/debugging).

    Matching is case-insensitive. Results are grouped by pattern in table
    order, not by position in the text, and a span that satisfies several
    patterns is reported once per pattern.
    """
    return [
        hit
        for regex, _ in DATE_PATTERNS
        for hit in re.findall(regex, text, re.IGNORECASE)
    ]

def clean_generated_content(text: str, incident_date: Optional[str] = None, strip_names: bool = True) -> str:
    """
    Full cleanup pipeline for generated content.

    Args:
        text: Raw generated text.
        incident_date: Real incident date, if known. Accepted for interface
            compatibility; this function does not currently read it.
        strip_names: If True, replace the hard-coded real names
            (Aundrea, Sullivan, Harper, Maggie, Hoffmann; case-insensitive)
            with "my spouse".

    Returns:
        The text after date stripping, optional name replacement and a final
        punctuation-spacing pass.
    """
    # Step 1: Strip all dates
    cleaned = strip_dates(text, aggressive=True)

    # Step 2: Strip real names if requested
    if strip_names:
        known_name = r'(?:Aundrea|Sullivan|Harper|Maggie|Hoffmann)'
        # Treat a run of adjacent names ("Aundrea Sullivan") as one mention so
        # it becomes a single "my spouse", not "my spouse my spouse".
        cleaned = re.sub(
            r'\b' + known_name + r'(?:[^\S\n]+' + known_name + r')*\b',
            'my spouse',
            cleaned,
            flags=re.IGNORECASE,
        )

    # Step 3: Ensure no orphaned punctuation. Only horizontal whitespace is
    # removed so a line break before punctuation is never swallowed.
    cleaned = re.sub(r'[^\S\n]+([.,;:!?)])', r'\1', cleaned)
    cleaned = re.sub(r'([(])[^\S\n]+', r'\1', cleaned)

    return cleaned

# Test

# Manual smoke test: runs only when the file is executed as a script.
# The guard must use the dunder names; the bare `name` / "main" spelling
# raises NameError as soon as the module is imported.
if __name__ == "__main__":
    test = """
It was April 23, 2026, around 3:45 PM on a Tuesday afternoon.
Last Wednesday, I decided to fix the server.
The year 2026 has been rough.
"""

    print("ORIGINAL:")
    print(test)
    print()

    print("CLEANED:")
    print(strip_dates(test))
    print()

    print("EXTRACTED DATES:")
    print(extract_dates(test))