"""
Anti-Hallucination Date Filter — Post-processing for generated content.

Strips ALL specific dates, times, and days of the week from generated text.
Matches are either replaced with vague references or removed entirely.

Usage:
    from anti_hallucination import strip_dates
    clean_text = strip_dates(generated_text)
"""

import re
from typing import List, Tuple

# Patterns to match and strip, as (regex, replacement) pairs.
#
# ORDER MATTERS: strip_dates() applies the entries one after another to the
# progressively cleaned text, so a longer, more specific pattern must appear
# before any shorter pattern that matches part of the same phrase (full date
# before month+day before bare year; "last Tuesday" before bare "Tuesday").
# The functions in this file apply every pattern with re.IGNORECASE.
# Patterns contain only non-capturing groups so that each match is one string.
# Every replacement is currently the empty string.
DATE_PATTERNS = [
    # Full dates: April 23, 2026 / April 23rd, 2026
    (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4}\b', ''),

    # Month + day (no year): April 23
    (r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}(?:st|nd|rd|th)?\b', ''),

    # Relative weekday: last Tuesday, this Friday evening.
    # These must precede the bare day-of-week pattern below; otherwise the
    # weekday is stripped first and an orphaned "last"/"this" stays behind.
    (r'\blast\s+(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+(?:morning|afternoon|evening|night))?\b', ''),
    (r'\bthis\s+(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+(?:morning|afternoon|evening|night))?\b', ''),

    # Day of week: Tuesday, Wednesday afternoon
    (r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)(?:\s+(?:morning|afternoon|evening|night))?\b', ''),

    # Time: 3:45 PM, 14:30
    # The whitespace lives inside the optional AM/PM group so that a bare
    # "3:45" does not swallow the space that follows it.
    (r'\b\d{1,2}:\d{2}(?:\s*(?:AM|PM|am|pm))?\b', ''),

    # Year alone: 2026
    (r'\b20\d{2}\b', ''),

    # Relative days
    (r'\byesterday\b', ''),
    (r'\btomorrow\b', ''),
]

# Canned vague phrasings, keyed by the situation each one is meant to cover.
# NOTE(review): none of the code visible here reads this mapping — presumably
# it is consumed elsewhere or reserved for later use; confirm before removing.
REPLACEMENTS = dict(
    opening_timestamp='It was late.',
    around_time='around that time',
    vague_date='that day',
    vague_period='a while',
)


def strip_dates(text: str, aggressive: bool = True) -> str:
    """
    Remove all specific dates/times from generated text.

    Every pattern in DATE_PATTERNS is applied in list order,
    case-insensitively, to the progressively cleaned text. The whitespace and
    punctuation left behind by the removals are then tidied up. Line breaks
    are kept (runs of three or more collapse to one blank line), so the
    paragraph structure of the input survives.

    Args:
        text: Generated content. Empty or None input yields "".
        aggressive: If True, each match is removed together with any commas
            and horizontal whitespace touching it. If False, each match is
            substituted with the replacement string paired with its pattern
            in DATE_PATTERNS and the neighbouring punctuation is left alone.

    Returns:
        Clean text with no hallucinated dates
    """
    if not text:
        return ''

    # Commas and same-line whitespace hugging a match. Newlines are excluded
    # on purpose: absorbing them would glue separate lines together.
    padding = r'(?:,|[^\S\n])*'

    cleaned = text
    for pattern, replacement in DATE_PATTERNS:
        if aggressive:
            cleaned = re.sub(padding + pattern + padding, ' ', cleaned, flags=re.IGNORECASE)
        else:
            cleaned = re.sub(pattern, replacement, cleaned, flags=re.IGNORECASE)

    # Clean up artifacts from date removal
    cleaned = re.sub(r'[^\S\n]+', ' ', cleaned)  # Collapse same-line whitespace
    cleaned = re.sub(r' ?\n ?', '\n', cleaned)  # Trim spaces around line breaks
    cleaned = re.sub(r'^[, ]+', '', cleaned, flags=re.MULTILINE)  # Leading commas per line
    # Closing punctuation is pulled back onto the preceding word even across
    # a line break, so a removed date never leaves a lone "." on its own line.
    cleaned = re.sub(r'\s+([.,;:!?)])', r'\1', cleaned)
    cleaned = re.sub(r'([(])\s+', r'\1', cleaned)  # Fix opening paren
    cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)  # At most one blank line in a row

    return cleaned.strip()


def extract_dates(text: str) -> List[str]:
    """
    Extract all dates found in text (for logging/debugging).

    Patterns are tried in DATE_PATTERNS order, case-insensitively. After a
    pattern has reported its matches, those spans are blanked out before the
    next pattern runs, so one date is listed once rather than again by every
    shorter pattern that matches part of it (e.g. "April 23, 2026" is not
    also reported as "April 23" and "2026").

    Args:
        text: Text to scan. Empty or None input yields an empty list.

    Returns:
        The matched substrings, grouped by pattern in DATE_PATTERNS order
        (not sorted by position in the text).
    """
    found: List[str] = []
    if not text:
        return found

    remaining = text
    for pattern, _ in DATE_PATTERNS:
        compiled = re.compile(pattern, re.IGNORECASE)
        # group(0) always yields the whole match, even if a pattern ever
        # gains a capturing group (findall would return the group instead).
        hits = [match.group(0) for match in compiled.finditer(remaining)]
        if hits:
            found.extend(hits)
            # A space (not '') keeps the words on either side from fusing
            # into something a later pattern could mistake for a date.
            remaining = compiled.sub(' ', remaining)
    return found


def clean_generated_content(text: str, incident_date: str = None, strip_names: bool = True) -> str:
    """
    Full cleanup pipeline for generated content.

    Args:
        text: Raw generated text. Empty or None input yields "".
        incident_date: Real incident date (if known). Accepted for interface
            compatibility; the current implementation does not read it.
        strip_names: Also replace real names (Aundrea, etc.) with "my spouse".

    Returns:
        Clean text ready for publishing
    """
    if not text:
        return ''

    # Step 1: Strip all dates
    cleaned = strip_dates(text, aggressive=True)

    # Step 2: Strip real names if requested
    if strip_names:
        name = r'(?:Aundrea|Sullivan|Harper|Maggie|Hoffmann)'
        # A run of adjacent names on one line ("Aundrea Sullivan") is a single
        # mention; replacing each word separately gave "my spouse my spouse".
        mention = r'\b' + name + r'(?:[^\S\n]+' + name + r')*\b'
        cleaned = re.sub(mention, 'my spouse', cleaned, flags=re.IGNORECASE)

    # Step 3: Ensure no orphaned punctuation
    cleaned = re.sub(r'\s+([.,;:!?)])', r'\1', cleaned)
    cleaned = re.sub(r'([(])\s+', r'\1', cleaned)

    return cleaned


# Manual smoke test: run this file directly to see the filter on a sample.
if __name__ == "__main__":
    sample = """
    It was April 23, 2026, around 3:45 PM on a Tuesday afternoon.
    Last Wednesday, I decided to fix the server.
    The year 2026 has been rough.
    """

    # Each call prints a heading and its payload; the trailing empty string
    # produces the blank separator line between sections.
    print("ORIGINAL:", sample, "", sep="\n")
    print("CLEANED:", strip_dates(sample), "", sep="\n")
    print("EXTRACTED DATES:", extract_dates(sample), sep="\n")