"""Document chunking and embedding for semantic search."""

import hashlib
import logging
from typing import List, Dict
import httpx

from config import LLM_HOST

logger = logging.getLogger(__name__)

# nomic-embed-text via Ollama (local, no data leaves network)
# Sent as the "model" field of every request to {LLM_HOST}/api/embeddings.
EMBEDDING_MODEL = "nomic-embed-text"
# Length every vector returned by get_embedding() is padded/truncated to,
# and the length of the all-zero fallback vector it returns on failure.
EMBEDDING_DIM = 768  # nomic-embed-text dimension


def chunk_document(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split document into overlapping chunks.

    The text is tokenised on whitespace with ``str.split()``; each chunk is
    the words of one window re-joined with single spaces, so the original
    whitespace (newlines, runs of spaces) is not preserved.

    Consecutive windows start ``chunk_size - overlap`` words apart. Chunking
    stops as soon as a window reaches the last word, so the result never ends
    with a chunk that is merely a suffix of the previous one.

    Args:
        text: The document text to chunk
        chunk_size: Target words per chunk (must be positive)
        overlap: Words to overlap between chunks (``0 <= overlap < chunk_size``)

    Returns:
        List of text chunks; empty if ``text`` is empty or whitespace-only

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would give a zero step (range() error) and a
        # larger overlap a negative step (silently no chunks at all).
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    if not text or not text.strip():
        return []

    words = text.split()
    total = len(words)
    step = chunk_size - overlap
    chunks = []

    for start in range(0, total, step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= total:
            # This window already covers the tail; any further window would
            # only repeat words that are in this chunk.
            break

    return chunks


def get_embedding(text: str) -> List[float]:
    """Fetch the embedding vector for ``text`` from the configured LLM host.

    Posts the text to ``{LLM_HOST}/api/embeddings`` using ``EMBEDDING_MODEL``
    and reads the ``"embedding"`` field of the JSON reply.

    This function never raises: empty/whitespace-only input, HTTP error
    statuses, and any other failure are logged and answered with an all-zero
    vector. A reply whose length differs from ``EMBEDDING_DIM`` is logged,
    then zero-padded or truncated to that length.

    Args:
        text: Text to embed

    Returns:
        Embedding vector of length ``EMBEDDING_DIM`` (768)
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for embedding, returning zero vector")
        return [0.0] * EMBEDDING_DIM

    try:
        request_body = {"model": EMBEDDING_MODEL, "prompt": text}
        reply = httpx.post(
            f"{LLM_HOST}/api/embeddings",
            json=request_body,
            timeout=30.0,
        )
        reply.raise_for_status()
        embedding = reply.json().get("embedding", [])

        # Happy path: the server returned a vector of the expected size.
        if len(embedding) == EMBEDDING_DIM:
            return embedding

        logger.warning(
            f"Unexpected embedding dimension: {len(embedding)} expected {EMBEDDING_DIM}"
        )
        # Pad or truncate to expected dimension
        missing = EMBEDDING_DIM - len(embedding)
        if missing > 0:
            embedding.extend([0.0] * missing)
            return embedding
        return embedding[:EMBEDDING_DIM]

    except httpx.HTTPStatusError as e:
        # Non-2xx reply: include status and body to aid diagnosis.
        logger.error(f"HTTP error getting embedding: {e.response.status_code} - {e.response.text}")
        return [0.0] * EMBEDDING_DIM
    except Exception as e:
        # Connection problems, timeouts, malformed JSON, unexpected payloads.
        logger.error(f"Error getting embedding: {e}")
        return [0.0] * EMBEDDING_DIM


def embed_document(doc_id: str, text: str, metadata: Dict) -> List[Dict]:
    """Chunk a document and embed every chunk.

    The text is split with ``chunk_document`` (default settings) and each
    chunk is embedded with ``get_embedding``, in order.

    Args:
        doc_id: Unique document identifier
        text: Full document text
        metadata: Document metadata (source_date, doc_type, etc.). It is
            merged into each chunk's metadata after ``doc_id`` and
            ``chunk_index``, so identically named keys in ``metadata``
            override those two values.

    Returns:
        One dict per chunk with keys ``"id"`` (``"<doc_id>_chunk_<n>"``),
        ``"text"``, ``"embedding"`` and ``"metadata"``; empty list when the
        text yields no chunks
    """
    return [
        {
            "id": f"{doc_id}_chunk_{position}",
            "text": piece,
            "embedding": get_embedding(piece),
            "metadata": {
                "doc_id": doc_id,
                "chunk_index": position,
                **metadata,
            },
        }
        for position, piece in enumerate(chunk_document(text))
    ]


def generate_doc_id(text: str, source: str) -> str:
    """Derive a deterministic document ID from the source and content.

    The ID is the first 16 hex characters of the SHA-256 digest of
    ``"<source>:<first 100 characters of text>"``. Only that 100-character
    prefix of the content takes part, so two documents from the same source
    that share it receive the same ID.

    Args:
        text: Document content
        source: Source identifier (e.g., filename, email_id)

    Returns:
        16-character lowercase hexadecimal document ID
    """
    hasher = hashlib.sha256()
    hasher.update(f"{source}:{text[:100]}".encode())
    full_digest = hasher.hexdigest()
    return full_digest[:16]