"""Document chunking and embedding for semantic search.""" import hashlib import logging from typing import List, Dict import httpx from config import LLM_HOST logger = logging.getLogger(__name__) # nomic-embed-text via Ollama (local, no data leaves network) EMBEDDING_MODEL = "nomic-embed-text" EMBEDDING_DIM = 768 # nomic-embed-text dimension def chunk_document(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]: """Split document into overlapping chunks. Args: text: The document text to chunk chunk_size: Target words per chunk overlap: Words to overlap between chunks Returns: List of text chunks """ if not text or not text.strip(): return [] words = text.split() chunks = [] for i in range(0, len(words), chunk_size - overlap): chunk = " ".join(words[i:i + chunk_size]) chunks.append(chunk) return chunks def get_embedding(text: str) -> List[float]: """Get embedding vector from local Ollama. Args: text: Text to embed Returns: 768-dim embedding vector """ if not text or not text.strip(): logger.warning("Empty text provided for embedding, returning zero vector") return [0.0] * EMBEDDING_DIM try: response = httpx.post( f"{LLM_HOST}/api/embeddings", json={ "model": EMBEDDING_MODEL, "prompt": text }, timeout=30.0 ) response.raise_for_status() data = response.json() embedding = data.get("embedding", []) if len(embedding) != EMBEDDING_DIM: logger.warning( f"Unexpected embedding dimension: {len(embedding)} expected {EMBEDDING_DIM}" ) # Pad or truncate to expected dimension if len(embedding) < EMBEDDING_DIM: embedding.extend([0.0] * (EMBEDDING_DIM - len(embedding))) else: embedding = embedding[:EMBEDDING_DIM] return embedding except httpx.HTTPStatusError as e: logger.error(f"HTTP error getting embedding: {e.response.status_code} - {e.response.text}") return [0.0] * EMBEDDING_DIM except Exception as e: logger.error(f"Error getting embedding: {e}") return [0.0] * EMBEDDING_DIM def embed_document(doc_id: str, text: str, metadata: Dict) -> List[Dict]: """Embed a document into chunks with metadata. Args: doc_id: Unique document identifier text: Full document text metadata: Document metadata (source_date, doc_type, etc.) Returns: List of chunk dictionaries with embeddings """ chunks = chunk_document(text) embedded = [] for i, chunk in enumerate(chunks): embedding = get_embedding(chunk) chunk_id = f"{doc_id}_chunk_{i}" embedded.append({ "id": chunk_id, "text": chunk, "embedding": embedding, "metadata": { "doc_id": doc_id, "chunk_index": i, **metadata } }) return embedded def generate_doc_id(text: str, source: str) -> str: """Generate a deterministic document ID from content and source. Args: text: Document content source: Source identifier (e.g., filename, email_id) Returns: Unique document ID """ content = f"{source}:{text[:100]}" return hashlib.sha256(content.encode()).hexdigest()[:16]