"""Document chunking and embedding for semantic search."""
import hashlib
import logging
from typing import List, Dict
import httpx
from config import LLM_HOST
logger = logging.getLogger(__name__)

# nomic-embed-text via Ollama (local, no data leaves the network)
EMBEDDING_MODEL = "nomic-embed-text"
EMBEDDING_DIM = 768  # nomic-embed-text dimension


def chunk_document(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split document into overlapping chunks.

    Args:
        text: The document text to chunk
        chunk_size: Target words per chunk
        overlap: Words to overlap between chunks

    Returns:
        List of text chunks
    """
    if not text or not text.strip():
        return []
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[i:i + chunk_size]))
        # Stop once a chunk reaches the end of the document, otherwise the
        # next iteration would emit a chunk fully contained in this one.
        if i + chunk_size >= len(words):
            break
    return chunks
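

# With the defaults, the loop strides by chunk_size - overlap = 462 words, so
# consecutive chunks share a 50-word boundary. Illustrative call (hypothetical
# input, not from this module):
#   chunk_document(" ".join(str(n) for n in range(1000)))
#   -> chunk 0 covers words 0-511, chunk 1 covers words 462-973, chunk 2 the rest.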


def get_embedding(text: str) -> List[float]:
    """Get embedding vector from local Ollama.

    Args:
        text: Text to embed

    Returns:
        768-dim embedding vector
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for embedding, returning zero vector")
        return [0.0] * EMBEDDING_DIM
    try:
        response = httpx.post(
            f"{LLM_HOST}/api/embeddings",
            json={"model": EMBEDDING_MODEL, "prompt": text},
            timeout=30.0,
        )
        response.raise_for_status()
        data = response.json()
        embedding = data.get("embedding", [])
        if len(embedding) != EMBEDDING_DIM:
            logger.warning(
                f"Unexpected embedding dimension: {len(embedding)}, expected {EMBEDDING_DIM}"
            )
            # Pad or truncate to the expected dimension
            if len(embedding) < EMBEDDING_DIM:
                embedding.extend([0.0] * (EMBEDDING_DIM - len(embedding)))
            else:
                embedding = embedding[:EMBEDDING_DIM]
        return embedding
    except httpx.HTTPStatusError as e:
        logger.error(f"HTTP error getting embedding: {e.response.status_code} - {e.response.text}")
        return [0.0] * EMBEDDING_DIM
    except Exception as e:
        logger.error(f"Error getting embedding: {e}")
        return [0.0] * EMBEDDING_DIM
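

# The legacy Ollama embeddings endpoint used above accepts {"model", "prompt"}
# and returns {"embedding": [...]}. A quick manual check (assuming Ollama's
# default address; LLM_HOST may differ in this deployment):
#   curl http://localhost:11434/api/embeddings \
#     -d '{"model": "nomic-embed-text", "prompt": "hello world"}'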


def embed_document(doc_id: str, text: str, metadata: Dict) -> List[Dict]:
    """Embed a document into chunks with metadata.

    Args:
        doc_id: Unique document identifier
        text: Full document text
        metadata: Document metadata (source_date, doc_type, etc.)

    Returns:
        List of chunk dictionaries with embeddings
    """
    chunks = chunk_document(text)
    embedded = []
    for i, chunk in enumerate(chunks):
        embedding = get_embedding(chunk)
        chunk_id = f"{doc_id}_chunk_{i}"
        embedded.append({
            "id": chunk_id,
            "text": chunk,
            "embedding": embedding,
            "metadata": {
                "doc_id": doc_id,
                "chunk_index": i,
                **metadata
            }
        })
    return embedded
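

# Each record is ready to upsert into a vector store. Illustrative shape only
# (the field values here are made up):
#   {"id": "<doc_id>_chunk_0", "text": "...", "embedding": [0.03, ...],
#    "metadata": {"doc_id": "<doc_id>", "chunk_index": 0, "doc_type": "email"}}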


def generate_doc_id(text: str, source: str) -> str:
    """Generate a deterministic document ID from content and source.

    Args:
        text: Document content
        source: Source identifier (e.g., filename, email_id)

    Returns:
        Unique document ID
    """
    # Note: only the first 100 characters of text are hashed, so two documents
    # from the same source that share that prefix will collide.
    content = f"{source}:{text[:100]}"
    return hashlib.sha256(content.encode()).hexdigest()[:16]
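

if __name__ == "__main__":
    # Minimal smoke test: assumes an Ollama instance serving nomic-embed-text is
    # reachable at LLM_HOST; the sample text and metadata below are illustrative.
    logging.basicConfig(level=logging.INFO)
    sample = "Semantic search retrieves documents by meaning, not keywords. " * 40
    doc_id = generate_doc_id(sample, source="demo.txt")
    records = embed_document(doc_id, sample, metadata={"doc_type": "demo"})
    print(f"embedded {len(records)} chunk(s); "
          f"first vector has {len(records[0]['embedding'])} dims")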