# embeddings.py

"""Document chunking and embedding for semantic search."""

import hashlib
import logging
from typing import List, Dict
import httpx

from config import LLM_HOST

# Module-level logger, named after this module per logging convention.
# Fix: the original passed the undefined name `name` (NameError at import);
# the intended argument is the dunder `__name__`.
logger = logging.getLogger(__name__)

# Embeddings are produced by nomic-embed-text via Ollama (local, no data leaves network).
EMBEDDING_MODEL = "nomic-embed-text"
EMBEDDING_DIM = 768  # nomic-embed-text output dimension

def chunk_document(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split document into overlapping word-based chunks.

    Args:
        text: The document text to chunk.
        chunk_size: Target words per chunk; must be greater than ``overlap``.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of text chunks; empty/whitespace-only input yields ``[]``.

    Raises:
        ValueError: If ``overlap >= chunk_size``. Previously this either
            raised an obscure range() error (equal) or silently returned
            ``[]`` for non-empty text (overlap larger), hiding the misuse.
    """
    if chunk_size <= overlap:
        raise ValueError(
            f"chunk_size ({chunk_size}) must be greater than overlap ({overlap})"
        )
    if not text or not text.strip():
        return []

    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size - overlap):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the document, every later window
        # would be a strict suffix of it — redundant for search, so stop.
        if start + chunk_size >= len(words):
            break

    return chunks

def get_embedding(text: str) -> List[float]:
    """Get an embedding vector for *text* from the local Ollama server.

    Best-effort: this function never raises. Empty input and any
    request/HTTP failure both yield a zero vector of length EMBEDDING_DIM.

    Args:
        text: Text to embed.

    Returns:
        EMBEDDING_DIM-length embedding vector (zero vector on failure).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for embedding, returning zero vector")
        return [0.0] * EMBEDDING_DIM

    try:
        response = httpx.post(
            f"{LLM_HOST}/api/embeddings",
            json={
                "model": EMBEDDING_MODEL,
                "prompt": text,
            },
            timeout=30.0,
        )
        response.raise_for_status()
        data = response.json()

        embedding = data.get("embedding", [])

        if len(embedding) != EMBEDDING_DIM:
            # Lazy %-style args so formatting only happens when the record is emitted.
            logger.warning(
                "Unexpected embedding dimension: %d expected %d",
                len(embedding),
                EMBEDDING_DIM,
            )
            # Coerce to the expected size so downstream vector stores stay consistent.
            if len(embedding) < EMBEDDING_DIM:
                embedding.extend([0.0] * (EMBEDDING_DIM - len(embedding)))
            else:
                embedding = embedding[:EMBEDDING_DIM]

        return embedding

    except httpx.HTTPStatusError as e:
        logger.error(
            "HTTP error getting embedding: %s - %s",
            e.response.status_code,
            e.response.text,
        )
        return [0.0] * EMBEDDING_DIM
    except Exception as e:
        # Broad catch is deliberate: embedding is best-effort and callers
        # rely on always receiving a vector rather than an exception.
        logger.error("Error getting embedding: %s", e)
        return [0.0] * EMBEDDING_DIM

def embed_document(doc_id: str, text: str, metadata: Dict) -> List[Dict]:
    """Chunk *text* and attach an embedding plus metadata to every chunk.

    Args:
        doc_id: Unique document identifier.
        text: Full document text.
        metadata: Document metadata (source_date, doc_type, etc.), merged
            into each chunk's metadata alongside doc_id and chunk_index.

    Returns:
        List of chunk dictionaries with embeddings.
    """
    return [
        {
            "id": f"{doc_id}_chunk_{index}",
            "text": piece,
            "embedding": get_embedding(piece),
            "metadata": {
                "doc_id": doc_id,
                "chunk_index": index,
                **metadata,
            },
        }
        for index, piece in enumerate(chunk_document(text))
    ]

def generate_doc_id(text: str, source: str) -> str:
    """Generate a deterministic document ID from content and source.

    Only the first 100 characters of *text* participate in the hash, so two
    documents from the same source sharing that prefix get the same ID —
    NOTE(review): confirm this truncation is acceptable for the corpus.

    Args:
        text: Document content.
        source: Source identifier (e.g., filename, email_id).

    Returns:
        16-hex-character document ID, stable across runs.
    """
    fingerprint = ":".join((source, text[:100]))
    digest = hashlib.sha256(fingerprint.encode())
    return digest.hexdigest()[:16]