Learnixo

RAG Systems · Lesson 6 of 24

Choosing an Embedding Model: OpenAI, Cohere, BGE

What Embeddings Do in RAG

Embeddings convert text (documents, queries) into dense vectors in a high-dimensional space, where semantically similar texts have high cosine similarity. In RAG:

  1. Indexing: Documents are chunked and each chunk is embedded → stored in a vector database
  2. Querying: The user's question is embedded → similarity search retrieves relevant chunks
  3. Generation: Retrieved chunks are passed to the LLM as context

The embedding model determines retrieval quality. A better embedding model retrieves more relevant documents, which directly improves the quality of LLM answers.


OpenAI Embedding Models

Python
from openai import OpenAI
import numpy as np

# Shared OpenAI client used by the embedding helpers below
# (embed_text, embed_batch, embed_with_reduced_dimensions).
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm against the SDK docs.
client = OpenAI()

def embed_text(
    text: str,
    model: str = "text-embedding-3-small",
) -> list[float]:
    """Return the embedding vector for one piece of text.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    using ``model`` and returns the vector of the first (only) result.
    """
    result = client.embeddings.create(input=text, model=model)
    first_item = result.data[0]
    return first_item.embedding


def embed_batch(
    texts: list[str],
    model: str = "text-embedding-3-small",
    batch_size: int = 100,
) -> list[list[float]]:
    """Embed a list of texts, sending them to the API in batches.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without
            calling the API.
        model: Name of the embedding model to use.
        batch_size: Maximum number of texts per request. Values above the
            API limit of 2048 inputs per call are clamped to 2048.

    Returns:
        One embedding per input text, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step makes range() empty, which would silently drop
        # every input and return []; a zero step fails with an unrelated
        # range() error. Reject both up front with a clear message.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    max_inputs_per_call = 2048  # API limit on inputs per embeddings request
    step = min(batch_size, max_inputs_per_call)

    all_embeddings: list[list[float]] = []
    for start in range(0, len(texts), step):
        response = client.embeddings.create(
            model=model,
            input=texts[start:start + step],
        )
        # Order by the reported index so results line up with the inputs
        # even if the API returns them in a different order.
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)

    return all_embeddings


def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Compute cosine similarity between two embedding vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between ``a`` and ``b`` as a Python float.
        If either vector has zero norm (including empty vectors) the
        similarity is undefined; ``0.0`` is returned instead of NaN so the
        value can still be sorted and compared.
    """
    a_arr = np.asarray(a)
    b_arr = np.asarray(b)
    denominator = float(np.linalg.norm(a_arr) * np.linalg.norm(b_arr))
    if denominator == 0.0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return float(np.dot(a_arr, b_arr) / denominator)


# Demonstration: embed one query and two passages, then compare how close
# each passage is to the query. The on-topic passage should score higher.
query_emb = embed_text(
    "What is the interaction between warfarin and aspirin?"
)
doc1_emb = embed_text(
    "Warfarin and aspirin have a major pharmacodynamic interaction, "
    "significantly increasing bleeding risk."
)
doc2_emb = embed_text(
    "The patient should take metformin with meals to reduce GI side effects."
)

# Expected: high similarity for doc1 (same topic as the query).
print(
    f"Query-doc1 similarity: {cosine_similarity(query_emb, doc1_emb):.3f}"
)
# Expected: low similarity for doc2 (unrelated topic).
print(
    f"Query-doc2 similarity: {cosine_similarity(query_emb, doc2_emb):.3f}"
)

OpenAI Model Comparison

Python
# Reference table of OpenAI embedding models, keyed by model name.
# Each row below is (name, dimensions, price_per_million, mteb_score, notes);
# "mteb_score" is the Massive Text Embedding Benchmark score, and every model
# listed here shares the same max_tokens value.
OPENAI_EMBEDDING_MODELS = {
    name: {
        "dimensions": dimensions,
        "max_tokens": 8191,
        "price_per_million": price,
        "mteb_score": mteb,
        "notes": notes,
    }
    for name, dimensions, price, mteb, notes in (
        (
            "text-embedding-ada-002", 1536, 0.10, 61.0,
            "Legacy — text-embedding-3-small is better at lower cost",
        ),
        (
            # 1536 is the full size; can reduce to 512 with Matryoshka
            "text-embedding-3-small", 1536, 0.02, 62.3,
            "Best cost-performance ratio for most use cases",
        ),
        (
            # 3072 is the full size; can reduce to any size
            "text-embedding-3-large", 3072, 0.13, 64.6,
            "Highest quality, 6× cost of text-embedding-3-small",
        ),
    )
}

# Matryoshka Representation Learning: dimension reduction with minimal quality loss
def embed_with_reduced_dimensions(
    text: str,
    dimensions: int = 256,
    model: str = "text-embedding-3-small",
) -> list[float]:
    """
    Embed ``text`` and ask the API for a shortened vector.

    The request carries ``dimensions``, so the returned embedding is
    truncated to its first N dimensions. This works with the
    text-embedding-3 models (trained with Matryoshka loss) and is useful
    for reducing storage and compute costs.

    Dimensions: 256, 512, or 1536 for text-embedding-3-small.
    Quality drops only slightly: 256d is about 95% of 1536d quality on MTEB.
    """
    request = {"model": model, "input": text, "dimensions": dimensions}
    reply = client.embeddings.create(**request)
    return reply.data[0].embedding

Open-Source Embedding Models

Python
from sentence_transformers import SentenceTransformer
import torch

def load_sentence_transformer(model_name: str = "BAAI/bge-large-en-v1.5") -> SentenceTransformer:
    """Instantiate an open-source sentence transformer, on CUDA when available, else CPU."""
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    return SentenceTransformer(model_name, device=target_device)


def embed_with_sentence_transformer(
    texts: list[str],
    model: SentenceTransformer,
    normalize: bool = True,
) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return the resulting array.

    ``normalize`` is forwarded as ``normalize_embeddings`` (L2 normalization,
    for cosine similarity). Texts are encoded in batches of 32, and a
    progress bar is shown only when more than 100 texts are passed.
    The result is an (n_texts, n_dimensions) numpy array.
    """
    wants_progress_bar = len(texts) > 100
    return model.encode(
        texts,
        batch_size=32,
        show_progress_bar=wants_progress_bar,
        normalize_embeddings=normalize,
    )


# Top open-source embedding models (MTEB benchmarks)
# Open-source embedding models keyed by model name.
# Each row below is (name, dimensions, mteb_score, notes, extra); "extra"
# holds any additional per-model fields and is merged in after "notes".
OPEN_SOURCE_MODELS = {
    name: {
        "dimensions": dimensions,
        "mteb_score": mteb,
        "notes": notes,
        **extra,
    }
    for name, dimensions, mteb, notes, extra in (
        (
            "BAAI/bge-large-en-v1.5", 1024, 64.2,
            "Strong baseline, well-maintained by BAAI",
            {},
        ),
        (
            "BAAI/bge-m3", 1024, 66.0,
            "Multilingual, dense+sparse+ColBERT in one model",
            {"max_tokens": 8192},
        ),
        (
            "intfloat/e5-mistral-7b-instruct", 4096, 66.6,
            "LLM-based embedding — very high quality, expensive to run",
            {},
        ),
        (
            "mixedbread-ai/mxbai-embed-large-v1", 1024, 64.7,
            "Competitive with text-embedding-3-small",
            {},
        ),
        (
            "nomic-ai/nomic-embed-text-v1.5", 768, 62.3,
            "Open source and fully reproducible (unique among top models)",
            {},
        ),
    )
}

Domain-Specific Embedding Fine-Tuning

Generic embedding models may not capture domain-specific terminology well. Fine-tuning on in-domain data can significantly improve retrieval in clinical, legal, and technical domains:

Python
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

def prepare_clinical_training_pairs(
    questions: list[str],
    relevant_passages: list[str],
    hard_negatives: list[str] = None,
) -> list[InputExample]:
    """
    Prepare training pairs for embedding fine-tuning.

    The examples are shaped for MultipleNegativesRankingLoss: positive
    pairs (plus in-batch negatives), optionally extended with one explicit
    hard negative per pair.

    Args:
        questions: Training questions.
        relevant_passages: The relevant passage for each question, aligned
            by position with ``questions``.
        hard_negatives: Optional pool of hard-negative passages. When given
            (and non-empty), each example gets one randomly chosen negative
            that differs from its positive passage. ``None`` or an empty
            list produces plain (question, passage) pairs.

    Returns:
        One ``InputExample`` per (question, passage) pair, with texts
        ``[question, passage]`` or ``[question, passage, negative]``.

    Raises:
        ValueError: If ``questions`` and ``relevant_passages`` differ in
            length, or if the hard-negative pool contains no passage
            different from a pair's positive passage.
    """
    # Imported once per call rather than once per loop iteration.
    import random

    if len(questions) != len(relevant_passages):
        # zip() would silently drop the unmatched tail.
        raise ValueError(
            f"questions and relevant_passages must have the same length "
            f"({len(questions)} != {len(relevant_passages)})"
        )

    examples = []

    for question, passage in zip(questions, relevant_passages):
        if not hard_negatives:
            examples.append(InputExample(texts=[question, passage]))
            continue

        # Never use the positive passage as its own negative.
        candidates = [n for n in hard_negatives if n != passage]
        if not candidates:
            # random.choice would otherwise fail with an opaque IndexError.
            raise ValueError(
                f"No hard negative differs from the relevant passage for question: {question!r}"
            )
        neg = random.choice(candidates)
        examples.append(InputExample(texts=[question, passage, neg]))

    return examples


def fine_tune_embedding_model(
    base_model_name: str,
    training_examples: list[InputExample],
    output_path: str,
    epochs: int = 3,
    batch_size: int = 16,
    warmup_steps: int = 100,
) -> SentenceTransformer:
    """Fine-tune an embedding model on domain-specific retrieval pairs.

    Args:
        base_model_name: Name or path of the model to start from.
        training_examples: Examples to train on (e.g. question/passage
            pairs, optionally with a hard negative).
        output_path: Passed to ``model.fit`` as ``output_path``.
        epochs: Number of training epochs.
        batch_size: Examples per training batch. With
            MultipleNegativesRankingLoss the other examples in a batch act
            as negatives, so this also sets the number of in-batch
            negatives.
        warmup_steps: Passed to ``model.fit`` as ``warmup_steps``. Consider
            lowering it for very small training sets.

    Returns:
        The fine-tuned model.

    Raises:
        ValueError: If ``training_examples`` is empty.
    """
    if not training_examples:
        raise ValueError("training_examples must contain at least one example")

    model = SentenceTransformer(base_model_name)

    train_dataloader = DataLoader(training_examples, shuffle=True, batch_size=batch_size)

    # MultipleNegativesRankingLoss uses in-batch negatives — efficient and effective
    train_loss = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
        show_progress_bar=True,
    )

    return model


# Example data: drug-interaction Q&A pairs for fine-tuning.
# Each entry is a (question, relevant passage) tuple.
training_pairs = [
    (
        "What is the interaction between warfarin and clarithromycin?",
        "Warfarin and clarithromycin: major interaction. "
        "Clarithromycin inhibits CYP2C9 and CYP3A4, significantly increasing "
        "warfarin concentrations. Expected 2-3× INR increase. "
        "Monitor INR within 3 days.",
    ),
    (
        "How do I dose metformin in renal impairment?",
        "Metformin renal dosing: eGFR 45-60: reduce dose and monitor; "
        "eGFR 30-44: use with caution, maximum 1000mg/day; "
        "eGFR under 30: contraindicated due to lactic acidosis risk.",
    ),
]

Evaluating Embedding Models

Use BEIR (Benchmarking Information Retrieval) or a custom, domain-specific evaluation set:

Python
def evaluate_retrieval_quality(
    embedding_model,
    queries: list[str],
    corpus: list[str],
    relevant_doc_ids: list[list[int]],  # For each query, list of relevant corpus indices
    k_values: list[int] = [1, 5, 10],
) -> dict:
    """Evaluate embedding model retrieval quality on a test set.

    Args:
        embedding_model: Object with an
            ``encode(texts, normalize_embeddings=True)`` method returning
            one vector per text.
        queries: Query strings.
        corpus: Documents to retrieve from.
        relevant_doc_ids: For each query, the corpus indices that count as
            relevant. Must have the same length as ``queries``.
        k_values: Cut-offs for recall@k. The default list is only read,
            never mutated.

    Returns:
        A dict with the mean ``recall@k`` for every k in ``k_values`` and
        the mean reciprocal rank under ``"mrr"``. Queries with no relevant
        documents are skipped, because recall is undefined for them. If no
        query can be scored, every metric is ``0.0``.

    Raises:
        ValueError: If ``queries`` and ``relevant_doc_ids`` differ in length.
    """
    if len(queries) != len(relevant_doc_ids):
        # zip() would silently ignore the unmatched tail and skew the averages.
        raise ValueError(
            f"queries and relevant_doc_ids must have the same length "
            f"({len(queries)} != {len(relevant_doc_ids)})"
        )

    # Embed all corpus documents once
    corpus_embeddings = embedding_model.encode(corpus, normalize_embeddings=True)

    results = {f"recall@{k}": [] for k in k_values}
    results["mrr"] = []

    for query, relevant_ids in zip(queries, relevant_doc_ids):
        relevant_set = set(relevant_ids)
        if not relevant_set:
            # Nothing to find: recall would be a division by zero.
            continue

        query_emb = embedding_model.encode([query], normalize_embeddings=True)[0]

        # Dot product of normalized vectors is the cosine similarity.
        scores = corpus_embeddings @ query_emb
        # Descending order; a stable sort breaks ties by corpus index so
        # results are reproducible.
        ranked_ids = np.argsort(-scores, kind="stable").tolist()

        for k in k_values:
            hits = relevant_set.intersection(ranked_ids[:k])
            results[f"recall@{k}"].append(len(hits) / len(relevant_set))

        # MRR: reciprocal rank of the first relevant document (0.0 if none found)
        reciprocal_rank = next(
            (
                1.0 / rank
                for rank, doc_id in enumerate(ranked_ids, 1)
                if doc_id in relevant_set
            ),
            0.0,
        )
        results["mrr"].append(reciprocal_rank)

    return {
        metric: (sum(vals) / len(vals) if vals else 0.0)
        for metric, vals in results.items()
    }


# Compare models
def benchmark_models(
    model_names: list[str],
    queries: list[str],
    corpus: list[str],
    relevant_doc_ids: list[list[int]],
) -> list[dict]:
    """Score each named model on the same test set and rank by recall@5.

    Every model is loaded with ``SentenceTransformer``, evaluated with
    ``evaluate_retrieval_quality``, and its metrics are printed. Returns one
    dict per model (``"model"`` plus its metrics), best recall@5 first.
    """
    leaderboard = []
    for name in model_names:
        metrics = evaluate_retrieval_quality(
            SentenceTransformer(name), queries, corpus, relevant_doc_ids
        )
        print(f"{name}: {metrics}")
        leaderboard.append({"model": name, **metrics})

    leaderboard.sort(key=lambda row: row["recall@5"], reverse=True)
    return leaderboard

Choosing the Right Embedding Model

Python
def recommend_embedding_model(requirements: dict) -> str:
    """Pick an embedding model from a dict of requirement flags.

    Flags are checked in priority order and the first truthy one wins:
    ``privacy_required`` (with ``multilingual`` as a sub-choice), then
    ``cost_sensitive``, ``highest_quality``, and ``clinical_domain``. With
    no flag set, the default model is returned.
    """
    wants = requirements.get

    # Privacy means the model must run locally, so only open-source options apply.
    if wants("privacy_required"):
        return "BAAI/bge-m3" if wants("multilingual") else "BAAI/bge-large-en-v1.5"

    prioritized_choices = (
        # Lowest price among the OpenAI models.
        ("cost_sensitive", "text-embedding-3-small"),
        # e5-mistral-7b-instruct is an alternative here.
        ("highest_quality", "text-embedding-3-large"),
        # Consider fine-tuning BAAI/bge-large-en-v1.5 on clinical Q&A pairs.
        ("clinical_domain", "BAAI/bge-large-en-v1.5 (fine-tuned)"),
    )
    for flag, choice in prioritized_choices:
        if wants(flag):
            return choice

    # Default: good balance of cost and quality
    return "text-embedding-3-small"