Embedding Models for RAG
How to choose and use embedding models for retrieval-augmented generation. OpenAI ada-002 vs text-embedding-3, open-source alternatives, fine-tuning for domain-specific retrieval.
What Embeddings Do in RAG
Embeddings convert text (documents, queries) into dense vectors in a high-dimensional space, where semantically similar texts have high cosine similarity. In RAG:
- Indexing: Documents are chunked and each chunk is embedded → stored in a vector database
- Querying: The user's question is embedded → similarity search retrieves relevant chunks
- Generation: Retrieved chunks are passed to the LLM as context
The embedding model determines retrieval quality. A better embedding model retrieves more relevant documents, which directly improves the quality of LLM answers.
OpenAI Embedding Models
from openai import OpenAI
import numpy as np
client = OpenAI()
def embed_text(
    text: str,
    model: str = "text-embedding-3-small",
) -> list[float]:
    """Return the embedding vector for one text string.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    using ``model`` and returns the ``embedding`` of the first item in the
    response.
    """
    result = client.embeddings.create(input=text, model=model)
    first_item = result.data[0]
    return first_item.embedding
def embed_batch(
    texts: list[str],
    model: str = "text-embedding-3-small",
    batch_size: int = 100,
) -> list[list[float]]:
    """Embed a list of texts in batches (API limit: 2048 per call).

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without
            making any request.
        model: Name of the embedding model to request.
        batch_size: Number of texts sent per request; must be at least 1.

    Returns:
        One embedding per input text, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step makes range() empty, which would silently return []
    # for non-empty input; fail loudly instead (a zero step already raised
    # ValueError from range()).
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embeddings: list[list[float]] = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            model=model,
            input=texts[start:start + batch_size],
        )
        # Preserve order (API may not return in input order)
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return all_embeddings
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Compute cosine similarity between two embedding vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as ``a``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        Returns 0.0 when either vector has zero magnitude (including two
        empty vectors), where the cosine is undefined, instead of ``nan``.

    Raises:
        ValueError: Raised by NumPy when the two vectors differ in length.
    """
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)
    # Compute the dot product first so mismatched lengths still raise even
    # when one of the vectors is all zeros.
    dot = np.dot(a_arr, b_arr)
    denom = np.linalg.norm(a_arr) * np.linalg.norm(b_arr)
    if denom == 0.0:
        return 0.0
    return float(dot / denom)
# Demonstration
# Embed one query and two candidate passages (each embed_text call goes
# through the module-level client), then score the query against each
# passage with cosine_similarity.
query_emb = embed_text("What is the interaction between warfarin and aspirin?")
doc1_emb = embed_text("Warfarin and aspirin have a major pharmacodynamic interaction, significantly increasing bleeding risk.")
doc2_emb = embed_text("The patient should take metformin with meals to reduce GI side effects.")
# doc1 discusses the same drug pair as the query; doc2 is about a different drug.
print(f"Query-doc1 similarity: {cosine_similarity(query_emb, doc1_emb):.3f}") # High
print(f"Query-doc2 similarity: {cosine_similarity(query_emb, doc2_emb):.3f}") # Low

# OpenAI Model Comparison
# Reference table of the OpenAI embedding models discussed in this article,
# keyed by API model name. Per-model fields:
#   dimensions        - default length of the returned vector
#   max_tokens        - presumably the maximum input length in tokens (confirm)
#   price_per_million - presumably USD per million input tokens (confirm
#                       against current pricing)
#   mteb_score        - score on the Massive Text Embedding Benchmark
#   notes             - short free-text guidance
OPENAI_EMBEDDING_MODELS = {
    "text-embedding-ada-002": {
        "dimensions": 1536,
        "max_tokens": 8191,
        "price_per_million": 0.10,
        "mteb_score": 61.0, # Massive Text Embedding Benchmark
        "notes": "Legacy — text-embedding-3-small is better at lower cost",
    },
    "text-embedding-3-small": {
        "dimensions": 1536, # Can reduce to 512 with Matryoshka
        "max_tokens": 8191,
        "price_per_million": 0.02,
        "mteb_score": 62.3,
        "notes": "Best cost-performance ratio for most use cases",
    },
    "text-embedding-3-large": {
        "dimensions": 3072, # Can reduce to any size
        "max_tokens": 8191,
        "price_per_million": 0.13,
        "mteb_score": 64.6,
        # With the prices in this table the ratio is 0.13 / 0.02 = 6.5×.
        "notes": "Highest quality, 6× cost of text-embedding-3-small",
    },
}
# Matryoshka Representation Learning: dimension reduction with minimal quality loss
def embed_with_reduced_dimensions(
    text: str,
    dimensions: int = 256,
    model: str = "text-embedding-3-small",
) -> list[float]:
    """
    Embed ``text`` and ask the API for a shortened vector.

    The ``dimensions`` value is forwarded to the embeddings endpoint of the
    module-level ``client``; the ``embedding`` of the first response item is
    returned. Per this article, shortened output is a feature of the
    text-embedding-3 models and is meant to cut storage and compute costs.
    """
    request = {"model": model, "input": text, "dimensions": dimensions}
    result = client.embeddings.create(**request)
    return result.data[0].embedding
# Dimensions: 256, 512, or 1536 for text-embedding-3-small
# Quality drops slightly — 256d is about 95% of 1536d quality on MTEB
Open-Source Embedding Models
from sentence_transformers import SentenceTransformer
import torch
def load_sentence_transformer(model_name: str = "BAAI/bge-large-en-v1.5") -> SentenceTransformer:
    """Instantiate the named sentence-transformer model.

    The model is placed on ``"cuda"`` when ``torch.cuda.is_available()``
    reports a usable GPU, and on ``"cpu"`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    return SentenceTransformer(model_name, device=target_device)
def embed_with_sentence_transformer(
    texts: list[str],
    model: SentenceTransformer,
    normalize: bool = True,
) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return whatever ``encode`` yields.

    ``normalize`` is forwarded as ``normalize_embeddings`` (L2 normalization,
    so a dot product can serve as cosine similarity). Encoding uses a batch
    size of 32, and the progress bar is only requested for more than 100
    texts.
    """
    wants_progress = len(texts) > 100
    encode_options = {
        "normalize_embeddings": normalize,
        "batch_size": 32,
        "show_progress_bar": wants_progress,
    }
    # Expected result shape: (n_texts, n_dimensions) numpy array
    return model.encode(texts, **encode_options)
# Top open-source embedding models (MTEB benchmarks)
# Reference table of open-source embedding models, keyed by model name.
# Every entry has "dimensions" (embedding length), "mteb_score", and "notes";
# only some entries also carry "max_tokens".
OPEN_SOURCE_MODELS = {
    "BAAI/bge-large-en-v1.5": {
        "dimensions": 1024,
        "mteb_score": 64.2,
        "notes": "Strong baseline, well-maintained by BAAI",
    },
    "BAAI/bge-m3": {
        "dimensions": 1024,
        "mteb_score": 66.0,
        "notes": "Multilingual, dense+sparse+ColBERT in one model",
        "max_tokens": 8192,
    },
    "intfloat/e5-mistral-7b-instruct": {
        "dimensions": 4096,
        "mteb_score": 66.6,
        "notes": "LLM-based embedding — very high quality, expensive to run",
    },
    "mixedbread-ai/mxbai-embed-large-v1": {
        "dimensions": 1024,
        "mteb_score": 64.7,
        "notes": "Competitive with text-embedding-3-small",
    },
    "nomic-ai/nomic-embed-text-v1.5": {
        "dimensions": 768,
        "mteb_score": 62.3,
        "notes": "Open source and fully reproducible (unique among top models)",
    },
}

# Domain-Specific Embedding Fine-Tuning
Generic embedding models may not capture domain-specific terminology well. Fine-tuning on in-domain data can significantly improve retrieval in clinical, legal, and other technical domains:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
def prepare_clinical_training_pairs(
    questions: list[str],
    relevant_passages: list[str],
    hard_negatives: list[str] = None,
) -> list[InputExample]:
    """
    Prepare training examples for embedding fine-tuning.

    Questions and passages are paired positionally (extra items in the longer
    list are ignored by ``zip``). The examples are intended for
    MultipleNegativesRankingLoss: positive pairs + in-batch negatives.

    Args:
        questions: Query texts.
        relevant_passages: The passage that answers each question.
        hard_negatives: Optional pool of negative passages. When given and
            non-empty, each example gets one randomly chosen negative that
            differs from its positive passage. ``None`` or an empty list
            produces plain (question, passage) pairs.

    Returns:
        One ``InputExample`` per (question, passage) pair.

    Raises:
        IndexError: If a pool is given but every entry equals the positive
            passage, so no negative can be chosen.
    """
    # Imported once per call instead of once per loop iteration.
    import random

    examples = []
    for question, passage in zip(questions, relevant_passages):
        if not hard_negatives:
            examples.append(InputExample(texts=[question, passage]))
            continue

        # Never use the positive passage as its own negative.
        candidates = [neg for neg in hard_negatives if neg != passage]
        if not candidates:
            # Same exception type random.choice raised on an empty list,
            # but with a message that names the actual problem.
            raise IndexError(
                f"no hard negative differs from the positive passage for question {question!r}"
            )
        examples.append(
            InputExample(texts=[question, passage, random.choice(candidates)])
        )
    return examples
def fine_tune_embedding_model(
    base_model_name: str,
    training_examples: list[InputExample],
    output_path: str,
    epochs: int = 3,
    *,
    batch_size: int = 16,
    warmup_steps: int = 100,
) -> SentenceTransformer:
    """Fine-tune an embedding model on domain-specific retrieval pairs.

    Args:
        base_model_name: Name or path of the model to start from.
        training_examples: Examples to train on.
        output_path: Forwarded to ``model.fit`` as ``output_path``.
        epochs: Number of training epochs.
        batch_size: Examples per training batch. Because the loss uses
            in-batch negatives, this also bounds how many negatives each
            example is contrasted with.
        warmup_steps: Forwarded to ``model.fit``; lower it for small
            datasets where the default of 100 may exceed the total number
            of training steps.

    Returns:
        The ``SentenceTransformer`` instance after ``fit`` has run.
    """
    model = SentenceTransformer(base_model_name)
    train_dataloader = DataLoader(
        training_examples, shuffle=True, batch_size=batch_size
    )
    # MultipleNegativesRankingLoss uses in-batch negatives — efficient and effective
    train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
        show_progress_bar=True,
    )
    return model
# Example: fine-tune on drug interaction Q&A pairs
# Each entry is a (question, relevant passage) tuple.
training_pairs = [
    (
        "What is the interaction between warfarin and clarithromycin?",
        "Warfarin and clarithromycin: major interaction. Clarithromycin inhibits CYP2C9 and CYP3A4, significantly increasing warfarin concentrations. Expected 2-3× INR increase. Monitor INR within 3 days.",
    ),
    (
        "How do I dose metformin in renal impairment?",
        "Metformin renal dosing: eGFR 45-60: reduce dose and monitor; eGFR 30-44: use with caution, maximum 1000mg/day; eGFR under 30: contraindicated due to lactic acidosis risk.",
    ),
]

# Evaluating Embedding Models
Evaluate with BEIR (Benchmarking Information Retrieval) or with a custom, domain-specific test set:
def evaluate_retrieval_quality(
    embedding_model,
    queries: list[str],
    corpus: list[str],
    relevant_doc_ids: list[list[int]],  # For each query, list of relevant corpus indices
    k_values: list[int] = [1, 5, 10],
) -> dict:
    """Evaluate embedding model retrieval quality on a test set.

    Args:
        embedding_model: Object with an
            ``encode(texts, normalize_embeddings=True)`` method whose result
            supports ``@`` (assumed to be a 2-D NumPy array, one row per
            text).
        queries: Query strings.
        corpus: Documents to rank for every query.
        relevant_doc_ids: For each query, the corpus indices that count as
            relevant. Paired positionally with ``queries``.
        k_values: Cut-offs for recall@k. The default list is only read,
            never mutated.

    Returns:
        A dict with the mean ``recall@k`` for each k and the mean ``mrr``.
        Queries with no relevant documents are skipped, since recall is
        undefined for them. A metric with no evaluable queries is reported
        as 0.0.
    """
    # Embed all corpus documents once; they are reused for every query.
    corpus_embeddings = embedding_model.encode(corpus, normalize_embeddings=True)

    results = {f"recall@{k}": [] for k in k_values}
    results["mrr"] = []

    for query, relevant_ids in zip(queries, relevant_doc_ids):
        relevant_set = set(relevant_ids)
        if not relevant_set:
            # Nothing to retrieve: recall would divide by zero.
            continue

        query_emb = embedding_model.encode([query], normalize_embeddings=True)[0]

        # With normalized embeddings requested on both sides, the dot
        # product serves as cosine similarity.
        scores = corpus_embeddings @ query_emb
        ranked_ids = np.argsort(-scores).tolist()  # Descending order

        for k in k_values:
            hits = relevant_set.intersection(ranked_ids[:k])
            results[f"recall@{k}"].append(len(hits) / len(relevant_set))

        # MRR: reciprocal rank of the first relevant document, 0.0 if none
        # of the relevant ids appears in the ranking.
        first_hit_rank = next(
            (
                rank
                for rank, doc_id in enumerate(ranked_ids, 1)
                if doc_id in relevant_set
            ),
            None,
        )
        results["mrr"].append(1.0 / first_hit_rank if first_hit_rank else 0.0)

    return {
        metric: (sum(vals) / len(vals) if vals else 0.0)
        for metric, vals in results.items()
    }
# Compare models
def benchmark_models(
    model_names: list[str],
    queries: list[str],
    corpus: list[str],
    relevant_doc_ids: list[list[int]],
) -> list[dict]:
    """Evaluate several sentence-transformer models on one test set.

    Each name in ``model_names`` is loaded with ``SentenceTransformer`` and
    scored with ``evaluate_retrieval_quality`` using its default ``k_values``.
    The metrics for each model are printed as they are computed.

    Returns:
        One dict per model (``"model"`` plus its metrics), sorted by
        ``recall@5`` from best to worst.
    """
    results = []
    for name in model_names:
        model = SentenceTransformer(name)
        metrics = evaluate_retrieval_quality(model, queries, corpus, relevant_doc_ids)
        results.append({"model": name, **metrics})
        print(f"{name}: {metrics}")
    return sorted(results, key=lambda x: x["recall@5"], reverse=True)


# Choosing the Right Embedding Model
def recommend_embedding_model(requirements: dict) -> str:
    """Heuristic embedding model recommendation.

    Reads these optional truthy flags from ``requirements``, checked in this
    order (the first match wins): ``privacy_required`` (optionally with
    ``multilingual``), ``cost_sensitive``, ``highest_quality``,
    ``clinical_domain``.

    Returns:
        A model name, or for ``clinical_domain`` a label naming the base
        model to fine-tune.
    """
    if requirements.get("privacy_required"):
        # Must run locally
        if requirements.get("multilingual"):
            return "BAAI/bge-m3"
        return "BAAI/bge-large-en-v1.5"
    if requirements.get("cost_sensitive"):
        # 5× cheaper than ada-002 (0.02 vs 0.10 in OPENAI_EMBEDDING_MODELS)
        return "text-embedding-3-small"
    if requirements.get("highest_quality"):
        return "text-embedding-3-large"  # or e5-mistral-7b-instruct
    if requirements.get("clinical_domain"):
        # Consider fine-tuning BAAI/bge-large-en-v1.5 on clinical Q&A pairs
        return "BAAI/bge-large-en-v1.5 (fine-tuned)"
    # Default: good balance of cost and quality
    return "text-embedding-3-small"


# Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.