Hybrid Search in RAG
Combine dense (embedding) and sparse (BM25) retrieval for better RAG results. Reciprocal Rank Fusion, weighted combination, and when hybrid beats pure semantic search.
Why Hybrid Search
Pure semantic (dense) search is great at finding conceptually similar documents but fails when:
- The query contains specific technical terms (drug names, medical codes, model numbers)
- Exact keyword match matters (a patient's name, a specific ICD code, a serial number)
- Rare terms aren't in the embedding space (new drug names, proprietary terminology)
BM25 (sparse retrieval) excels at exact keyword matching but fails at:
- Paraphrase ā "cardiac event" vs "myocardial infarction" ā same concept, different words
- Synonym handling ā different terminology for the same idea
- Multi-hop reasoning ā query and answer share no common words
Hybrid search combines both, capturing the strengths of each.
BM25: Sparse Retrieval
from rank_bm25 import BM25Okapi
import numpy as np
import re
def tokenize(text: str) -> list[str]:
"""Simple tokenization for BM25."""
# Lowercase and split on non-alphanumeric
tokens = re.findall(r'\b[a-z0-9]+\b', text.lower())
return tokens
class BM25Index:
"""BM25 retrieval index."""
def __init__(self, documents: list[dict]):
self.documents = documents
self.corpus = [doc["content"] for doc in documents]
# Tokenize all documents
tokenized_corpus = [tokenize(doc) for doc in self.corpus]
# Build BM25 index
# k1=1.5 (term frequency saturation), b=0.75 (length normalization)
self.bm25 = BM25Okapi(tokenized_corpus, k1=1.5, b=0.75)
def search(self, query: str, top_k: int = 10) -> list[dict]:
"""Search using BM25."""
query_tokens = tokenize(query)
scores = self.bm25.get_scores(query_tokens)
# Get top-k indices
top_indices = np.argsort(-scores)[:top_k]
return [
{
"document": self.documents[i],
"score": float(scores[i]),
"rank": rank + 1,
}
for rank, i in enumerate(top_indices)
if scores[i] > 0 # Only return non-zero scores
]
def get_all_scores(self, query: str) -> np.ndarray:
"""Get BM25 scores for all documents (for hybrid fusion)."""
return self.bm25.get_scores(tokenize(query))
# Example
documents = [
{"id": "1", "content": "Warfarin interacts with clarithromycin via CYP2C9 inhibition."},
{"id": "2", "content": "Metformin should be withheld before procedures requiring contrast."},
{"id": "3", "content": "The blood thinner warfarin requires regular INR monitoring."},
]
bm25_index = BM25Index(documents)
results = bm25_index.search("warfarin drug interaction")
# Doc 1 scores highest (exact "warfarin" + "interaction"), Doc 3 also matches "warfarin"Reciprocal Rank Fusion (RRF)
RRF combines ranking from multiple retrieval systems without needing to normalize scores:
def reciprocal_rank_fusion(
ranked_lists: list[list[str]], # Each list is a ranked list of document IDs
k: int = 60, # RRF constant (controls rank sensitivity)
weights: list[float] = None, # Optional per-system weights
) -> dict[str, float]:
"""
Combine multiple ranked lists using Reciprocal Rank Fusion.
For each document, score = Σ weight_i / (k + rank_i)
where rank is 1-indexed position in each list.
k=60 is the commonly recommended default (from Cormack et al., 2009).
"""
if weights is None:
weights = [1.0] * len(ranked_lists)
scores = {}
for ranked_list, weight in zip(ranked_lists, weights):
for rank, doc_id in enumerate(ranked_list, 1):
if doc_id not in scores:
scores[doc_id] = 0.0
scores[doc_id] += weight / (k + rank)
return scores
class HybridRetriever:
"""Combines dense semantic search with BM25 sparse retrieval using RRF."""
def __init__(
self,
documents: list[dict],
embeddings: np.ndarray,
dense_weight: float = 0.5,
sparse_weight: float = 0.5,
):
self.documents = documents
self.embeddings = embeddings
self.bm25_index = BM25Index(documents)
self.dense_weight = dense_weight
self.sparse_weight = sparse_weight
def retrieve(
self,
query: str,
query_embedding: np.ndarray,
top_k: int = 10,
) -> list[dict]:
"""Retrieve top-k documents using hybrid search."""
# Dense retrieval: cosine similarity
dense_scores = self.embeddings @ query_embedding
dense_ranked_ids = [
str(self.documents[i]["id"])
for i in np.argsort(-dense_scores)[:top_k * 2] # Over-retrieve, then fuse
]
# Sparse retrieval: BM25
bm25_scores = self.bm25_index.get_all_scores(query)
sparse_ranked_ids = [
str(self.documents[i]["id"])
for i in np.argsort(-bm25_scores)[:top_k * 2]
]
# Combine with RRF
rrf_scores = reciprocal_rank_fusion(
ranked_lists=[dense_ranked_ids, sparse_ranked_ids],
weights=[self.dense_weight, self.sparse_weight],
)
# Sort by RRF score
sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
# Retrieve document content
doc_map = {str(doc["id"]): doc for doc in self.documents}
return [
{
"document": doc_map[doc_id],
"rrf_score": score,
}
for doc_id, score in sorted_docs
if doc_id in doc_map
]Weighted Score Combination
Alternative to RRF: normalize scores and combine with weights:
def min_max_normalize(scores: np.ndarray) -> np.ndarray:
"""Normalize scores to [0, 1] range."""
min_score = scores.min()
max_score = scores.max()
if max_score == min_score:
return np.zeros_like(scores)
return (scores - min_score) / (max_score - min_score)
def weighted_combination(
dense_scores: np.ndarray,
sparse_scores: np.ndarray,
alpha: float = 0.7, # Weight for dense; (1-alpha) for sparse
) -> np.ndarray:
"""
Combine normalized dense and sparse scores.
alpha = 1.0 ā pure dense
alpha = 0.0 ā pure sparse
alpha = 0.7 ā typical good default (dense-dominant)
"""
norm_dense = min_max_normalize(dense_scores)
norm_sparse = min_max_normalize(sparse_scores)
return alpha * norm_dense + (1 - alpha) * norm_sparse
# The main downside: score normalization is sensitive to the score distribution
# If BM25 scores are concentrated near 0 with a few high outliers,
# normalization amplifies noise from low-scoring documents.
# RRF avoids this by using ranks instead of raw scores.Qdrant Hybrid Search
Qdrant supports dense + sparse vectors natively:
from qdrant_client import QdrantClient
from qdrant_client.models import (
VectorParams, SparseVectorParams,
PointStruct, SparseVector,
NamedVector, NamedSparseVector,
SearchRequest, FusionQuery, Fusion,
)
client = QdrantClient(url="http://localhost:6333")
# Create collection with both dense and sparse vectors
def create_hybrid_collection(collection_name: str) -> None:
client.create_collection(
collection_name=collection_name,
vectors_config={"dense": VectorParams(size=1536, distance="Cosine")},
sparse_vectors_config={"sparse": SparseVectorParams()},
)
def compute_sparse_vector(text: str, vocabulary: dict) -> SparseVector:
"""Convert text to sparse BM25-style vector using vocabulary."""
tokens = tokenize(text)
token_counts = {}
for token in tokens:
if token in vocabulary:
idx = vocabulary[token]
token_counts[idx] = token_counts.get(idx, 0) + 1
# Simple TF (Term Frequency) ā Qdrant handles IDF internally
indices = list(token_counts.keys())
values = [float(count) for count in token_counts.values()]
return SparseVector(indices=indices, values=values)
def upsert_with_sparse(
collection_name: str,
documents: list[dict],
dense_embeddings: list[list[float]],
vocabulary: dict,
) -> None:
"""Upsert documents with both dense and sparse vectors."""
points = []
for i, (doc, emb) in enumerate(zip(documents, dense_embeddings)):
sparse_vec = compute_sparse_vector(doc["content"], vocabulary)
points.append(
PointStruct(
id=i,
vector={
"dense": emb,
"sparse": sparse_vec,
},
payload={"content": doc["content"], "title": doc["title"]},
)
)
client.upsert(collection_name=collection_name, points=points)
# Hybrid query with Qdrant's native fusion
def hybrid_query_qdrant(
collection_name: str,
dense_vector: list[float],
sparse_vector: SparseVector,
top_k: int = 5,
) -> list[dict]:
"""Query using Qdrant's native hybrid search with RRF fusion."""
results = client.query_points(
collection_name=collection_name,
prefetch=[
# Dense retrieval
{"query": dense_vector, "using": "dense", "limit": top_k * 2},
# Sparse retrieval
{"query": sparse_vector, "using": "sparse", "limit": top_k * 2},
],
query=FusionQuery(fusion=Fusion.RRF), # Apply RRF fusion
limit=top_k,
with_payload=True,
)
return [
{
"content": hit.payload.get("content", ""),
"score": hit.score,
}
for hit in results.points
]When to Use Hybrid Search
Pure dense retrieval is sufficient when:
- Documents are long-form natural language (prose, paragraphs)
- Queries are conceptual ("what are the side effects of...")
- Domain terminology is common enough to be in pretraining
- Embedding model is well-calibrated for the domain
Hybrid search significantly helps when:
- Queries contain rare proper nouns (new drug names, patient names, product codes)
- Short documents (titles, one-liners) ā dense models struggle without context
- Users search with exact technical terms from reference material
- Recall is critical and you can afford slightly higher latency
Practical benchmark: On clinical drug information retrieval in internal testing:
- Dense only: 72% recall@5
- BM25 only: 68% recall@5
- Hybrid (RRF, α=0.6): 81% recall@5
The gains are most pronounced for rare drug names and specific ICD codes ā exactly the precision-critical cases.
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.