RAG Systems · Lesson 17 of 24
Hybrid Search: Combining Dense and Sparse
Why Hybrid Search
Pure semantic (dense) search is great at finding conceptually similar documents but fails when:
- The query contains specific technical terms (drug names, medical codes, model numbers)
- Exact keyword match matters (a patient's name, a specific ICD code, a serial number)
- Rare terms are poorly represented in the embedding space (new drug names, proprietary terminology)
BM25 (sparse retrieval) excels at exact keyword matching but fails at:
- Paraphrase — "cardiac event" vs "myocardial infarction" — same concept, different words
- Synonym handling — different terminology for the same idea
- Vocabulary mismatch — the query and the relevant passage share no common words (typical of multi-hop questions)
Hybrid search combines both, capturing the strengths of each.
BM25: Sparse Retrieval
Python
from rank_bm25 import BM25Okapi
import numpy as np
import re
def tokenize(text: str) -> list[str]:
    """Split text into lowercase alphanumeric tokens for BM25.

    The text is lowercased, then every maximal run of letters and digits
    becomes one token. Everything else, including underscores, acts as a
    separator.

    Args:
        text: Raw document or query text.

    Returns:
        The tokens in order of appearance; an empty list if none are found.
    """
    # No \b anchors: with them, a run adjacent to "_" or to a non-ASCII
    # letter (both count as word characters) never sits on a word boundary,
    # so tokens such as "icd_10" or accented drug names were dropped entirely.
    # [^\W_] is "word character except underscore", i.e. letters and digits.
    return re.findall(r"[^\W_]+", text.lower())
class BM25Index:
    """Sparse keyword retrieval over a fixed list of documents using BM25 Okapi."""

    def __init__(self, documents: list[dict]):
        """Tokenize every document's "content" field and build the index.

        Args:
            documents: Dicts that each provide a "content" string.
        """
        self.documents = documents
        self.corpus = [entry["content"] for entry in documents]
        # k1 tunes term-frequency saturation, b tunes length normalization.
        self.bm25 = BM25Okapi(
            [tokenize(text) for text in self.corpus],
            k1=1.5,
            b=0.75,
        )

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Return the best-scoring documents for a query, highest score first.

        Only the ``top_k`` highest-scoring documents are considered, and any
        of those whose score is not positive is left out.

        Args:
            query: Free-text query.
            top_k: Maximum number of results.

        Returns:
            Dicts with "document", "score" (float) and 1-based "rank".
        """
        scores = self.bm25.get_scores(tokenize(query))
        best_first = np.argsort(-scores)[:top_k]

        hits = []
        for position, doc_idx in enumerate(best_first, start=1):
            if scores[doc_idx] > 0:
                hits.append(
                    {
                        "document": self.documents[doc_idx],
                        "score": float(scores[doc_idx]),
                        "rank": position,
                    }
                )
        return hits

    def get_all_scores(self, query: str) -> np.ndarray:
        """Return the BM25 score of every indexed document (used for fusion)."""
        query_tokens = tokenize(query)
        return self.bm25.get_scores(query_tokens)
# Example: index three short clinical notes and run a keyword query.
documents = [
    {
        "id": "1",
        "content": "Warfarin interacts with clarithromycin via CYP2C9 inhibition.",
    },
    {
        "id": "2",
        "content": "Metformin should be withheld before procedures requiring contrast.",
    },
    {
        "id": "3",
        "content": "The blood thinner warfarin requires regular INR monitoring.",
    },
]
bm25_index = BM25Index(documents)
results = bm25_index.search("warfarin drug interaction")
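# Docs 1 and 3 both match on "warfarin" only: the tokenizer does no stemming, so "interacts" does not match "interaction".
# Doc 1 ranks slightly higher because it is shorter (BM25 length normalization); Doc 2 scores 0 and is filtered out.
Reciprocal Rank Fusion (RRF)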
RRF combines rankings from multiple retrieval systems without needing to normalize their scores:
Python
def reciprocal_rank_fusion(
    ranked_lists: list[list[str]],  # Each list is a ranked list of document IDs
    k: int = 60,  # RRF constant (controls rank sensitivity)
    weights: list[float] = None,  # Optional per-system weights
) -> dict[str, float]:
    """
    Combine multiple ranked lists using Reciprocal Rank Fusion.

    For each document, score = Σ weight_i / (k + rank_i)
    where rank is the 1-indexed position in each list. A document missing
    from a list simply receives no contribution from that list.
    k=60 is the commonly recommended default (from Cormack et al., 2009).

    Args:
        ranked_lists: One best-first list of document IDs per retrieval system.
        k: Smoothing constant; larger values flatten the rank differences.
        weights: One weight per ranked list. ``None`` weights every list 1.0.

    Returns:
        Mapping of document ID to fused score (unsorted).

    Raises:
        ValueError: If ``weights`` is given but its length differs from
            ``ranked_lists``.
    """
    if weights is None:
        weights = [1.0] * len(ranked_lists)
    elif len(weights) != len(ranked_lists):
        # zip() would otherwise silently ignore the unmatched lists or weights.
        raise ValueError(
            f"weights has {len(weights)} entries but ranked_lists has "
            f"{len(ranked_lists)}"
        )

    scores: dict[str, float] = {}
    for ranked_list, weight in zip(ranked_lists, weights):
        for rank, doc_id in enumerate(ranked_list, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + weight / (k + rank)
    return scores
class HybridRetriever:
    """Combines dense semantic search with BM25 sparse retrieval using RRF."""

    def __init__(
        self,
        documents: list[dict],
        embeddings: np.ndarray,
        dense_weight: float = 0.5,
        sparse_weight: float = 0.5,
    ):
        """Store the corpus and build the BM25 index over it.

        Args:
            documents: Dicts with at least "id" and "content".
            embeddings: One embedding row per document, in the same order as
                ``documents`` (assumed 2-D: documents x dimensions).
            dense_weight: RRF weight of the dense ranking.
            sparse_weight: RRF weight of the BM25 ranking.
        """
        self.documents = documents
        self.embeddings = embeddings
        self.bm25_index = BM25Index(documents)
        self.dense_weight = dense_weight
        self.sparse_weight = sparse_weight

    def retrieve(
        self,
        query: str,
        query_embedding: np.ndarray,
        top_k: int = 10,
    ) -> list[dict]:
        """Retrieve top-k documents using hybrid search.

        Each system contributes up to ``2 * top_k`` candidates; the two
        rankings are fused with weighted RRF and the best ``top_k`` returned.

        Args:
            query: Query text for BM25.
            query_embedding: Embedding of the same query for dense search.
            top_k: Maximum number of results.

        Returns:
            Dicts with "document" and "rrf_score", best first.
        """
        candidate_count = top_k * 2  # Over-retrieve, then fuse

        # Dense retrieval: cosine similarity. The dot product is divided by
        # the vector norms so unnormalized embeddings do not rank longer
        # vectors higher; unit-norm inputs keep the same ranking.
        dot_products = self.embeddings @ query_embedding
        norm_products = np.linalg.norm(self.embeddings, axis=1) * np.linalg.norm(
            query_embedding
        )
        # Zero-norm vectors have a zero dot product; divide by 1 to avoid 0/0.
        dense_scores = dot_products / np.where(norm_products > 0, norm_products, 1.0)
        dense_ranked_ids = [
            str(self.documents[i]["id"])
            for i in np.argsort(-dense_scores)[:candidate_count]
        ]

        # Sparse retrieval: BM25. Documents without any keyword match are
        # excluded, as in BM25Index.search; otherwise they would enter the
        # sparse ranking in arbitrary order and still earn RRF credit.
        bm25_scores = self.bm25_index.get_all_scores(query)
        sparse_ranked_ids = [
            str(self.documents[i]["id"])
            for i in np.argsort(-bm25_scores)[:candidate_count]
            if bm25_scores[i] > 0
        ]

        # Combine with RRF
        rrf_scores = reciprocal_rank_fusion(
            ranked_lists=[dense_ranked_ids, sparse_ranked_ids],
            weights=[self.dense_weight, self.sparse_weight],
        )

        # Sort by RRF score (stable, so ties keep first-seen order)
        sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

        # Every fused ID was derived from self.documents, so the lookup
        # below cannot miss.
        doc_map = {str(doc["id"]): doc for doc in self.documents}
        return [
            {
                "document": doc_map[doc_id],
                "rrf_score": score,
            }
            for doc_id, score in sorted_docs
        ]

# Weighted Score Combination
Alternative to RRF: normalize scores and combine with weights:
Python
def min_max_normalize(scores: np.ndarray) -> np.ndarray:
    """Normalize scores to the [0, 1] range.

    Args:
        scores: 1-D score array (any array-like of numbers is accepted).

    Returns:
        A float array of the same shape. An empty input yields an empty
        array; if all scores are equal, every entry is 0.0 because there is
        no spread to rescale.
    """
    # Always work in float so both branches return the same dtype.
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        # min()/max() raise on empty arrays; there is nothing to normalize.
        return scores
    min_score = scores.min()
    spread = scores.max() - min_score
    if spread == 0:
        return np.zeros_like(scores)
    return (scores - min_score) / spread
def weighted_combination(
    dense_scores: np.ndarray,
    sparse_scores: np.ndarray,
    alpha: float = 0.7,  # Weight for dense; (1-alpha) for sparse
) -> np.ndarray:
    """
    Blend dense and sparse scores after min-max normalizing each.

    The result is alpha * dense + (1 - alpha) * sparse, element-wise:
    alpha = 1.0 → dense only
    alpha = 0.0 → sparse only
    alpha = 0.7 → the default, which favours the dense signal
    """
    dense_part = alpha * min_max_normalize(dense_scores)
    sparse_part = (1 - alpha) * min_max_normalize(sparse_scores)
    return dense_part + sparse_part
# The main downside: min-max normalization is sensitive to the score distribution.
# If BM25 scores are concentrated near 0 with a few high outliers, the outliers
# set the scale and every other document is squeezed toward 0; conversely, when
# all scores are nearly equal, tiny (noisy) differences are stretched across [0, 1].
# RRF avoids this by using ranks instead of raw scores.
Qdrant Hybrid Search
Qdrant supports dense + sparse vectors natively:
Python
from qdrant_client import QdrantClient
from qdrant_client.models import (
VectorParams, SparseVectorParams,
PointStruct, SparseVector,
NamedVector, NamedSparseVector,
SearchRequest, FusionQuery, Fusion,
)
# Module-level Qdrant client used by the helper functions in this section;
# the URL targets port 6333 on localhost.
client = QdrantClient(url="http://localhost:6333")
# Create collection with both dense and sparse vectors
def create_hybrid_collection(collection_name: str, vector_size: int = 1536) -> None:
    """Create a collection with a "dense" and a "sparse" named vector.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Dimensionality of the dense embeddings. The default of
            1536 keeps the previous hard-coded value.
    """
    # Local import so this block does not depend on the file-level import list.
    from qdrant_client.models import Modifier

    client.create_collection(
        collection_name=collection_name,
        vectors_config={"dense": VectorParams(size=vector_size, distance="Cosine")},
        # Sparse vectors stored as raw term frequencies only get BM25-style
        # weighting if the IDF modifier is enabled on the collection; without
        # it Qdrant scores them with a plain dot product.
        sparse_vectors_config={"sparse": SparseVectorParams(modifier=Modifier.IDF)},
    )
def compute_sparse_vector(text: str, vocabulary: dict) -> SparseVector:
    """Build a term-frequency sparse vector for ``text``.

    Tokens missing from ``vocabulary`` are ignored. Each remaining token
    contributes 1.0 to the dimension ``vocabulary[token]``.

    Args:
        text: Text to encode.
        vocabulary: Mapping of token to sparse dimension index.

    Returns:
        SparseVector whose indices appear in first-occurrence order and whose
        values are the matching token counts as floats.
    """
    known_dims = [vocabulary[tok] for tok in tokenize(text) if tok in vocabulary]

    # Raw term frequency only. Any IDF weighting has to come from the Qdrant
    # side (NOTE(review): that depends on the collection's sparse vector
    # configuration; confirm it is enabled).
    term_freq = {}
    for dim in known_dims:
        term_freq[dim] = term_freq.get(dim, 0) + 1

    return SparseVector(
        indices=[dim for dim in term_freq],
        values=[float(term_freq[dim]) for dim in term_freq],
    )
def upsert_with_sparse(
    collection_name: str,
    documents: list[dict],
    dense_embeddings: list[list[float]],
    vocabulary: dict,
) -> None:
    """Upsert documents with both dense and sparse vectors.

    Args:
        collection_name: Target collection.
        documents: Dicts with a "content" string and, optionally, a "title".
        dense_embeddings: One dense embedding per document, in the same order.
        vocabulary: Token to sparse-index mapping used for the sparse vectors.

    Raises:
        ValueError: If ``documents`` and ``dense_embeddings`` differ in length.
    """
    if len(documents) != len(dense_embeddings):
        # zip() would silently drop the unmatched tail and leave the
        # collection incomplete.
        raise ValueError(
            f"got {len(documents)} documents but "
            f"{len(dense_embeddings)} dense embeddings"
        )

    # NOTE: point IDs are positions within this batch, so a later call with
    # a different batch reuses IDs 0..n-1.
    points = [
        PointStruct(
            id=i,
            vector={
                "dense": emb,
                "sparse": compute_sparse_vector(doc["content"], vocabulary),
            },
            # "title" is optional: documents that only carry "id" and
            # "content" must not fail with KeyError.
            payload={"content": doc["content"], "title": doc.get("title", "")},
        )
        for i, (doc, emb) in enumerate(zip(documents, dense_embeddings))
    ]
    client.upsert(collection_name=collection_name, points=points)
# Hybrid query with Qdrant's native fusion
def hybrid_query_qdrant(
    collection_name: str,
    dense_vector: list[float],
    sparse_vector: SparseVector,
    top_k: int = 5,
) -> list[dict]:
    """Run a dense + sparse query and let Qdrant fuse the results with RRF.

    Each prefetch branch asks for ``2 * top_k`` candidates; the fused result
    is limited to ``top_k``.

    Args:
        collection_name: Collection to query.
        dense_vector: Query embedding for the "dense" named vector.
        sparse_vector: Query sparse vector for the "sparse" named vector.
        top_k: Number of fused results to return.

    Returns:
        Dicts with the payload "content" ("" when absent) and the fused "score".
    """
    candidate_limit = top_k * 2
    dense_branch = {"query": dense_vector, "using": "dense", "limit": candidate_limit}
    sparse_branch = {"query": sparse_vector, "using": "sparse", "limit": candidate_limit}

    response = client.query_points(
        collection_name=collection_name,
        prefetch=[dense_branch, sparse_branch],
        query=FusionQuery(fusion=Fusion.RRF),  # Fuse the two branches with RRF
        limit=top_k,
        with_payload=True,
    )

    hits = []
    for point in response.points:
        hits.append(
            {
                "content": point.payload.get("content", ""),
                "score": point.score,
            }
        )
    return hits

# When to Use Hybrid Search
Pure dense retrieval is sufficient when:
- Documents are long-form natural language (prose, paragraphs)
- Queries are conceptual ("what are the side effects of...")
- Domain terminology is common enough to be in pretraining
- Embedding model is well-calibrated for the domain
Hybrid search significantly helps when:
- Queries contain rare proper nouns (new drug names, patient names, product codes)
- Short documents (titles, one-liners) — dense models struggle without context
- Users search with exact technical terms from reference material
- Recall is critical and you can afford slightly higher latency
Practical benchmark (internal testing on clinical drug information retrieval):
- Dense only: 72% recall@5
- BM25 only: 68% recall@5
- Hybrid (RRF, dense weight 0.6 / sparse weight 0.4): 81% recall@5
The gains are most pronounced for rare drug names and specific ICD codes — exactly the precision-critical cases.