GenAI & LLM Interviews · Lesson 14 of 30

Hybrid Search: Vector + Keyword

Why Hybrid Search

Pure semantic (dense) search is great at finding conceptually similar documents but fails when:

The query contains specific technical terms (drug names, medical codes, model numbers)
Exact keyword match matters (a patient's name, a specific ICD code, a serial number)
Rare terms aren't in the embedding space (new drug names, proprietary terminology)

BM25 (sparse retrieval) excels at exact keyword matching but fails at:

Paraphrase — "cardiac event" vs "myocardial infarction" — same concept, different words
Synonym handling — different terminology for the same idea
Multi-hop reasoning — query and answer share no common words

Hybrid search combines both, capturing the strengths of each.

BM25: Sparse Retrieval

Python

from rank_bm25 import BM25Okapi
import numpy as np
import re

def tokenize(text: str) -> list[str]:
    """Simple tokenization for BM25."""
    # Lowercase and split on non-alphanumeric
    tokens = re.findall(r'\b[a-z0-9]+\b', text.lower())
    return tokens

class BM25Index:
    """BM25 retrieval index."""

    def __init__(self, documents: list[dict]):
        self.documents = documents
        self.corpus = [doc["content"] for doc in documents]

        # Tokenize all documents
        tokenized_corpus = [tokenize(doc) for doc in self.corpus]

        # Build BM25 index
        # k1=1.5 (term frequency saturation), b=0.75 (length normalization)
        self.bm25 = BM25Okapi(tokenized_corpus, k1=1.5, b=0.75)

    def search(self, query: str, top_k: int = 10) -> list[dict]:
        """Search using BM25."""
        query_tokens = tokenize(query)
        scores = self.bm25.get_scores(query_tokens)

        # Get top-k indices
        top_indices = np.argsort(-scores)[:top_k]

        return [
            {
                "document": self.documents[i],
                "score": float(scores[i]),
                "rank": rank + 1,
            }
            for rank, i in enumerate(top_indices)
            if scores[i] > 0  # Only return non-zero scores
        ]

    def get_all_scores(self, query: str) -> np.ndarray:
        """Get BM25 scores for all documents (for hybrid fusion)."""
        return self.bm25.get_scores(tokenize(query))


# Example
documents = [
    {"id": "1", "content": "Warfarin interacts with clarithromycin via CYP2C9 inhibition."},
    {"id": "2", "content": "Metformin should be withheld before procedures requiring contrast."},
    {"id": "3", "content": "The blood thinner warfarin requires regular INR monitoring."},
]

bm25_index = BM25Index(documents)
results = bm25_index.search("warfarin drug interaction")
# Doc 1 scores highest (exact "warfarin" + "interaction"), Doc 3 also matches "warfarin"

Reciprocal Rank Fusion (RRF)

RRF combines ranking from multiple retrieval systems without needing to normalize scores:

Python

def reciprocal_rank_fusion(
    ranked_lists: list[list[str]],   # Each list is a ranked list of document IDs
    k: int = 60,                      # RRF constant (controls rank sensitivity)
    weights: list[float] = None,     # Optional per-system weights
) -> dict[str, float]:
    """
    Combine multiple ranked lists using Reciprocal Rank Fusion.
    
    For each document, score = Σ weight_i / (k + rank_i)
    where rank is 1-indexed position in each list.
    
    k=60 is the commonly recommended default (from Cormack et al., 2009).
    """
    if weights is None:
        weights = [1.0] * len(ranked_lists)

    scores = {}
    for ranked_list, weight in zip(ranked_lists, weights):
        for rank, doc_id in enumerate(ranked_list, 1):
            if doc_id not in scores:
                scores[doc_id] = 0.0
            scores[doc_id] += weight / (k + rank)

    return scores


class HybridRetriever:
    """Combines dense semantic search with BM25 sparse retrieval using RRF."""

    def __init__(
        self,
        documents: list[dict],
        embeddings: np.ndarray,
        dense_weight: float = 0.5,
        sparse_weight: float = 0.5,
    ):
        self.documents = documents
        self.embeddings = embeddings
        self.bm25_index = BM25Index(documents)
        self.dense_weight = dense_weight
        self.sparse_weight = sparse_weight

    def retrieve(
        self,
        query: str,
        query_embedding: np.ndarray,
        top_k: int = 10,
    ) -> list[dict]:
        """Retrieve top-k documents using hybrid search."""

        # Dense retrieval: cosine similarity
        dense_scores = self.embeddings @ query_embedding
        dense_ranked_ids = [
            str(self.documents[i]["id"])
            for i in np.argsort(-dense_scores)[:top_k * 2]  # Over-retrieve, then fuse
        ]

        # Sparse retrieval: BM25
        bm25_scores = self.bm25_index.get_all_scores(query)
        sparse_ranked_ids = [
            str(self.documents[i]["id"])
            for i in np.argsort(-bm25_scores)[:top_k * 2]
        ]

        # Combine with RRF
        rrf_scores = reciprocal_rank_fusion(
            ranked_lists=[dense_ranked_ids, sparse_ranked_ids],
            weights=[self.dense_weight, self.sparse_weight],
        )

        # Sort by RRF score
        sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

        # Retrieve document content
        doc_map = {str(doc["id"]): doc for doc in self.documents}
        return [
            {
                "document": doc_map[doc_id],
                "rrf_score": score,
            }
            for doc_id, score in sorted_docs
            if doc_id in doc_map
        ]

Weighted Score Combination

Alternative to RRF: normalize scores and combine with weights:

Python

def min_max_normalize(scores: np.ndarray) -> np.ndarray:
    """Normalize scores to [0, 1] range."""
    min_score = scores.min()
    max_score = scores.max()
    if max_score == min_score:
        return np.zeros_like(scores)
    return (scores - min_score) / (max_score - min_score)


def weighted_combination(
    dense_scores: np.ndarray,
    sparse_scores: np.ndarray,
    alpha: float = 0.7,  # Weight for dense; (1-alpha) for sparse
) -> np.ndarray:
    """
    Combine normalized dense and sparse scores.
    
    alpha = 1.0 → pure dense
    alpha = 0.0 → pure sparse
    alpha = 0.7 → typical good default (dense-dominant)
    """
    norm_dense = min_max_normalize(dense_scores)
    norm_sparse = min_max_normalize(sparse_scores)

    return alpha * norm_dense + (1 - alpha) * norm_sparse


# The main downside: score normalization is sensitive to the score distribution
# If BM25 scores are concentrated near 0 with a few high outliers,
# normalization amplifies noise from low-scoring documents.
# RRF avoids this by using ranks instead of raw scores.

Qdrant Hybrid Search

Qdrant supports dense + sparse vectors natively:

Python

from qdrant_client import QdrantClient
from qdrant_client.models import (
    VectorParams, SparseVectorParams,
    PointStruct, SparseVector,
    NamedVector, NamedSparseVector,
    SearchRequest, FusionQuery, Fusion,
)

client = QdrantClient(url="http://localhost:6333")

# Create collection with both dense and sparse vectors
def create_hybrid_collection(collection_name: str) -> None:
    client.create_collection(
        collection_name=collection_name,
        vectors_config={"dense": VectorParams(size=1536, distance="Cosine")},
        sparse_vectors_config={"sparse": SparseVectorParams()},
    )


def compute_sparse_vector(text: str, vocabulary: dict) -> SparseVector:
    """Convert text to sparse BM25-style vector using vocabulary."""
    tokens = tokenize(text)
    token_counts = {}
    for token in tokens:
        if token in vocabulary:
            idx = vocabulary[token]
            token_counts[idx] = token_counts.get(idx, 0) + 1

    # Simple TF (Term Frequency) — Qdrant handles IDF internally
    indices = list(token_counts.keys())
    values = [float(count) for count in token_counts.values()]

    return SparseVector(indices=indices, values=values)


def upsert_with_sparse(
    collection_name: str,
    documents: list[dict],
    dense_embeddings: list[list[float]],
    vocabulary: dict,
) -> None:
    """Upsert documents with both dense and sparse vectors."""
    points = []
    for i, (doc, emb) in enumerate(zip(documents, dense_embeddings)):
        sparse_vec = compute_sparse_vector(doc["content"], vocabulary)
        points.append(
            PointStruct(
                id=i,
                vector={
                    "dense": emb,
                    "sparse": sparse_vec,
                },
                payload={"content": doc["content"], "title": doc["title"]},
            )
        )

    client.upsert(collection_name=collection_name, points=points)


# Hybrid query with Qdrant's native fusion
def hybrid_query_qdrant(
    collection_name: str,
    dense_vector: list[float],
    sparse_vector: SparseVector,
    top_k: int = 5,
) -> list[dict]:
    """Query using Qdrant's native hybrid search with RRF fusion."""
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            # Dense retrieval
            {"query": dense_vector, "using": "dense", "limit": top_k * 2},
            # Sparse retrieval
            {"query": sparse_vector, "using": "sparse", "limit": top_k * 2},
        ],
        query=FusionQuery(fusion=Fusion.RRF),  # Apply RRF fusion
        limit=top_k,
        with_payload=True,
    )

    return [
        {
            "content": hit.payload.get("content", ""),
            "score": hit.score,
        }
        for hit in results.points
    ]

When to Use Hybrid Search

Pure dense retrieval is sufficient when:

Documents are long-form natural language (prose, paragraphs)
Queries are conceptual ("what are the side effects of...")
Domain terminology is common enough to be in pretraining
Embedding model is well-calibrated for the domain

Hybrid search significantly helps when:

Queries contain rare proper nouns (new drug names, patient names, product codes)
Short documents (titles, one-liners) — dense models struggle without context
Users search with exact technical terms from reference material
Recall is critical and you can afford slightly higher latency

Practical benchmark: On clinical drug information retrieval in internal testing:

Dense only: 72% recall@5
BM25 only: 68% recall@5
Hybrid (RRF, α=0.6): 81% recall@5

The gains are most pronounced for rare drug names and specific ICD codes — exactly the precision-critical cases.

RAG in Production: Pipelines & Pitfalls

Next Lesson

RAG Evaluation: Recall, MRR, Golden Sets