Metadata Filtering in RAG
Filter retrieved documents by metadata before or after vector search. This guide covers pre-filtering, post-filtering, and combining semantic similarity with structured data constraints.
Why Metadata Filtering Matters
Semantic search alone can retrieve topically similar documents that are wrong for the context. A query about warfarin in 2025 might retrieve a document from 2015 with outdated dosing guidelines. A query about pediatric dosing might retrieve adult dosing information. Metadata filters constrain retrieval to documents that are semantically similar AND meet structured constraints.
Common metadata dimensions for filtering:
- Date/version (only current guidelines)
- Document type (drug monograph, clinical trial, patient education)
- Specialty or indication (cardiology, nephrology)
- Patient population (adult, pediatric, renal impairment)
- Authority level (FDA label, clinical guideline, case report)
Metadata Schema Design
Design metadata schemas that support your most important filter patterns:
from dataclasses import dataclass, field
from datetime import date
from typing import Optional
from enum import Enum
class DocumentType(str, Enum):
    """Closed set of document categories used as a metadata filter value.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    DRUG_MONOGRAPH = "drug_monograph"
    CLINICAL_GUIDELINE = "clinical_guideline"
    PATIENT_EDUCATION = "patient_education"
    RESEARCH_PAPER = "research_paper"
    DRUG_INTERACTION = "drug_interaction"
    ADVERSE_EVENT = "adverse_event"
class PatientPopulation(str, Enum):
    """Closed set of patient populations a document can apply to.

    Members are also ``str`` instances, so each one compares equal to its
    plain string value.
    """

    ADULT = "adult"
    PEDIATRIC = "pediatric"
    GERIATRIC = "geriatric"
    RENAL_IMPAIRMENT = "renal_impairment"
    HEPATIC_IMPAIRMENT = "hepatic_impairment"
    PREGNANCY = "pregnancy"
@dataclass
class DocumentMetadata:
    """Metadata schema for clinical documents.

    Holds the structured attributes that retrieval filters operate on and
    knows how to serialise itself into a plain dict for vector DB storage.
    """

    document_id: str
    title: str
    document_type: DocumentType
    drug_names: list[str]  # Primary drugs covered
    drug_classes: list[str]  # ATC drug classes
    populations: list[PatientPopulation]  # Applicable populations
    publication_date: date
    last_updated: date
    source: str  # "FDA", "Lexicomp", "KDIGO", etc.
    authority_level: int  # 1=FDA label, 2=guideline, 3=consensus, 4=case report
    specialty: list[str]  # ["cardiology", "nephrology"]
    is_current: bool = True  # False = superseded
    language: str = "en"
    version: str = "1.0"

    def to_vector_db_metadata(self) -> dict:
        """Convert to flat dict for vector DB storage.

        Enum members are stored as their string values and dates as ISO-8601
        strings, so the payload contains only plain JSON-compatible values.

        Returns:
            A new dict; the instance itself is not modified.
        """
        return {
            "document_id": self.document_id,
            "title": self.title,
            "document_type": self.document_type.value,
            # The query helpers in this file lowercase the requested drug name
            # before matching, so store names lowercased too; otherwise a
            # document indexed as "Warfarin" would never match "warfarin".
            "drug_names": [name.lower() for name in self.drug_names],
            "drug_classes": self.drug_classes,
            "populations": [p.value for p in self.populations],
            "publication_date": self.publication_date.isoformat(),
            "last_updated": self.last_updated.isoformat(),
            "source": self.source,
            "authority_level": self.authority_level,
            "specialty": self.specialty,
            "is_current": self.is_current,
            "language": self.language,
            # Part of the schema, so keep it in the stored payload as well.
            "version": self.version,
        }

# Pre-Filtering (Filter Before Search)
Apply metadata filters before the vector similarity search. This reduces the search space and is faster when the filter dramatically narrows the candidate set:
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny, Range
# Module-level Qdrant client used by search_with_prefilter; it targets a
# Qdrant instance at localhost:6333.
# NOTE(review): the name `client` is rebound further down (OpenAI client, then
# Chroma client). If these snippets are combined into one module, give each
# client a distinct name so the search helpers keep talking to the right service.
client = QdrantClient(url="http://localhost:6333")
def search_with_prefilter(
    collection_name: str,
    query_embedding: list[float],
    top_k: int = 5,
    document_type: Optional[str] = None,
    drug_name: Optional[str] = None,
    population: Optional[str] = None,
    min_authority_level: Optional[int] = None,
    max_authority_level: Optional[int] = None,
    only_current: bool = True,
) -> list[dict]:
    """
    Qdrant search with metadata pre-filtering.
    Qdrant applies the filter before computing similarities.

    Args:
        collection_name: Qdrant collection to search.
        query_embedding: Query vector.
        top_k: Maximum number of hits to return.
        document_type: If given, require an exact match on ``document_type``.
        drug_name: If given, require ``drug_names`` to contain this name
            (compared in lowercase).
        population: If given, require ``populations`` to contain this value.
        min_authority_level: Inclusive lower bound on ``authority_level``.
        max_authority_level: Inclusive upper bound on ``authority_level``.
        only_current: When true, require ``is_current`` to be ``True``.

    Returns:
        One dict per hit with ``content``, ``title``, ``score`` and
        ``metadata`` (the hit's payload, or ``{}`` when it has none).
    """
    conditions = []
    if only_current:
        conditions.append(FieldCondition(key="is_current", match=MatchValue(value=True)))
    if document_type:
        conditions.append(FieldCondition(key="document_type", match=MatchValue(value=document_type)))
    if drug_name:
        # MatchAny checks if any element in the list matches
        conditions.append(FieldCondition(key="drug_names", match=MatchAny(any=[drug_name.lower()])))
    if population:
        conditions.append(FieldCondition(key="populations", match=MatchAny(any=[population])))
    if min_authority_level is not None or max_authority_level is not None:
        # Either bound may be None here; it is passed through unchanged.
        conditions.append(FieldCondition(
            key="authority_level",
            range=Range(
                gte=min_authority_level,
                lte=max_authority_level,
            ),
        ))

    # With no conditions at all, search the whole collection unfiltered.
    filter_obj = Filter(must=conditions) if conditions else None
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        query_filter=filter_obj,
        limit=top_k,
        with_payload=True,
    )

    hits = []
    for hit in results:
        # A point stored without a payload can come back with payload=None;
        # fall back to an empty dict instead of failing on `.get`.
        payload = hit.payload or {}
        hits.append({
            "content": payload.get("content", ""),
            "title": payload.get("title", ""),
            "score": hit.score,
            "metadata": payload,
        })
    return hits
# Example: restrict retrieval to current FDA drug labels that cover warfarin.
# NOTE(review): embed_text is not defined in this snippet; it is expected to
# turn the query string into an embedding vector.
results = search_with_prefilter(
    collection_name="clinical_documents",
    query_embedding=embed_text("warfarin dosing in atrial fibrillation"),
    only_current=True,
    document_type="drug_monograph",
    drug_name="warfarin",
    # Both bounds pinned to 1: FDA label only
    min_authority_level=1,
    max_authority_level=1,
)

# Post-Filtering (Filter After Search)
Retrieve more candidates than needed, then apply filters:
def _metadata_matches(meta: dict, filters: Optional[dict]) -> bool:
    """Return True when *meta* satisfies every condition in *filters*."""
    if not filters:
        return True
    for key, wanted in filters.items():
        actual = meta.get(key)
        # Treat a scalar filter as a one-element list so that scalar and list
        # filters share the same "any of these values" semantics.
        wanted_values = wanted if isinstance(wanted, list) else [wanted]
        if isinstance(actual, list):
            # List-valued metadata (e.g. populations) matches on any overlap.
            if not any(v in actual for v in wanted_values):
                return False
        elif actual not in wanted_values:
            return False
    return True


def search_with_postfilter(
    query_embedding: list[float],
    all_documents: list[dict],
    all_embeddings: list[list[float]],
    top_k: int = 5,
    filters: Optional[dict] = None,
    over_retrieve_factor: int = 5,
) -> list[dict]:
    """
    Post-filtering: retrieve top-N, then filter by metadata.
    Less efficient but useful when pre-filtering isn't supported
    or when filters are complex.

    Args:
        query_embedding: Query vector.
        all_documents: Documents, each a dict with an optional ``metadata`` dict.
        all_embeddings: One embedding per document, in the same order.
        top_k: Maximum number of results to return.
        filters: Mapping of metadata key to required value. A list value means
            "any of these"; when the document's own value is a list, any
            overlap counts as a match.
        over_retrieve_factor: How many times ``top_k`` candidates are taken
            from the similarity ranking before filtering.

    Returns:
        Up to ``top_k`` dicts with ``document`` and ``similarity`` (the dot
        product with the query), ordered by decreasing similarity. Fewer are
        returned when not enough candidates pass the filters.
    """
    import numpy as np

    # Nothing to rank: avoid a shape error from multiplying an empty matrix.
    if not all_documents or len(all_embeddings) == 0 or top_k <= 0:
        return []

    # Retrieve many more candidates than needed
    over_retrieve_k = top_k * max(over_retrieve_factor, 1)

    # Compute similarities
    similarities = np.asarray(all_embeddings) @ np.asarray(query_embedding)

    # Stable sort keeps equal-scoring documents in their original order.
    candidate_indices = np.argsort(-similarities, kind="stable")[:over_retrieve_k]

    matches = []
    for idx in candidate_indices:
        doc = all_documents[idx]
        if not _metadata_matches(doc.get("metadata") or {}, filters):
            continue
        matches.append({
            "document": doc,
            "similarity": float(similarities[idx]),
        })
        if len(matches) >= top_k:
            break
    return matches

# Dynamic Filter Generation
Let the LLM generate filters from the user query:
from openai import OpenAI
import json
# OpenAI client used by extract_filters_from_query; no credentials are passed
# explicitly here.
# NOTE(review): presumably the API key is picked up from the environment — confirm.
client = OpenAI()
def extract_filters_from_query(query: str) -> dict:
    """Use an LLM to extract metadata filters from a natural language query.

    Sends the query to the chat-completions API with a JSON-object response
    format and reads the ``filters`` object out of the reply.

    Args:
        query: The user's natural-language question.

    Returns:
        A dict of filter field to value. Returns ``{}`` when the reply is
        empty, is not valid JSON, or has no dict under ``"filters"``, so
        callers can treat the result as "no filters".
    """
    FILTER_EXTRACTION_PROMPT = """Extract metadata filters from this clinical query.
Available filter fields:
- drug_name: specific drug name (e.g., "warfarin", "metformin")
- document_type: "drug_monograph", "clinical_guideline", "patient_education", "drug_interaction"
- population: "adult", "pediatric", "geriatric", "renal_impairment", "hepatic_impairment", "pregnancy"
- specialty: "cardiology", "nephrology", "endocrinology", "infectious_disease"
Return JSON with only the filters that are explicitly mentioned or strongly implied:
{"filters": {"field": value, ...}}
If no filters are applicable, return: {"filters": {}}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": FILTER_EXTRACTION_PROMPT},
            {"role": "user", "content": f"Query: {query}"},
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )
    raw = response.choices[0].message.content
    # Filter extraction is best-effort: an empty or malformed reply degrades
    # to "no filters" rather than aborting the retrieval.
    try:
        parsed = json.loads(raw) if raw else {}
    except json.JSONDecodeError:
        return {}
    filters = parsed.get("filters") if isinstance(parsed, dict) else None
    return filters if isinstance(filters, dict) else {}
# Test: run the extractor on two sample queries and print what comes back.
# Each call goes through extract_filters_from_query, i.e. one chat-completions
# request per query.
query1 = "What's the warfarin dose adjustment for atrial fibrillation in elderly patients?"
filters1 = extract_filters_from_query(query1)
print(filters1)
# → {"drug_name": "warfarin", "population": "geriatric"}
# The second query names a drug class rather than a specific drug. "drug_class"
# is not one of the fields listed in the extraction prompt, so the model may
# return it anyway or return no filters at all.
query2 = "What are the common side effects of beta-blockers?"
filters2 = extract_filters_from_query(query2)
print(filters2)
# → {"drug_class": "beta-blockers"} or {}
def smart_retrieval(
    query: str,
    query_embedding: list[float],
    retriever,
    use_dynamic_filters: bool = True,
) -> list[dict]:
    """Retrieve documents, optionally constrained by filters extracted from the query.

    When dynamic filters are enabled and the extractor returns any, a filtered
    search runs first. If it yields fewer than three results, an unfiltered
    search replaces it. With no filters, a plain search is issued.
    """
    extracted = extract_filters_from_query(query) if use_dynamic_filters else {}

    # No usable filters: plain semantic search.
    if not extracted:
        return retriever.search(query_embedding)

    filtered_hits = retriever.search(query_embedding, filters=extracted)
    if len(filtered_hits) >= 3:
        return filtered_hits

    # Fallback: the filters were too restrictive, so retry without them.
    return retriever.search(query_embedding, filters={})

# Temporal Filtering: Handling Document Freshness
from datetime import datetime, timedelta
def get_freshness_score(
    doc_date: str,
    decay_rate: float = 0.1,
    max_age_days: int = 365 * 5,
    now: Optional[datetime] = None,
) -> float:
    """
    Score document freshness. Recent docs score higher.
    Useful for re-ranking after retrieval.
    decay_rate controls how quickly older docs are penalized.

    Args:
        doc_date: ISO-8601 date or datetime string for the document.
        decay_rate: Total penalty applied linearly as the age goes from 0 to
            ``max_age_days``.
        max_age_days: Documents older than this score 0.0.
        now: Reference time to measure age against. Defaults to the current
            time, timezone-aware when ``doc_date`` carries an offset. If
            supplied, it must match ``doc_date`` in being naive or aware.

    Returns:
        A score in ``[0.0, 1.0]``.

    Raises:
        ValueError: If ``doc_date`` is not a valid ISO-8601 string.
    """
    doc_dt = datetime.fromisoformat(doc_date)
    if now is None:
        # Match the document's timezone-awareness; subtracting a naive "now"
        # from an aware timestamp would raise TypeError.
        now = datetime.now(tz=doc_dt.tzinfo)

    # Documents dated in the future count as brand new instead of scoring > 1.
    age_days = max((now - doc_dt).days, 0)
    if age_days > max_age_days:
        return 0.0
    if max_age_days <= 0:
        # Only reachable with age 0 and a zero window; avoid dividing by zero.
        return 1.0

    score = 1.0 - (decay_rate * age_days / max_age_days)
    # Keep the result in range for out-of-range decay rates.
    return min(1.0, max(0.0, score))
def combine_similarity_with_freshness(
    results: list[dict],
    freshness_weight: float = 0.2,
) -> list[dict]:
    """Re-rank results by a weighted blend of similarity and document freshness.

    Each result dict gets a ``combined_score`` key written in place. The
    returned list is a new list holding the same dicts, highest score first.
    A result with no ``last_updated`` metadata is dated "2020-01-01", and a
    result with no ``score`` uses 0.5.
    """
    similarity_weight = 1 - freshness_weight
    for entry in results:
        last_updated = entry.get("metadata", {}).get("last_updated", "2020-01-01")
        recency = get_freshness_score(last_updated)
        relevance = entry.get("score", 0.5)
        entry["combined_score"] = (
            similarity_weight * relevance + freshness_weight * recency
        )

    ranked = list(results)
    ranked.sort(key=lambda entry: entry["combined_score"], reverse=True)
    return ranked

# Chroma Metadata Filtering
import chromadb
from chromadb.utils import embedding_functions
# Chroma client created with a local path; presumably data is persisted under
# ./chroma_db — confirm against the Chroma docs.
client = chromadb.PersistentClient(path="./chroma_db")
# Embedding function backed by OpenAI's "text-embedding-3-small" model.
# NOTE(review): "your-api-key" is a placeholder — load the real key from
# configuration or the environment rather than hard-coding it here.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="your-api-key",
    model_name="text-embedding-3-small",
)
# Fetch the "clinical_docs" collection (creating it if needed) and attach the
# embedding function, so query_chroma_with_filters can query with raw text.
collection = client.get_or_create_collection("clinical_docs", embedding_function=openai_ef)
# Chroma filter syntax
def query_chroma_with_filters(
    query_text: str,
    n_results: int = 5,
    document_type: str = None,
    drug_name: str = None,
    is_current: bool = True,
) -> list[dict]:
    """Query the Chroma collection by text, constrained by metadata filters.

    Each enabled filter becomes one ``where`` clause: a single clause is
    passed as-is, several are combined under ``$and``, and none means no
    ``where`` filter at all.

    Returns:
        One dict per hit with ``content``, ``metadata`` and ``score``
        (computed as ``1 - distance``).
    """
    clauses = []
    if is_current:
        clauses.append({"is_current": {"$eq": True}})
    if document_type:
        clauses.append({"document_type": {"$eq": document_type}})
    if drug_name:
        # NOTE(review): confirm that the installed Chroma version accepts
        # "$contains" in a metadata `where` clause and list-valued metadata.
        clauses.append({"drug_names": {"$contains": drug_name.lower()}})

    # Chroma's where clause: None, a bare clause, or an explicit $and.
    if not clauses:
        where = None
    elif len(clauses) == 1:
        where = clauses[0]
    else:
        where = {"$and": clauses}

    response = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        where=where,
        include=["documents", "metadatas", "distances"],
    )

    # Only one query text was sent, so read the first (only) result row.
    texts = response["documents"][0]
    metadatas = response["metadatas"][0]
    distances = response["distances"][0]

    hits = []
    for text, meta, distance in zip(texts, metadatas, distances):
        hits.append({
            "content": text,
            "metadata": meta,
            # NOTE(review): 1 - distance reads as a similarity only for a
            # cosine-style distance; confirm the collection's metric.
            "score": 1 - distance,
        })
    return hits

# Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.