Metadata Filtering in RAG

Why Metadata Filtering Matters

Semantic search alone can retrieve topically similar documents that are wrong for the context. A query about warfarin in 2025 might retrieve a document from 2015 with outdated dosing guidelines. A query about pediatric dosing might retrieve adult dosing information. Metadata filters constrain retrieval to documents that are semantically similar AND meet structured constraints.

Common metadata dimensions for filtering:

Date/version (only current guidelines)
Document type (drug monograph, clinical trial, patient education)
Specialty or indication (cardiology, nephrology)
Patient population (adult, pediatric, renal impairment)
Authority level (FDA label, clinical guideline, case report)

Metadata Schema Design

Design metadata schemas that support your most important filter patterns:

Python

from dataclasses import dataclass, field
from datetime import date
from typing import Optional
from enum import Enum

class DocumentType(str, Enum):
    """Category of a clinical document, stored as the ``document_type`` metadata value.

    Inherits from ``str`` so every member is also a plain string equal to
    its value.
    """
    DRUG_MONOGRAPH = "drug_monograph"
    CLINICAL_GUIDELINE = "clinical_guideline"
    PATIENT_EDUCATION = "patient_education"
    RESEARCH_PAPER = "research_paper"
    DRUG_INTERACTION = "drug_interaction"
    ADVERSE_EVENT = "adverse_event"

class PatientPopulation(str, Enum):
    """Patient group a document applies to, stored in the ``populations`` metadata list.

    Inherits from ``str`` so every member is also a plain string equal to
    its value.
    """
    ADULT = "adult"
    PEDIATRIC = "pediatric"
    GERIATRIC = "geriatric"
    RENAL_IMPAIRMENT = "renal_impairment"
    HEPATIC_IMPAIRMENT = "hepatic_impairment"
    PREGNANCY = "pregnancy"

@dataclass
class DocumentMetadata:
    """Metadata schema for clinical documents.

    One instance describes one indexed document. Use
    ``to_vector_db_metadata`` to obtain the payload that is stored next to
    the document's embedding and later used for metadata filtering.
    """
    document_id: str
    title: str
    document_type: DocumentType
    drug_names: list[str]              # Primary drugs covered
    drug_classes: list[str]            # ATC drug classes
    populations: list[PatientPopulation]  # Applicable populations
    publication_date: date
    last_updated: date
    source: str                         # "FDA", "Lexicomp", "KDIGO", etc.
    authority_level: int               # 1=FDA label, 2=guideline, 3=consensus, 4=case report
    specialty: list[str]               # ["cardiology", "nephrology"]
    is_current: bool = True            # False = superseded
    language: str = "en"
    version: str = "1.0"

    def to_vector_db_metadata(self) -> dict:
        """Convert to a flat, JSON-friendly dict for vector DB storage.

        Enum members are replaced by their string values and dates by ISO
        8601 strings. Every field of the dataclass is included.

        Returns:
            A dict keyed by field name, with no nested dicts.
        """
        return {
            "document_id": self.document_id,
            "title": self.title,
            "document_type": self.document_type.value,
            # Stored lower-cased: the search helpers lower-case the queried
            # drug name before matching, so mixed-case names would never hit.
            "drug_names": [name.lower() for name in self.drug_names],
            "drug_classes": self.drug_classes,
            "populations": [p.value for p in self.populations],
            "publication_date": self.publication_date.isoformat(),
            "last_updated": self.last_updated.isoformat(),
            "source": self.source,
            "authority_level": self.authority_level,
            "specialty": self.specialty,
            "is_current": self.is_current,
            "language": self.language,
            # Previously omitted, which made the version unrecoverable from
            # the stored payload.
            "version": self.version,
        }

Pre-Filtering (Filter Before Search)

Apply metadata filters before the vector similarity search. This reduces the search space and is faster when the filter dramatically narrows the candidate set:

Python

from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny, Range

# Module-level Qdrant client used by search_with_prefilter; it points at
# http://localhost:6333.
client = QdrantClient(url="http://localhost:6333")

def search_with_prefilter(
    collection_name: str,
    query_embedding: list[float],
    top_k: int = 5,
    document_type: str = None,
    drug_name: str = None,
    population: str = None,
    min_authority_level: int = None,
    max_authority_level: int = None,
    only_current: bool = True,
) -> list[dict]:
    """
    Vector search in Qdrant, restricted by metadata conditions.

    Every argument that is set adds one condition; all conditions must hold
    (they are combined under ``Filter(must=...)``). The filter is handed to
    Qdrant as ``query_filter`` so it is evaluated as part of the search
    rather than on the returned hits.

    Args:
        collection_name: Qdrant collection to search.
        query_embedding: Query vector.
        top_k: Maximum number of hits to return.
        document_type: Exact ``document_type`` value to require.
        drug_name: Drug that must appear in ``drug_names``; lower-cased
            before matching.
        population: Value that must appear in ``populations``.
        min_authority_level: Inclusive lower bound on ``authority_level``.
        max_authority_level: Inclusive upper bound on ``authority_level``.
        only_current: When true, require ``is_current`` to be True.

    Returns:
        One dict per hit with ``content``, ``title``, ``score`` and the full
        payload under ``metadata``.
    """
    must_clauses = []

    if only_current:
        must_clauses.append(
            FieldCondition(key="is_current", match=MatchValue(value=True))
        )

    if document_type:
        must_clauses.append(
            FieldCondition(key="document_type", match=MatchValue(value=document_type))
        )

    if drug_name:
        # MatchAny on a list-valued field matches when any stored element
        # equals one of the given values.
        wanted_drugs = [drug_name.lower()]
        must_clauses.append(
            FieldCondition(key="drug_names", match=MatchAny(any=wanted_drugs))
        )

    if population:
        must_clauses.append(
            FieldCondition(key="populations", match=MatchAny(any=[population]))
        )

    has_authority_bound = not (
        min_authority_level is None and max_authority_level is None
    )
    if has_authority_bound:
        # An unset bound stays None, leaving that side of the range open.
        authority_range = Range(gte=min_authority_level, lte=max_authority_level)
        must_clauses.append(
            FieldCondition(key="authority_level", range=authority_range)
        )

    if must_clauses:
        filter_obj = Filter(must=must_clauses)
    else:
        filter_obj = None

    hits = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        query_filter=filter_obj,
        limit=top_k,
        with_payload=True,
    )

    formatted = []
    for hit in hits:
        payload = hit.payload
        formatted.append({
            "content": payload.get("content", ""),
            "title": payload.get("title", ""),
            "score": hit.score,
            "metadata": payload,
        })
    return formatted


# Example: find current FDA drug labels for warfarin only.
# NOTE: `embed_text` is not defined in this snippet; it is assumed to be
# your own function that turns the query text into an embedding vector.
results = search_with_prefilter(
    collection_name="clinical_documents",
    query_embedding=embed_text("warfarin dosing in atrial fibrillation"),
    drug_name="warfarin",
    document_type="drug_monograph",
    min_authority_level=1,
    max_authority_level=1,  # FDA label only
    only_current=True,
)

Post-Filtering (Filter After Search)

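Retrieve more candidates than needed, then apply filters. Because filtering happens after the similarity search, this can return fewer results than requested when few of the retrieved candidates pass the filters: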

Python

def _metadata_matches(meta: dict, filters: dict) -> bool:
    """Return True when *meta* satisfies every condition in *filters*."""
    for key, wanted in filters.items():
        actual = meta.get(key)

        if isinstance(actual, list):
            # List-valued metadata (e.g. drug_names): pass on any overlap.
            # A scalar filter value is treated as a one-element list, so
            # {"drug_names": "warfarin"} matches ["warfarin", "aspirin"].
            wanted_values = wanted if isinstance(wanted, list) else [wanted]
            if not any(v in actual for v in wanted_values):
                return False
        elif isinstance(wanted, list):
            # Scalar metadata must equal one of the allowed values.
            if actual not in wanted:
                return False
        elif actual != wanted:
            return False

    return True


def search_with_postfilter(
    query_embedding: list[float],
    all_documents: list[dict],
    all_embeddings: list[list[float]],
    top_k: int = 5,
    filters: dict = None,
    over_retrieve_factor: int = 5,
) -> list[dict]:
    """
    Post-filtering: retrieve top-N by similarity, then filter by metadata.

    Less efficient than pre-filtering but useful when pre-filtering isn't
    supported or when filters are complex.

    Args:
        query_embedding: Query vector.
        all_documents: Documents, each a dict with an optional "metadata"
            dict. Must be index-aligned with ``all_embeddings``.
        all_embeddings: One embedding per document. Similarity is the raw
            dot product, which equals cosine similarity only when all
            vectors are L2-normalized.
        top_k: Maximum number of results to return.
        filters: Mapping of metadata key -> required value. A list value
            means "any of these"; list-valued metadata matches when it
            contains the (or any) requested value. A key missing from the
            metadata never matches.
        over_retrieve_factor: How many times ``top_k`` candidates to take
            from the similarity ranking before filtering (minimum 1).

    Returns:
        Up to ``top_k`` dicts ``{"document": ..., "similarity": float}`` in
        descending similarity order. Fewer (possibly zero) results are
        returned when not enough of the over-retrieved candidates pass.
    """
    import numpy as np

    # Nothing to rank: avoid a shape error from multiplying an empty matrix.
    if top_k <= 0 or len(all_embeddings) == 0:
        return []

    # Retrieve many more candidates than needed
    over_retrieve_k = top_k * max(1, over_retrieve_factor)

    # Compute similarities
    embeddings_arr = np.array(all_embeddings)
    query_arr = np.array(query_embedding)
    similarities = embeddings_arr @ query_arr

    # Get top over_retrieve_k candidates (negated for descending order)
    top_indices = np.argsort(-similarities)[:over_retrieve_k]

    filtered_results = []
    for idx in top_indices:
        doc = all_documents[idx]

        if filters and not _metadata_matches(doc.get("metadata", {}), filters):
            continue

        filtered_results.append({
            "document": doc,
            "similarity": float(similarities[idx]),
        })

        if len(filtered_results) >= top_k:
            break

    return filtered_results

Dynamic Filter Generation

Let the LLM generate filters from the user query:

Python

from openai import OpenAI
import json

# OpenAI client used by extract_filters_from_query; constructed with no
# explicit arguments.
client = OpenAI()

def extract_filters_from_query(query: str) -> dict:
    """Use an LLM to extract metadata filters from a natural language query.

    The model is asked for a JSON object of the form
    ``{"filters": {...}}``. Its reply is treated as untrusted input: a
    missing, malformed or wrongly shaped reply yields ``{}`` instead of
    raising, and only the fields advertised in the prompt are kept.

    Args:
        query: The user's natural language question.

    Returns:
        A dict mapping filter field name to value, possibly empty. Keys are
        limited to ``drug_name``, ``document_type``, ``population`` and
        ``specialty``; null or empty values are dropped.
    """
    FILTER_EXTRACTION_PROMPT = """Extract metadata filters from this clinical query.

Available filter fields:
- drug_name: specific drug name (e.g., "warfarin", "metformin")
- document_type: "drug_monograph", "clinical_guideline", "patient_education", "drug_interaction"
- population: "adult", "pediatric", "geriatric", "renal_impairment", "hepatic_impairment", "pregnancy"
- specialty: "cardiology", "nephrology", "endocrinology", "infectious_disease"

Return JSON with only the filters that are explicitly mentioned or strongly implied:
{"filters": {"field": value, ...}}

If no filters are applicable, return: {"filters": {}}"""

    # Fields offered to the model above; anything else it invents is dropped
    # so that unknown keys never reach the retriever.
    allowed_fields = ("drug_name", "document_type", "population", "specialty")

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": FILTER_EXTRACTION_PROMPT},
            {"role": "user", "content": f"Query: {query}"},
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )

    # The message content may be absent or not valid JSON; in both cases
    # fall back to "no filters".
    raw = response.choices[0].message.content
    try:
        parsed = json.loads(raw) if raw else {}
    except json.JSONDecodeError:
        return {}

    filters = parsed.get("filters") if isinstance(parsed, dict) else None
    if not isinstance(filters, dict):
        return {}

    return {
        field_name: value
        for field_name, value in filters.items()
        if field_name in allowed_fields and value not in (None, "", [])
    }


# Test (calls the OpenAI API; the outputs shown are illustrative and depend
# on the model's reply)
query1 = "What's the warfarin dose adjustment for atrial fibrillation in elderly patients?"
filters1 = extract_filters_from_query(query1)
print(filters1)
# → {"drug_name": "warfarin", "population": "geriatric"}

query2 = "What are the common side effects of beta-blockers?"
filters2 = extract_filters_from_query(query2)
print(filters2)
# → {"drug_class": "beta-blockers"} or {}
# Note: "drug_class" is not one of the fields listed in the extraction prompt.


def smart_retrieval(
    query: str,
    query_embedding: list[float],
    retriever,
    use_dynamic_filters: bool = True,
) -> list[dict]:
    """Retrieve documents, optionally narrowing by filters inferred from the query.

    Args:
        query: Natural language query, used only for filter extraction.
        query_embedding: Embedding passed to ``retriever.search``.
        retriever: Object exposing ``search(embedding, filters=...)``.
        use_dynamic_filters: When true, ask ``extract_filters_from_query``
            for filters first.

    Returns:
        Whatever ``retriever.search`` returns for the search that was
        finally used.
    """
    extracted = extract_filters_from_query(query) if use_dynamic_filters else {}

    # No filters available: plain similarity search.
    if not extracted:
        return retriever.search(query_embedding)

    filtered_hits = retriever.search(query_embedding, filters=extracted)
    if len(filtered_hits) >= 3:
        return filtered_hits

    # Fewer than 3 filtered hits: redo the search with the filters removed.
    # NOTE(review): this replaces the filtered hits entirely, so results
    # outside the requested constraints can be returned.
    return retriever.search(query_embedding, filters={})

Temporal Filtering: Handling Document Freshness

Python

from datetime import datetime, timedelta

def get_freshness_score(
    doc_date: str,
    decay_rate: float = 0.1,
    max_age_days: int = 365 * 5,
    now: "datetime | None" = None,
) -> float:
    """
    Score document freshness in [0.0, 1.0]. Recent docs score higher.

    Useful for re-ranking after retrieval. The score falls linearly from
    1.0 for a document dated today to ``1.0 - decay_rate`` at
    ``max_age_days``, and is 0.0 for anything older than that.

    Args:
        doc_date: Document date as an ISO 8601 string, with or without a
            UTC offset.
        decay_rate: Fraction of the score lost by the time the document is
            ``max_age_days`` old.
        max_age_days: Age in days beyond which the score is 0.0.
        now: Reference time to measure age against. Defaults to the current
            time, in the document date's timezone when it has one. If
            given, it must be naive or aware to match ``doc_date``.

    Returns:
        The freshness score, clamped to the range [0.0, 1.0].

    Raises:
        ValueError: If ``doc_date`` is not a valid ISO 8601 string.
    """
    doc_dt = datetime.fromisoformat(doc_date)

    if now is None:
        # Match the document's awareness: subtracting a naive and an aware
        # datetime raises TypeError. tz=None yields naive local time.
        now = datetime.now(tz=doc_dt.tzinfo)

    # Documents dated in the future are treated as brand new rather than
    # being given a score above 1.0.
    age_days = max((now - doc_dt).days, 0)

    if age_days > max_age_days:
        return 0.0

    # Only reachable with age_days == 0: avoid dividing by zero.
    if max_age_days <= 0:
        return 1.0

    score = 1.0 - (decay_rate * age_days / max_age_days)
    return min(1.0, max(0.0, score))


def combine_similarity_with_freshness(
    results: list[dict],
    freshness_weight: float = 0.2,
) -> list[dict]:
    """Blend each result's similarity with its freshness and re-rank.

    Each result dict gets a ``combined_score`` key written into it (the
    input dicts are modified in place). The blend is a weighted average:
    ``freshness_weight`` on the freshness score and the remainder on the
    similarity.

    Args:
        results: Result dicts. ``metadata.last_updated`` (ISO date string)
            defaults to "2020-01-01" and ``score`` defaults to 0.5 when
            missing.
        freshness_weight: Weight given to freshness.

    Returns:
        A new list of the same dicts, highest ``combined_score`` first.
    """
    similarity_weight = 1 - freshness_weight

    for entry in results:
        metadata = entry.get("metadata", {})
        updated_on = metadata.get("last_updated", "2020-01-01")
        freshness = get_freshness_score(updated_on)

        base_score = entry.get("score", 0.5)
        entry["combined_score"] = (
            similarity_weight * base_score + freshness_weight * freshness
        )

    ranked = list(results)
    ranked.sort(key=lambda item: item["combined_score"], reverse=True)
    return ranked

Chroma Metadata Filtering

Python

import chromadb
from chromadb.utils import embedding_functions

# Chroma client configured with the local path ./chroma_db.
client = chromadb.PersistentClient(path="./chroma_db")
# Embedding function that uses OpenAI's text-embedding-3-small model.
# NOTE(review): "your-api-key" is a placeholder — supply the real key from
# configuration or the environment instead of hard-coding it.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="your-api-key",
    model_name="text-embedding-3-small",
)
# Collection queried by query_chroma_with_filters; fetched, or created if it
# does not exist yet, with the embedding function above.
collection = client.get_or_create_collection("clinical_docs", embedding_function=openai_ef)

# Chroma filter syntax
def query_chroma_with_filters(
    query_text: str,
    n_results: int = 5,
    document_type: str = None,
    drug_name: str = None,
    is_current: bool = True,
) -> list[dict]:
    """Run a text query against the Chroma collection, narrowed by metadata.

    Args:
        query_text: Text to search for; passed to the collection's query.
        n_results: Maximum number of results to request.
        document_type: Exact ``document_type`` value to require.
        drug_name: Drug to look for in ``drug_names``; lower-cased first.
        is_current: When true, require ``is_current`` to be True. Passing
            False removes the condition; it does not select superseded
            documents.

    Returns:
        One dict per result with ``content``, ``metadata`` and ``score``
        (computed as ``1 - distance``).
    """
    clauses = []

    if is_current:
        clauses.append({"is_current": {"$eq": True}})

    if document_type:
        clauses.append({"document_type": {"$eq": document_type}})

    if drug_name:
        # NOTE(review): confirm the installed Chroma version accepts
        # list-valued metadata and "$contains" inside `where`.
        clauses.append({"drug_names": {"$contains": drug_name.lower()}})

    # A single condition is passed bare; "$and" is used only for several.
    if not clauses:
        where_clause = None
    elif len(clauses) == 1:
        where_clause = clauses[0]
    else:
        where_clause = {"$and": clauses}

    response = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        where=where_clause,
        include=["documents", "metadatas", "distances"],
    )

    # Only one query text was sent, so take the first (only) result list.
    texts = response["documents"][0]
    metadatas = response["metadatas"][0]
    distances = response["distances"][0]

    hits = []
    for text, meta, distance in zip(texts, metadatas, distances):
        # NOTE(review): 1 - distance is a similarity only for a bounded
        # distance such as cosine — confirm the collection's distance metric.
        hits.append({
            "content": text,
            "metadata": meta,
            "score": 1 - distance,
        })
    return hits

Metadata Filtering in RAG

Why Metadata Filtering Matters

Metadata Schema Design

Pre-Filtering (Filter Before Search)

Post-Filtering (Filter After Search)

Dynamic Filter Generation

Temporal Filtering: Handling Document Freshness

Chroma Metadata Filtering

Enjoyed this article?

Leave a comment