Learnixo
Back to blog
AI Systems · Intermediate

Metadata Filtering in RAG

Filter retrieved documents by metadata before or after vector search. Pre-filtering, post-filtering, and combining semantic similarity with structured data constraints.

Asma Hafeez Khan · May 16, 2026 · 6 min read
RAG · Metadata Filtering · Hybrid Search · Vector Database
Share:𝕏

Why Metadata Filtering Matters

Semantic search alone can retrieve topically similar documents that are wrong for the context. A query about warfarin in 2025 might retrieve a document from 2015 with outdated dosing guidelines. A query about pediatric dosing might retrieve adult dosing information. Metadata filters constrain retrieval to documents that are semantically similar AND meet structured constraints.

Common metadata dimensions for filtering:

  • Date/version (only current guidelines)
  • Document type (drug monograph, clinical trial, patient education)
  • Specialty or indication (cardiology, nephrology)
  • Patient population (adult, pediatric, renal impairment)
  • Authority level (FDA label, clinical guideline, case report)

Metadata Schema Design

Design metadata schemas that support your most important filter patterns:

Python
from dataclasses import dataclass, field
from datetime import date
from typing import Optional
from enum import Enum

class DocumentType(str, Enum):
    """Closed set of document categories stored in the ``document_type`` field.

    The ``str`` mixin makes every member a real string, so a member compares
    equal to its raw value (``DocumentType.DRUG_MONOGRAPH == "drug_monograph"``).
    """
    # The string values are what gets written to the vector DB payload
    # (see DocumentMetadata.to_vector_db_metadata, which stores ``.value``).
    DRUG_MONOGRAPH = "drug_monograph"
    CLINICAL_GUIDELINE = "clinical_guideline"
    PATIENT_EDUCATION = "patient_education"
    RESEARCH_PAPER = "research_paper"
    DRUG_INTERACTION = "drug_interaction"
    ADVERSE_EVENT = "adverse_event"

class PatientPopulation(str, Enum):
    """Closed set of patient groups a document can apply to.

    The ``str`` mixin makes every member a real string, so a member compares
    equal to its raw value (``PatientPopulation.ADULT == "adult"``).
    """
    # A document may list several of these (DocumentMetadata.populations is a list).
    ADULT = "adult"
    PEDIATRIC = "pediatric"
    GERIATRIC = "geriatric"
    RENAL_IMPAIRMENT = "renal_impairment"
    HEPATIC_IMPAIRMENT = "hepatic_impairment"
    PREGNANCY = "pregnancy"

@dataclass
class DocumentMetadata:
    """Metadata schema for clinical documents.

    Instances describe one source document. ``to_vector_db_metadata`` turns
    an instance into the payload stored next to the document's embedding, so
    every field you want to filter on must appear in that payload.
    """
    document_id: str
    title: str
    document_type: DocumentType
    drug_names: list[str]              # Primary drugs covered
    drug_classes: list[str]            # ATC drug classes
    populations: list[PatientPopulation]  # Applicable populations
    publication_date: date
    last_updated: date
    source: str                         # "FDA", "Lexicomp", "KDIGO", etc.
    authority_level: int               # 1=FDA label, 2=guideline, 3=consensus, 4=case report
    specialty: list[str]               # ["cardiology", "nephrology"]
    is_current: bool = True            # False = superseded
    language: str = "en"
    version: str = "1.0"

    def to_vector_db_metadata(self) -> dict:
        """Convert to a JSON-friendly dict for vector DB storage.

        Enums are stored as their string values and dates as ISO-8601
        strings. Drug names are lower-cased because the query helpers
        lower-case the requested drug name and match it exactly; storing
        the original casing would make e.g. "Warfarin" unfindable.

        Returns:
            A dict containing every field of the schema, including
            ``version``.
        """
        return {
            "document_id": self.document_id,
            "title": self.title,
            "document_type": self.document_type.value,
            # Normalised to lower case so exact-match filters are case-insensitive.
            "drug_names": [name.lower() for name in self.drug_names],
            "drug_classes": self.drug_classes,
            "populations": [p.value for p in self.populations],
            "publication_date": self.publication_date.isoformat(),
            "last_updated": self.last_updated.isoformat(),
            "source": self.source,
            "authority_level": self.authority_level,
            "specialty": self.specialty,
            "is_current": self.is_current,
            "language": self.language,
            # Previously omitted, which made the declared field unfilterable.
            "version": self.version,
        }

Pre-Filtering (Filter Before Search)

Apply metadata filters before the vector similarity search. This reduces the search space, and it is faster when the filter dramatically narrows the set of candidate documents:

Python
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue, MatchAny, Range

# Module-level Qdrant client used by search_with_prefilter; it targets a
# Qdrant instance on localhost.
# NOTE(review): later snippets in this article also assign a module-level
# `client`; if the snippets are combined into one module, give each client a
# distinct name so this one is not overwritten.
client = QdrantClient(url="http://localhost:6333")

def search_with_prefilter(
    collection_name: str,
    query_embedding: list[float],
    top_k: int = 5,
    document_type: Optional[str] = None,
    drug_name: Optional[str] = None,
    population: Optional[str] = None,
    min_authority_level: Optional[int] = None,
    max_authority_level: Optional[int] = None,
    only_current: bool = True,
) -> list[dict]:
    """
    Qdrant search with metadata pre-filtering.
    Qdrant applies the filter before computing similarities.

    Every supplied constraint is AND-ed together (``Filter(must=...)``);
    constraints left as ``None`` are not applied.

    Args:
        collection_name: Qdrant collection to search.
        query_embedding: Query vector.
        top_k: Maximum number of hits to return.
        document_type: Exact ``document_type`` payload value to require.
        drug_name: Drug that must appear in the ``drug_names`` payload list.
            It is lower-cased before matching, so stored names must be
            lower-case as well.
        population: Value that must appear in the ``populations`` list.
        min_authority_level: Inclusive lower bound on ``authority_level``.
        max_authority_level: Inclusive upper bound on ``authority_level``.
        only_current: When true, require ``is_current == True``.

    Returns:
        One dict per hit with ``content``, ``title``, ``score`` and the full
        payload under ``metadata``.
    """
    conditions = []

    if only_current:
        conditions.append(FieldCondition(key="is_current", match=MatchValue(value=True)))

    if document_type:
        conditions.append(FieldCondition(key="document_type", match=MatchValue(value=document_type)))

    if drug_name:
        # MatchAny checks if any element in the list matches
        conditions.append(FieldCondition(key="drug_names", match=MatchAny(any=[drug_name.lower()])))

    if population:
        conditions.append(FieldCondition(key="populations", match=MatchAny(any=[population])))

    if min_authority_level is not None or max_authority_level is not None:
        # A bound left as None is simply open-ended on that side.
        conditions.append(FieldCondition(
            key="authority_level",
            range=Range(
                gte=min_authority_level,
                lte=max_authority_level,
            ),
        ))

    filter_obj = Filter(must=conditions) if conditions else None

    if hasattr(client, "query_points"):
        # query_points is the current query API; search() is deprecated.
        hits = client.query_points(
            collection_name=collection_name,
            query=query_embedding,
            query_filter=filter_obj,
            limit=top_k,
            with_payload=True,
        ).points
    else:
        # Older qdrant-client releases only expose the legacy search() call.
        hits = client.search(
            collection_name=collection_name,
            query_vector=query_embedding,
            query_filter=filter_obj,
            limit=top_k,
            with_payload=True,
        )

    formatted = []
    for hit in hits:
        # The payload is typed as optional; never call .get on None.
        payload = hit.payload or {}
        formatted.append({
            "content": payload.get("content", ""),
            "title": payload.get("title", ""),
            "score": hit.score,
            "metadata": payload,
        })
    return formatted


# Example: find current FDA drug labels for warfarin only
# NOTE(review): embed_text is not defined in this snippet; presumably it is
# the same embedding function used to index the collection — TODO confirm.
results = search_with_prefilter(
    collection_name="clinical_documents",
    query_embedding=embed_text("warfarin dosing in atrial fibrillation"),
    drug_name="warfarin",
    document_type="drug_monograph",
    min_authority_level=1,
    max_authority_level=1,  # FDA label only
    only_current=True,
)

Post-Filtering (Filter After Search)

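Retrieve more candidates than needed, then apply filters. Because filtering only sees the over-retrieved candidates, a restrictive filter can return fewer than the requested number of results even when more matching documents exist further down the ranking: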

Python
def _matches_filters(meta: dict, filters: Optional[dict]) -> bool:
    """Return True when ``meta`` satisfies every condition in ``filters``."""
    if not filters:
        return True

    for key, wanted in filters.items():
        actual = meta.get(key)

        if isinstance(wanted, list):
            # A list filter means "any of these values".
            if isinstance(actual, list):
                if not any(value in actual for value in wanted):
                    return False
            elif actual not in wanted:
                return False
        elif isinstance(actual, list):
            # Scalar filter against list-valued metadata (drug_names,
            # populations, ...): match on membership, as the vector DB
            # filters do. A plain != would reject every document.
            if wanted not in actual:
                return False
        elif actual != wanted:
            return False

    return True


def search_with_postfilter(
    query_embedding: list[float],
    all_documents: list[dict],
    all_embeddings: list[list[float]],
    top_k: int = 5,
    filters: Optional[dict] = None,
    over_retrieve_factor: int = 5,
) -> list[dict]:
    """
    Post-filtering: retrieve top-N, then filter by metadata.
    Less efficient but useful when pre-filtering isn't supported
    or when filters are complex.

    Similarity is the raw dot product between each embedding and the query,
    so it equals cosine similarity only when the vectors are unit-length.

    Args:
        query_embedding: Query vector.
        all_documents: Documents; metadata is read from ``doc["metadata"]``.
        all_embeddings: One embedding per document, in the same order.
        top_k: Maximum number of results to return.
        filters: Mapping of metadata key to required value. A list value
            means "any of these"; a scalar matches an equal scalar or a
            list that contains it.
        over_retrieve_factor: ``top_k * over_retrieve_factor`` top-ranked
            candidates are inspected before filtering. Values below 1 are
            treated as 1.

    Returns:
        Up to ``top_k`` dicts of ``{"document", "similarity"}`` in descending
        similarity order. Fewer are returned when not enough of the
        inspected candidates pass the filters.
    """
    import numpy as np

    # Nothing to rank: avoid a shape error from the matrix product below.
    if top_k <= 0 or len(all_documents) == 0 or len(all_embeddings) == 0:
        return []

    # Retrieve many more candidates than needed
    candidate_count = top_k * max(over_retrieve_factor, 1)

    # Compute similarities
    similarities = np.asarray(all_embeddings) @ np.asarray(query_embedding)

    # Get the best candidate_count candidates, most similar first
    top_indices = np.argsort(-similarities)[:candidate_count]

    filtered_results = []
    for idx in top_indices:
        doc = all_documents[idx]
        # Tolerate documents whose metadata is missing or explicitly None.
        meta = doc.get("metadata") or {}

        if not _matches_filters(meta, filters):
            continue

        filtered_results.append({
            "document": doc,
            "similarity": float(similarities[idx]),
        })
        if len(filtered_results) >= top_k:
            break

    return filtered_results

Dynamic Filter Generation

Let the LLM generate filters from the user query:

Python
from openai import OpenAI
import json

# Module-level OpenAI client used by extract_filters_from_query; no
# credentials are passed explicitly here.
# NOTE(review): this reuses the name `client` that the Qdrant snippet also
# assigns; if the snippets share one module, use distinct names.
client = OpenAI()

def extract_filters_from_query(query: str) -> dict:
    """Use an LLM to extract metadata filters from a natural language query.

    The model output is untrusted: only the filter fields advertised in the
    prompt are kept, and anything malformed (empty content, invalid JSON, a
    non-object ``filters`` value) yields ``{}`` so the caller falls back to
    unfiltered retrieval instead of crashing.

    Args:
        query: The user's natural-language question.

    Returns:
        A dict mapping a subset of ``drug_name``, ``document_type``,
        ``population`` and ``specialty`` to the extracted values; empty when
        no usable filter was produced.
    """
    FILTER_EXTRACTION_PROMPT = """Extract metadata filters from this clinical query.

Available filter fields:
- drug_name: specific drug name (e.g., "warfarin", "metformin")
- document_type: "drug_monograph", "clinical_guideline", "patient_education", "drug_interaction"
- population: "adult", "pediatric", "geriatric", "renal_impairment", "hepatic_impairment", "pregnancy"
- specialty: "cardiology", "nephrology", "endocrinology", "infectious_disease"

Return JSON with only the filters that are explicitly mentioned or strongly implied:
{"filters": {"field": value, ...}}

If no filters are applicable, return: {"filters": {}}"""

    # Must stay in sync with the fields listed in the prompt above.
    allowed_fields = {"drug_name", "document_type", "population", "specialty"}

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": FILTER_EXTRACTION_PROMPT},
            {"role": "user", "content": f"Query: {query}"},
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )

    raw = response.choices[0].message.content
    if not raw:
        # No text came back; there is nothing to parse.
        return {}

    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return {}

    filters = parsed.get("filters") if isinstance(parsed, dict) else None
    if not isinstance(filters, dict):
        return {}

    # Drop invented keys (e.g. "drug_class") and empty values: an unknown key
    # would match no stored metadata and wipe out every result downstream.
    return {
        key: value
        for key, value in filters.items()
        if key in allowed_fields and value not in (None, "", [])
    }


# Smoke test: each call below sends a request to the OpenAI API, so the
# printed filters depend on the model's response.
query1 = "What's the warfarin dose adjustment for atrial fibrillation in elderly patients?"
filters1 = extract_filters_from_query(query1)
print(filters1)
# Typical output: {"drug_name": "warfarin", "population": "geriatric"}

query2 = "What are the common side effects of beta-blockers?"
filters2 = extract_filters_from_query(query2)
print(filters2)
# Expected output: {} — none of the fields listed in the prompt applies
# (the model may still try an unlisted key such as "drug_class").


def smart_retrieval(
    query: str,
    query_embedding: list[float],
    retriever,
    use_dynamic_filters: bool = True,
    min_results: int = 3,
) -> list[dict]:
    """Full retrieval pipeline with automatic filter extraction.

    Args:
        query: The user's natural-language question; used only to extract
            filters.
        query_embedding: Embedding of ``query`` passed to the retriever.
        retriever: Object exposing ``search(query_embedding, filters=...)``.
        use_dynamic_filters: When false, skip filter extraction and run a
            plain search.
        min_results: Minimum number of filtered hits considered sufficient.
            With fewer hits the search is repeated without filters.

    Returns:
        The retriever's results: filtered when enough hits were found,
        otherwise unfiltered.
    """
    filters = extract_filters_from_query(query) if use_dynamic_filters else {}

    if not filters:
        return retriever.search(query_embedding)

    results = retriever.search(query_embedding, filters=filters)
    if len(results) < min_results:
        # Fallback: too few filtered hits, so retrieve without filters.
        # This drops every extracted constraint (e.g. population), so the
        # results may no longer respect what the user asked for.
        results = retriever.search(query_embedding, filters={})

    return results

Temporal Filtering: Handling Document Freshness

Python
from datetime import datetime, timedelta

def get_freshness_score(
    doc_date: str,
    decay_rate: float = 0.1,
    max_age_days: int = 365 * 5,
) -> float:
    """
    Score document freshness. Recent docs score higher.
    Useful for re-ranking after retrieval.
    decay_rate controls how quickly older docs are penalized.

    The score falls linearly from 1.0 (age 0) to ``1 - decay_rate`` at
    ``max_age_days``; anything older scores 0.0. The result is always
    clamped to ``[0.0, 1.0]``.

    Args:
        doc_date: ISO-8601 date or datetime string, naive or timezone-aware.
        decay_rate: Total penalty accumulated by the time a document
            reaches ``max_age_days``.
        max_age_days: Age in days beyond which a document scores 0.0.

    Returns:
        Freshness score in ``[0.0, 1.0]``.

    Raises:
        ValueError: If ``doc_date`` is not a valid ISO-8601 string.
    """
    doc_dt = datetime.fromisoformat(doc_date)
    # Take "now" in the document's own timezone (or naive when the document
    # is naive) so the subtraction never mixes aware and naive datetimes.
    now = datetime.now(tz=doc_dt.tzinfo)
    # Future-dated documents (clock skew, bad data) count as brand new
    # instead of scoring above 1.0.
    age_days = max((now - doc_dt).days, 0)

    if age_days > max_age_days:
        return 0.0

    if max_age_days <= 0:
        # Only reachable when age_days == max_age_days == 0; avoid dividing by zero.
        return 1.0

    score = 1.0 - (decay_rate * age_days / max_age_days)
    return min(1.0, max(0.0, score))


def combine_similarity_with_freshness(
    results: list[dict],
    freshness_weight: float = 0.2,
) -> list[dict]:
    """Re-rank results by combining similarity with document freshness.

    Accepts both result shapes produced in this file: a top-level
    ``score`` / ``metadata`` pair, or ``similarity`` with the metadata
    nested under ``document``.

    Args:
        results: Retrieval results. Each dict is updated in place with a
            ``combined_score`` key.
        freshness_weight: Share of the combined score taken by freshness;
            the remainder goes to similarity.

    Returns:
        A new list of the same dicts, sorted by ``combined_score`` descending.
    """
    for result in results:
        # Metadata may sit at the top level or under "document", and either
        # may be missing or None.
        metadata = (
            result.get("metadata")
            or (result.get("document") or {}).get("metadata")
            or {}
        )
        # Documents without a usable date are treated as dated 2020-01-01.
        date_str = metadata.get("last_updated") or "2020-01-01"
        freshness = get_freshness_score(date_str)

        # Prefer "score", then "similarity"; fall back to a neutral 0.5.
        similarity = next(
            (
                result[key]
                for key in ("score", "similarity")
                if result.get(key) is not None
            ),
            0.5,
        )
        result["combined_score"] = (
            (1 - freshness_weight) * similarity +
            freshness_weight * freshness
        )

    return sorted(results, key=lambda x: x["combined_score"], reverse=True)

Chroma Metadata Filtering

Python
import chromadb
from chromadb.utils import embedding_functions

# Persistent Chroma client storing its data under ./chroma_db.
# NOTE(review): this reuses the module-level name `client` that the Qdrant
# and OpenAI snippets also assign; use distinct names if they share a module.
client = chromadb.PersistentClient(path="./chroma_db")
# "your-api-key" is a placeholder: load the real key from the environment or
# a secret store rather than committing it to source.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key="your-api-key",
    model_name="text-embedding-3-small",
)
# NOTE(review): no distance metric is configured here, so the collection uses
# Chroma's default (presumably L2 — confirm). query_chroma_with_filters turns
# distances into scores with `1 - distance`, which is only a similarity for
# cosine distance.
collection = client.get_or_create_collection("clinical_docs", embedding_function=openai_ef)

# Chroma filter syntax
def query_chroma_with_filters(
    query_text: str,
    n_results: int = 5,
    document_type: str = None,
    drug_name: str = None,
    is_current: bool = True,
) -> list[dict]:
    """Run a text query against the Chroma collection, narrowed by metadata.

    Each supplied constraint becomes one ``where`` condition; several
    conditions are combined with ``$and``. The drug name is lower-cased
    before it is matched against ``drug_names``.

    Returns:
        One dict per hit with ``content``, ``metadata`` and ``score``, where
        ``score`` is ``1 - distance``.
    """
    # One optional clause per constraint; None marks "not requested".
    optional_clauses = (
        {"is_current": {"$eq": True}} if is_current else None,
        {"document_type": {"$eq": document_type}} if document_type else None,
        {"drug_names": {"$contains": drug_name.lower()}} if drug_name else None,
    )
    clauses = [clause for clause in optional_clauses if clause is not None]

    # A single condition is passed bare; only two or more are wrapped in $and.
    if not clauses:
        where_clause = None
    elif len(clauses) == 1:
        where_clause = clauses[0]
    else:
        where_clause = {"$and": clauses}

    response = collection.query(
        query_texts=[query_text],
        n_results=n_results,
        where=where_clause,
        include=["documents", "metadatas", "distances"],
    )

    # Results are nested per query text; only one was sent, so take index 0.
    documents = response["documents"][0]
    metadatas = response["metadatas"][0]
    distances = response["distances"][0]

    hits = []
    for text, metadata, distance in zip(documents, metadatas, distances):
        hits.append({
            "content": text,
            "metadata": metadata,
            "score": 1 - distance,
        })
    return hits

Enjoyed this article?

Explore the AI Systems learning path for more.

Found this helpful?

Share:𝕏

Leave a comment

Have a question, correction, or just found this helpful? Leave a note below.