RAG Troubleshooting Guide

Systematic RAG Debugging

RAG failures fall into three categories:

- Retrieval failures — Wrong or no documents retrieved
- Generation failures — Documents were retrieved but the answer is wrong
- System failures — Latency, errors, or infrastructure problems

The diagnostic approach: isolate each component, test independently, fix the weakest link.

Failure Mode 1: Retrieval Returns Wrong Documents

Symptoms: The answer doesn't address the question. Hallucinated facts. "I don't have information about X" when X is in the knowledge base.

Diagnosis

Python

from openai import OpenAI
import numpy as np

# Module-level OpenAI client shared by the diagnostic helpers in this guide.
client = OpenAI()


def diagnose_retrieval(
    query: str,
    retriever,
    expected_doc_ids: list[str] = None,
    top_k: int = 10,
    embedding_model: str = "text-embedding-3-small",
    min_similarity: float = 0.5,
) -> dict:
    """Run a retrieval diagnostic for a failing query.

    Embeds ``query``, asks ``retriever`` for ``top_k`` documents and reports
    the first problem found, checked in this order: nothing retrieved, low
    similarity scores, expected documents missing.

    Args:
        query: The user query that produced a bad answer.
        retriever: Object exposing ``retrieve(query_embedding, top_k=...)``
            that returns a list of dict-like documents. Documents are read
            via ``.get`` for ``id``, ``title`` and ``score`` / ``distance``.
        expected_doc_ids: Optional IDs of documents that should come back
            for this query. Ignored when ``None`` or empty.
        top_k: Number of documents to request from the retriever.
        embedding_model: Embedding model used for the query. It must be the
            model the knowledge base was ingested with.
        min_similarity: Best score below which the query is reported as a
            poor match for the knowledge base.

    Returns:
        A dict with a ``status`` key:

        * ``"critical"`` - nothing was retrieved (``issue``, ``fix``).
        * ``"warning"`` - best score is below ``min_similarity``
          (``issue``, ``fix``, ``top_docs``).
        * ``"good"`` / ``"issue"`` - ``expected_doc_ids`` was given and all /
          not all of them were retrieved (``expected_docs_found``,
          ``expected_docs_missing``, ``rank_of_first_relevant``, ``top_docs``).
        * ``"ok"`` - no problem detected (``top_docs``, ``score_range``).
    """
    # Step 1: Embed the query with the configured model
    emb_resp = client.embeddings.create(
        model=embedding_model,
        input=[query],
    )
    query_embedding = emb_resp.data[0].embedding

    # Step 2: Retrieve more than needed
    docs = retriever.retrieve(query_embedding, top_k=top_k)

    if not docs:
        return {
            "status": "critical",
            "issue": "No documents retrieved at all",
            "fix": "Check vector store connection. Check if knowledge base is populated.",
        }

    # Step 3: Check score distribution (docs is non-empty here, so max/min are safe)
    scores = [d.get("score", d.get("distance", 0)) for d in docs]
    max_score = max(scores)
    min_score = min(scores)

    # A "distance" is lower-is-better, so comparing it against a similarity
    # floor would flag close matches and wave through distant ones. Only apply
    # the floor when no reported value is a distance.
    has_distance_values = any("score" not in d and "distance" in d for d in docs)

    if not has_distance_values and max_score < min_similarity:
        return {
            "status": "warning",
            "issue": f"Low similarity scores (max={max_score:.3f}). Query may not match knowledge base domain.",
            "fix": "Check embedding model matches what was used for ingestion. Check query language vs document language.",
            "top_docs": [{"title": d.get("title"), "score": s} for d, s in zip(docs[:3], scores[:3])],
        }

    # Step 4: Check if expected documents are present
    if expected_doc_ids:
        expected = set(expected_doc_ids)
        retrieved_ids = {d.get("id", "") for d in docs}
        found = [eid for eid in expected_doc_ids if eid in retrieved_ids]
        missing = [eid for eid in expected_doc_ids if eid not in retrieved_ids]
        # 1-based rank of the first retrieved document that was expected.
        rank_of_first = next(
            (rank for rank, d in enumerate(docs, start=1) if d.get("id") in expected),
            None,
        )
        return {
            "status": "good" if not missing else "issue",
            "expected_docs_found": len(found),
            "expected_docs_missing": missing,
            "rank_of_first_relevant": rank_of_first,
            "top_docs": [{"id": d.get("id"), "title": d.get("title"), "score": s}
                         for d, s in zip(docs[:5], scores[:5])],
        }

    return {
        "status": "ok",
        "top_docs": [{"title": d.get("title"), "score": s}
                     for d, s in zip(docs[:5], scores[:5])],
        "score_range": {"max": max_score, "min": min_score},
    }


# Remediation advice for common retrieval problems, keyed by diagnosed cause.
# Each entry is written as (cause, what is wrong, what to do about it) and the
# two sentences are joined with a single space.
RETRIEVAL_FIXES = {
    cause: f"{problem} {remedy}"
    for cause, problem, remedy in (
        (
            "embedding_mismatch",
            "Ingestion used model A, queries use model B.",
            "Re-ingest with the same model you query with.",
        ),
        (
            "chunk_too_large",
            "Chunks are too long, diluting relevant signal.",
            "Re-chunk with smaller chunk size (256-512 tokens).",
        ),
        (
            "chunk_too_small",
            "Chunks lack context.",
            "Re-chunk with larger size or add parent chunk retrieval.",
        ),
        (
            "query_too_short",
            "Short queries lack semantic signal.",
            "Use HyDE or query expansion.",
        ),
        (
            "domain_mismatch",
            "General embedding model performs poorly on domain text.",
            "Fine-tune or use domain-specific model (e.g., bge-medical).",
        ),
    )
}

Failure Mode 2: Hallucinations in Generated Answers

Symptoms: Response contains facts not in any retrieved document. Confident but wrong drug dosages. Made-up references.

Diagnosis

Python

def diagnose_hallucination(
    query: str,
    context: str,
    response: str,
    model: str = "gpt-4o",
    max_context_chars: int = 4000,
) -> dict:
    """Detect specific hallucinated claims in a response.

    Asks ``model`` to audit ``response`` against ``context`` and return a
    JSON verdict.

    Args:
        query: The original user question. Accepted for a uniform signature;
            it is not included in the audit prompt.
        context: Retrieved source text the response should be grounded in.
        response: The generated answer to audit.
        model: Chat model used as the auditor.
        max_context_chars: Only the first ``max_context_chars`` characters of
            ``context`` are sent. Claims supported solely by text beyond this
            cut-off can be reported as hallucinated, so raise it for long
            contexts.

    Returns:
        The auditor's parsed JSON object. The prompt requests the keys
        ``hallucinated_claims``, ``supported_claims``, ``inferred_claims``,
        ``hallucination_rate`` and ``verdict``; the model's compliance is
        not validated here.

    Raises:
        ValueError: If the model returns no content, or (as
            ``json.JSONDecodeError``) content that is not valid JSON.
    """
    import json

    result = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"""Identify claims in the response that are NOT supported by the context.

Context:
{context[:max_context_chars]}

Response to audit:
{response}

For each claim in the response:
1. Is it explicitly supported by the context?
2. Is it a reasonable inference from the context?
3. Is it fabricated or contradicted?

Return JSON:
{{
  "hallucinated_claims": ["list of claims not in context"],
  "supported_claims": ["list of claims with context support"],
  "inferred_claims": ["claims that are reasonable inferences"],
  "hallucination_rate": 0.0-1.0,
  "verdict": "clean|minor_issues|significant_hallucination"
}}""",
            }
        ],
        response_format={"type": "json_object"},
        temperature=0,
    )
    content = result.choices[0].message.content
    # The API may return a message without text content; fail with a clear
    # message instead of the opaque TypeError json.loads(None) would raise.
    if content is None:
        raise ValueError("Model returned no content; cannot parse hallucination audit")
    return json.loads(content)


def fix_hallucination_via_prompt(
    query: str,
    context: str,
    original_response: str,
) -> str:
    """Rewrite a draft answer so it only states what the context supports.

    Sends the question, the source context and the draft answer to
    ``gpt-4o`` (temperature 0) with a fact-checker system prompt, and
    returns the text of the model's first choice.
    """
    system_message = {
        "role": "system",
        "content": """You are a clinical fact-checker.
You will receive a question, the source context, and a draft answer that may contain hallucinations.
Rewrite the answer to include ONLY information from the context.
If the context doesn't contain enough information, say exactly that.""",
    }
    user_message = {
        "role": "user",
        "content": f"""Question: {query}

Source context:
{context}

Draft answer (may contain hallucinations):
{original_response}

Write a corrected, context-grounded answer:""",
    }

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

Failure Mode 3: Context Window Overflow

Symptoms: API errors about token limits. Truncated responses. LLM ignoring context buried in the middle of a long prompt (the lost-in-the-middle problem).

Python

import tiktoken


def count_rag_tokens(
    system_prompt: str,
    context_docs: list[dict],
    query: str,
    model: str = "gpt-4o",
) -> dict:
    """Count tokens in a full RAG prompt to catch overflow before sending.

    Args:
        system_prompt: System message text.
        context_docs: Retrieved documents; each must have a ``"content"`` key.
            Their contents are joined with blank lines before counting.
        query: The user question.
        model: Model name used to pick the tokenizer and the context limit.
            Models unknown to ``tiktoken`` are counted with ``cl100k_base``,
            so their counts are an approximation.

    Returns:
        A dict with ``total_tokens``, ``context_tokens``, ``query_tokens``,
        ``system_tokens``, ``limit``, ``headroom`` (``limit - total``),
        ``overflow`` (``True`` once the total exceeds 90% of the limit) and
        ``utilization_pct``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken only maps OpenAI model names. Without a fallback, the
        # non-OpenAI entries in `limits` below could never be reached.
        encoding = tiktoken.get_encoding("cl100k_base")

    system_tokens = len(encoding.encode(system_prompt))
    query_tokens = len(encoding.encode(query))
    context_text = "\n\n".join(d["content"] for d in context_docs)
    context_tokens = len(encoding.encode(context_text))
    overhead = 50  # For message formatting

    total = system_tokens + query_tokens + context_tokens + overhead

    # Model context limits; unknown models are assumed to have 128k.
    limits = {
        "gpt-4o": 128000,
        "gpt-4o-mini": 128000,
        "claude-sonnet-4-6": 200000,
        "claude-haiku-4-5": 200000,
    }
    limit = limits.get(model, 128000)

    return {
        "total_tokens": total,
        "context_tokens": context_tokens,
        "query_tokens": query_tokens,
        "system_tokens": system_tokens,
        "limit": limit,
        "headroom": limit - total,
        "overflow": total > limit * 0.9,  # Warn at 90% of limit
        "utilization_pct": round(total / limit * 100, 1),
    }


def auto_trim_context(
    context_docs: list[dict],
    query: str,
    system_prompt: str,
    target_tokens: int = 3000,
    model: str = "gpt-4o",
) -> list[dict]:
    """
    Automatically trim context to stay within token budget.

    Documents are taken in the order given, so pass ``context_docs`` sorted
    best-first; this function does not sort. Whole documents are kept while
    they fit. The first one that does not fit is truncated to the remaining
    budget if more than 100 tokens are left, and everything after it is
    dropped.

    Args:
        context_docs: Documents with a ``"content"`` key, most relevant first.
        query: The user question (counted against the budget).
        system_prompt: System message text (counted against the budget).
        target_tokens: Total prompt budget, including the system prompt, the
            query and 100 tokens of overhead.
        model: Model name used to pick the tokenizer. Models unknown to
            ``tiktoken`` fall back to ``cl100k_base``.

    Returns:
        The selected documents. A truncated document is a copy with shortened
        ``"content"`` and ``"truncated": True``; input dicts are not mutated.
        The list is empty when the fixed parts alone exceed ``target_tokens``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken only maps OpenAI model names; approximate other models.
        encoding = tiktoken.get_encoding("cl100k_base")

    fixed_tokens = (
        len(encoding.encode(system_prompt)) +
        len(encoding.encode(query)) +
        100  # Overhead
    )
    context_budget = target_tokens - fixed_tokens

    selected = []
    used = 0

    for doc in context_docs:
        doc_token_ids = encoding.encode(doc["content"])
        if used + len(doc_token_ids) <= context_budget:
            selected.append(doc)
            used += len(doc_token_ids)
            continue

        # This document does not fit: keep a truncated copy if it is worth it.
        remaining = context_budget - used
        if remaining > 100:  # At least 100 tokens worth including
            # Cut on token ids instead of guessing 4 characters per token,
            # which overshoots the budget for token-dense text.
            selected.append({
                **doc,
                "content": encoding.decode(doc_token_ids[:remaining]),
                "truncated": True,
            })
        break

    return selected

Failure Mode 4: Slow Performance

Symptoms: p95 latency above 5 seconds. Timeouts under load. Users complaining about response time.

Python

import time
import asyncio


def profile_rag_query(query: str, pipeline) -> dict:
    """Time each stage of a RAG query to find bottlenecks.

    Runs embedding, retrieval, optional reranking and LLM generation once
    each and records wall-clock time per stage.

    Args:
        query: The question to run through the pipeline.
        pipeline: Object with a ``retriever`` exposing
            ``retrieve(query_embedding, top_k=...)`` and, optionally, a
            ``reranker`` exposing ``rerank(query, docs)``. Retrieved
            documents must have a ``"content"`` key.

    Returns:
        A dict with ``timings`` (per-stage milliseconds plus ``total_ms``),
        ``bottleneck`` (the slowest stage key), ``suggestion`` (advice for
        that stage) and ``response_preview`` (first 100 characters of the
        answer).
    """
    timings = {}

    # Stage 1: Embedding
    started = time.perf_counter()
    emb = client.embeddings.create(model="text-embedding-3-small", input=[query])
    query_embedding = emb.data[0].embedding
    timings["embedding_ms"] = (time.perf_counter() - started) * 1000

    # Stage 2: Retrieval
    started = time.perf_counter()
    docs = pipeline.retriever.retrieve(query_embedding, top_k=5)
    timings["retrieval_ms"] = (time.perf_counter() - started) * 1000

    # Stage 3: Reranking (if present). getattr also covers a pipeline whose
    # `reranker` attribute exists but is None.
    reranker = getattr(pipeline, "reranker", None)
    if reranker is not None:
        started = time.perf_counter()
        docs = reranker.rerank(query, docs)
        timings["reranking_ms"] = (time.perf_counter() - started) * 1000

    # Stage 4: LLM generation
    context = "\n\n".join(d["content"] for d in docs)
    started = time.perf_counter()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Answer using only the context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ],
        temperature=0,
    ).choices[0].message.content
    timings["llm_ms"] = (time.perf_counter() - started) * 1000

    # Pick the bottleneck BEFORE adding the total. The total is never smaller
    # than any single stage, so including it would always win the max().
    bottleneck = max(timings, key=timings.get)
    timings["total_ms"] = sum(timings.values())

    suggestions = {
        "embedding_ms": "Pre-embed common queries. Use smaller embedding model.",
        "retrieval_ms": "Check vector store index type (HNSW vs flat). Add replicas. Check network latency to managed service.",
        "reranking_ms": "Use lighter cross-encoder. Reduce candidates from 50 to 20.",
        "llm_ms": "Use faster/cheaper model. Reduce context window. Enable response caching.",
    }

    return {
        "timings": timings,
        "bottleneck": bottleneck,
        "suggestion": suggestions.get(bottleneck, "Unknown bottleneck"),
        # The message content can be None; keep the preview a string.
        "response_preview": (response or "")[:100],
    }

Failure Mode 5: Knowledge Base Issues

Symptoms: Questions that should be answerable get "I don't have information." Known facts return wrong answers.

Python

def audit_knowledge_base(
    collection,
    test_queries: list[dict],
) -> dict:
    """
    Check that retrieval surfaces the content each test query expects.

    test_queries: [{"query": "...", "expected_keywords": ["...", "..."]}]

    For every test, the query is embedded, the top 5 documents are fetched
    from ``collection`` and each expected keyword is looked up
    (case-insensitively) in the concatenated document text. Returns the mean
    keyword coverage, the number of tests, the tests with coverage below 0.5
    and the per-test details.
    """
    details = []

    for case in test_queries:
        question = case["query"]
        keywords = case.get("expected_keywords", [])

        embedding_response = client.embeddings.create(
            model="text-embedding-3-small", input=[question]
        )
        hits = collection.query(
            query_embeddings=[embedding_response.data[0].embedding],
            n_results=5,
        )

        # Results for the single query embedding sent above.
        documents = hits.get("documents", [[]])[0]
        haystack = " ".join(documents).lower()

        # Split the keywords into present / absent in one pass, keeping order.
        present, absent = [], []
        for keyword in keywords:
            bucket = present if keyword.lower() in haystack else absent
            bucket.append(keyword)

        if documents:
            preview = documents[0][:150]
        else:
            preview = "No results"

        details.append({
            "query": question,
            "found_keywords": present,
            "missing_keywords": absent,
            "coverage": len(present) / max(len(keywords), 1),
            "top_result_preview": preview,
        })

    coverage_total = sum(entry["coverage"] for entry in details)
    mean_coverage = coverage_total / max(len(details), 1)
    low_coverage = [entry for entry in details if entry["coverage"] < 0.5]

    return {
        "overall_coverage": round(mean_coverage, 2),
        "total_tests": len(details),
        "failing_tests": low_coverage,
        "details": details,
    }


def check_embedding_model_consistency(
    collection,
    test_text: str = "warfarin drug interaction CYP2C9",
    query_model: str = "text-embedding-3-small",
) -> dict:
    """
    Compare the stored embedding dimension with the query model's dimension.

    A mismatch causes terrible retrieval even with correct documents.
    Only the vector dimension is compared: a mismatch proves different models
    were used, but equal dimensions do not prove the same model (for example
    ``text-embedding-3-small`` and ``text-embedding-ada-002`` are both 1536).

    Args:
        collection: Vector collection exposing
            ``get(limit=..., include=["embeddings"])``.
        test_text: Text embedded with ``query_model`` to measure its dimension.
        query_model: Embedding model the application uses for queries.

    Returns:
        ``{"status": "error", "message": ...}`` when the collection has no
        embeddings. Otherwise a dict with ``stored_embedding_dim``,
        ``query_embedding_dim``, ``match``, ``likely_stored_model`` (first
        known model with the stored dimension, or ``"unknown"``),
        ``candidate_stored_models`` (all known models with that dimension),
        ``query_model`` and ``status``.
    """
    # Get a sample document embedding from the collection
    sample = collection.get(limit=1, include=["embeddings"])
    embeddings = sample["embeddings"]
    # Use len() rather than truthiness: it works for lists and for array-like
    # results, whose truth value can be ambiguous.
    if embeddings is None or len(embeddings) == 0:
        return {"status": "error", "message": "Collection is empty"}

    stored_dim = len(embeddings[0])

    # Check query embedding dimension
    query_emb = client.embeddings.create(model=query_model, input=[test_text])
    query_dim = len(query_emb.data[0].embedding)

    model_dims = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
        "BAAI/bge-small-en-v1.5": 384,
        "BAAI/bge-large-en-v1.5": 1024,
    }

    # Several models share a dimension, so report every candidate.
    candidates = [name for name, dim in model_dims.items() if dim == stored_dim]
    stored_model = candidates[0] if candidates else "unknown"
    dims_match = stored_dim == query_dim

    return {
        "stored_embedding_dim": stored_dim,
        "query_embedding_dim": query_dim,
        "match": dims_match,
        "likely_stored_model": stored_model,
        "candidate_stored_models": candidates,
        "query_model": query_model,
        "status": "ok" if dims_match else "MISMATCH — re-ingest with consistent model",
    }

RAG Debugging Runbook

When a query fails, follow this order:

1. Is the knowledge base populated? Run collection.count() or equivalent.
2. Is the right model used for both ingestion and querying? Check embedding dimensions.
3. Does retrieval return anything? Log top-5 docs with scores.
4. Are scores high enough? Below 0.6 cosine similarity means poor semantic match.
5. Are retrieved docs actually relevant? Read them. If they're irrelevant, the problem is retrieval.
6. Is the answer in the context? If docs are relevant but answer is wrong, it's a generation problem.
7. Is the context too long? Check token count — LLMs degrade with very long contexts.
8. Is the system prompt conflicting? Test with a minimal prompt.

| Symptom | Likely Cause | Fix |
|---|---|---|
| Empty retrieval | Empty knowledge base or wrong collection | Verify ingestion ran |
| Low scores (below 0.5) | Embedding model mismatch | Re-ingest with same model |
| Irrelevant docs retrieved | Poor chunking strategy | Re-chunk with sections strategy |
| Correct docs retrieved, wrong answer | Hallucination or context overflow | Add faithfulness prompt, trim context |
| "I don't know" despite relevant docs | Context too large, lost-in-middle | Reorder docs (relevant first) |
| Very slow queries | LLM latency | Add caching, use faster model |
| Inconsistent answers | High LLM temperature | Set temperature=0 |

RAG Troubleshooting Guide

Systematic RAG Debugging

Failure Mode 1: Retrieval Returns Wrong Documents

Diagnosis

Failure Mode 2: Hallucinations in Generated Answers

Diagnosis

Failure Mode 3: Context Window Overflow

Failure Mode 4: Slow Performance

Failure Mode 5: Knowledge Base Issues

RAG Debugging Runbook

Enjoyed this article?

Leave a comment