RAG Troubleshooting Guide
Diagnose and fix common RAG failures: poor retrieval, hallucinations, irrelevant answers, slow performance, and context window issues. A systematic debugging guide.
Systematic RAG Debugging
RAG failures fall into three categories:
- Retrieval failures — Wrong or no documents retrieved
- Generation failures — Documents were retrieved but the answer is wrong
- System failures — Latency, errors, or infrastructure problems
The diagnostic approach: isolate each component, test independently, fix the weakest link.
Failure Mode 1: Retrieval Returns Wrong Documents
Symptoms: The answer doesn't address the question. Hallucinated facts. "I don't have information about X" when X is in the knowledge base.
Diagnosis
from openai import OpenAI
import numpy as np
# Module-level OpenAI client shared by the diagnostic helpers in this guide.
# It is constructed with no explicit arguments.
client = OpenAI()
def diagnose_retrieval(
    query: str,
    retriever,
    expected_doc_ids: list[str] = None,
    top_k: int = 10,
) -> dict:
    """Run a retrieval diagnostic for a failing query.

    The query is embedded with ``text-embedding-3-small``, ``top_k`` documents
    are requested from ``retriever``, and the first problem found is reported.

    Args:
        query: The question whose answer was wrong or missing.
        retriever: Object exposing ``retrieve(embedding, top_k=...)``. It must
            return a sequence of dict-like documents; ``id``, ``title``,
            ``score`` and ``distance`` are read with ``.get`` and are all
            optional.
        expected_doc_ids: Ids of documents that should come back for this
            query. ``None`` or an empty list skips the expected-document
            check.
        top_k: How many documents to request (more than an answer normally
            needs, so near-misses are visible in the report).

    Returns:
        A dict whose ``status`` is one of:

        * ``"critical"`` - nothing was retrieved (``issue`` and ``fix`` set).
        * ``"warning"`` - the best similarity score is below 0.5 (``issue``,
          ``fix`` and the top three docs). This check is skipped when any
          document reports only a ``distance``, because a distance is
          smaller-is-better and the threshold would be inverted.
        * ``"good"`` / ``"issue"`` - expected ids were supplied and all /
          not all of them were retrieved.
        * ``"ok"`` - no expected ids were supplied; the top five docs and the
          score range are returned.
    """
    # Step 1: Embed the query with the model used at query time.
    emb_resp = client.embeddings.create(
        model="text-embedding-3-small",
        input=[query],
    )
    query_embedding = emb_resp.data[0].embedding

    # Step 2: Retrieve more than needed.
    docs = retriever.retrieve(query_embedding, top_k=top_k)
    if not docs:
        return {
            "status": "critical",
            "issue": "No documents retrieved at all",
            "fix": "Check vector store connection. Check if knowledge base is populated.",
        }

    # Step 3: Check score distribution.
    scores = [d.get("score", d.get("distance", 0)) for d in docs]
    max_score = max(scores)
    min_score = min(scores)

    # A low *distance* means a close match, so comparing it against a
    # similarity threshold would flag exactly the good results. Only apply
    # the threshold when every value really is a similarity score.
    uses_distance = any("score" not in d and "distance" in d for d in docs)
    if not uses_distance and max_score < 0.5:
        return {
            "status": "warning",
            "issue": f"Low similarity scores (max={max_score:.3f}). Query may not match knowledge base domain.",
            "fix": "Check embedding model matches what was used for ingestion. Check query language vs document language.",
            "top_docs": [{"title": d.get("title"), "score": s} for d, s in zip(docs[:3], scores[:3])],
        }

    # Step 4: Check if expected documents are present.
    if expected_doc_ids:
        expected = set(expected_doc_ids)
        retrieved_ids = {d.get("id", "") for d in docs}
        found = [eid for eid in expected_doc_ids if eid in retrieved_ids]
        missing = [eid for eid in expected_doc_ids if eid not in retrieved_ids]
        # 1-based rank of the first expected document, or None if none came back.
        rank_of_first = next(
            (rank for rank, d in enumerate(docs, start=1) if d.get("id") in expected),
            None,
        )
        return {
            "status": "good" if not missing else "issue",
            "expected_docs_found": len(found),
            "expected_docs_missing": missing,
            "rank_of_first_relevant": rank_of_first,
            "top_docs": [
                {"id": d.get("id"), "title": d.get("title"), "score": s}
                for d, s in zip(docs[:5], scores[:5])
            ],
        }

    return {
        "status": "ok",
        "top_docs": [
            {"title": d.get("title"), "score": s}
            for d, s in zip(docs[:5], scores[:5])
        ],
        "score_range": {"max": max_score, "min": min_score},
    }
# Quick-reference remedies for the usual retrieval failure causes.
# Each value reads "<what went wrong> <what to do about it>".
RETRIEVAL_FIXES = {
    cause: f"{diagnosis} {remedy}"
    for cause, diagnosis, remedy in (
        (
            "embedding_mismatch",
            "Ingestion used model A, queries use model B.",
            "Re-ingest with the same model you query with.",
        ),
        (
            "chunk_too_large",
            "Chunks are too long, diluting relevant signal.",
            "Re-chunk with smaller chunk size (256-512 tokens).",
        ),
        (
            "chunk_too_small",
            "Chunks lack context.",
            "Re-chunk with larger size or add parent chunk retrieval.",
        ),
        (
            "query_too_short",
            "Short queries lack semantic signal.",
            "Use HyDE or query expansion.",
        ),
        (
            "domain_mismatch",
            "General embedding model performs poorly on domain text.",
            "Fine-tune or use domain-specific model (e.g., bge-medical).",
        ),
    )
}

# Failure Mode 2: Hallucinations in Generated Answers
Symptoms: Response contains facts not in any retrieved document. Confident but wrong drug dosages. Made-up references.
Diagnosis
def diagnose_hallucination(
    query: str,
    context: str,
    response: str,
    model: str = "gpt-4o",
) -> dict:
    """Ask an LLM judge which claims in ``response`` lack support in ``context``.

    Args:
        query: The original user question. Accepted for symmetry with the
            other helpers; it is not included in the audit prompt.
        context: Retrieved text the answer was supposed to be grounded in.
            Only its first 4000 characters are sent to the judge.
        response: The generated answer to audit.
        model: Chat model used as the judge.

    Returns:
        The judge's JSON reply parsed into a dict. The prompt asks for the
        keys ``hallucinated_claims``, ``supported_claims``,
        ``inferred_claims``, ``hallucination_rate`` and ``verdict``.
    """
    import json

    audit_prompt = f"""Identify claims in the response that are NOT supported by the context.
Context:
{context[:4000]}
Response to audit:
{response}
For each claim in the response:
1. Is it explicitly supported by the context?
2. Is it a reasonable inference from the context?
3. Is it fabricated or contradicted?
Return JSON:
{{
"hallucinated_claims": ["list of claims not in context"],
"supported_claims": ["list of claims with context support"],
"inferred_claims": ["claims that are reasonable inferences"],
"hallucination_rate": 0.0-1.0,
"verdict": "clean|minor_issues|significant_hallucination"
}}"""
    judge_messages = [{"role": "user", "content": audit_prompt}]

    # JSON mode plus temperature 0 keeps the reply machine-parseable and repeatable.
    completion = client.chat.completions.create(
        model=model,
        messages=judge_messages,
        response_format={"type": "json_object"},
        temperature=0,
    )
    raw_verdict = completion.choices[0].message.content
    return json.loads(raw_verdict)
def fix_hallucination_via_prompt(
    query: str,
    context: str,
    original_response: str,
) -> str:
    """Rewrite a draft answer so that it only states what ``context`` supports.

    Args:
        query: The user question the draft was answering.
        context: Source text the corrected answer must be grounded in.
        original_response: Draft answer that may contain hallucinations.

    Returns:
        The text of the model's corrected answer (``gpt-4o``, temperature 0).
    """
    reviewer_instructions = """You are a clinical fact-checker.
You will receive a question, the source context, and a draft answer that may contain hallucinations.
Rewrite the answer to include ONLY information from the context.
If the context doesn't contain enough information, say exactly that."""
    revision_request = f"""Question: {query}
Source context:
{context}
Draft answer (may contain hallucinations):
{original_response}
Write a corrected, context-grounded answer:"""

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": reviewer_instructions},
            {"role": "user", "content": revision_request},
        ],
        temperature=0,
    )
    return completion.choices[0].message.content

# Failure Mode 3: Context Window Overflow
Symptoms: API errors about token limits. Truncated responses. LLM ignoring context buried in the middle of a long prompt (the lost-in-the-middle problem).
import tiktoken
def count_rag_tokens(
    system_prompt: str,
    context_docs: list[dict],
    query: str,
    model: str = "gpt-4o",
) -> dict:
    """Count tokens in a full RAG prompt to catch overflow before sending.

    Args:
        system_prompt: System message text.
        context_docs: Retrieved documents; each must have a ``"content"`` key.
            Their contents are joined with blank lines before counting.
        query: The user question.
        model: Model name used to pick both the tokenizer and the context
            limit. Names missing from the limit table fall back to 128000.

    Returns:
        A dict with per-part token counts, ``total_tokens``, ``limit``,
        ``headroom``, ``utilization_pct``, an ``overflow`` flag that turns
        ``True`` once the prompt exceeds 90% of the limit, and
        ``approximate`` (``True`` when the model has no tiktoken mapping and
        a generic encoding was used, so counts are estimates).
    """
    # encoding_for_model raises KeyError for names tiktoken does not know,
    # which includes the non-OpenAI models in the limit table below. Fall
    # back to a generic encoding so those models get an estimate instead of
    # an exception.
    try:
        encoding = tiktoken.encoding_for_model(model)
        approximate = False
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
        approximate = True

    system_tokens = len(encoding.encode(system_prompt))
    query_tokens = len(encoding.encode(query))
    context_text = "\n\n".join(d["content"] for d in context_docs)
    context_tokens = len(encoding.encode(context_text))
    overhead = 50  # For message formatting
    total = system_tokens + query_tokens + context_tokens + overhead

    # Model context limits
    limits = {
        "gpt-4o": 128000,
        "gpt-4o-mini": 128000,
        "claude-sonnet-4-6": 200000,
        "claude-haiku-4-5": 200000,
    }
    limit = limits.get(model, 128000)

    return {
        "total_tokens": total,
        "context_tokens": context_tokens,
        "query_tokens": query_tokens,
        "system_tokens": system_tokens,
        "limit": limit,
        "headroom": limit - total,
        "overflow": total > limit * 0.9,  # Warn at 90% of limit
        "utilization_pct": round(total / limit * 100, 1),
        "approximate": approximate,
    }
def auto_trim_context(
    context_docs: list[dict],
    query: str,
    system_prompt: str,
    target_tokens: int = 3000,
    model: str = "gpt-4o",
) -> list[dict]:
    """Trim retrieved context so the whole prompt fits a token budget.

    Documents are taken in the order given (no sorting happens here, so pass
    them best-first). Whole documents are kept while they fit; the first one
    that does not fit is cut to the remaining budget, and everything after it
    is dropped.

    Args:
        context_docs: Retrieved documents; each must have a ``"content"`` key.
        query: The user question (counted against the budget).
        system_prompt: System message text (counted against the budget).
        target_tokens: Total token budget for system prompt, query, a fixed
            100-token overhead, and context.
        model: Model name used to pick the tokenizer. Names tiktoken does not
            recognise fall back to a generic encoding.

    Returns:
        The selected documents. A truncated document is a copy of the
        original with shortened ``"content"`` and ``"truncated": True``;
        untruncated documents are the original objects.
    """
    # encoding_for_model raises KeyError for unknown model names; use a
    # generic encoding in that case so trimming still works (approximately).
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    fixed_tokens = (
        len(encoding.encode(system_prompt))
        + len(encoding.encode(query))
        + 100  # Overhead
    )
    context_budget = target_tokens - fixed_tokens

    selected = []
    used = 0
    for doc in context_docs:
        token_ids = encoding.encode(doc["content"])
        if used + len(token_ids) <= context_budget:
            selected.append(doc)
            used += len(token_ids)
            continue

        # First document that does not fit: keep a prefix if it is worth it.
        remaining = context_budget - used
        if remaining > 100:  # At least 100 tokens worth including
            # Cut on token ids rather than a chars-per-token guess, which can
            # overshoot the budget on token-dense text. Decoding a cut token
            # sequence may leave a replacement character at the very end.
            selected.append({
                **doc,
                "content": encoding.decode(token_ids[:remaining]),
                "truncated": True,
            })
        break
    return selected

# Failure Mode 4: Slow Performance
Symptoms: p95 latency above 5 seconds. Timeouts under load. Users complaining about response time.
import time
import asyncio
def profile_rag_query(query: str, pipeline) -> dict:
    """Time each stage of a RAG query to find bottlenecks.

    Args:
        query: The question to run through the pipeline.
        pipeline: Object with a ``retriever`` exposing
            ``retrieve(embedding, top_k=...)`` and, optionally, a ``reranker``
            exposing ``rerank(query, docs)``. Retrieved documents must have a
            ``"content"`` key.

    Returns:
        A dict with ``timings`` (milliseconds per stage plus ``total_ms``),
        the name of the slowest stage in ``bottleneck``, a matching
        ``suggestion``, and the first 100 characters of the answer in
        ``response_preview``.
    """
    timings = {}

    # Stage 1: Embedding. perf_counter is monotonic, so intervals cannot be
    # skewed by wall-clock adjustments.
    started = time.perf_counter()
    emb = client.embeddings.create(model="text-embedding-3-small", input=[query])
    query_embedding = emb.data[0].embedding
    timings["embedding_ms"] = (time.perf_counter() - started) * 1000

    # Stage 2: Retrieval
    started = time.perf_counter()
    docs = pipeline.retriever.retrieve(query_embedding, top_k=5)
    timings["retrieval_ms"] = (time.perf_counter() - started) * 1000

    # Stage 3: Reranking (if present). A reranker attribute set to None is
    # treated the same as a missing one.
    reranker = getattr(pipeline, "reranker", None)
    if reranker is not None:
        started = time.perf_counter()
        docs = reranker.rerank(query, docs)
        timings["reranking_ms"] = (time.perf_counter() - started) * 1000

    # Stage 4: LLM generation (prompt assembly is included in this timing).
    started = time.perf_counter()
    context = "\n\n".join(d["content"] for d in docs)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Answer using only the context."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ],
        temperature=0,
    ).choices[0].message.content
    timings["llm_ms"] = (time.perf_counter() - started) * 1000

    # Pick the bottleneck BEFORE adding the total: the total is the sum of
    # all stages, so it would otherwise always win the max().
    bottleneck = max(timings, key=timings.get)
    timings["total_ms"] = sum(timings.values())

    suggestions = {
        "embedding_ms": "Pre-embed common queries. Use smaller embedding model.",
        "retrieval_ms": "Check vector store index type (HNSW vs flat). Add replicas. Check network latency to managed service.",
        "reranking_ms": "Use lighter cross-encoder. Reduce candidates from 50 to 20.",
        "llm_ms": "Use faster/cheaper model. Reduce context window. Enable response caching.",
    }
    return {
        "timings": timings,
        "bottleneck": bottleneck,
        "suggestion": suggestions.get(bottleneck, "Unknown bottleneck"),
        "response_preview": response[:100],
    }

# Failure Mode 5: Knowledge Base Issues
Symptoms: Questions that should be answerable get "I don't have information." Known facts return wrong answers.
def audit_knowledge_base(
    collection,
    test_queries: list[dict],
) -> dict:
    """Check that retrieval surfaces the keywords each test query expects.

    Args:
        collection: Vector-store collection exposing
            ``query(query_embeddings=..., n_results=...)`` and returning a
            mapping whose ``"documents"`` entry holds one list of document
            strings per query embedding.
        test_queries: Test cases shaped like
            ``{"query": "...", "expected_keywords": ["...", "..."]}``.

    Returns:
        A dict with ``overall_coverage`` (mean per-test coverage, rounded to
        two decimals), ``total_tests``, ``failing_tests`` (tests whose
        coverage is below 0.5) and ``details`` (every per-test record).
        A test with no expected keywords scores a coverage of 0 and is
        therefore listed as failing.
    """
    per_test = []
    for case in test_queries:
        question = case["query"]
        keywords = case.get("expected_keywords", [])

        embedding_response = client.embeddings.create(
            model="text-embedding-3-small", input=[question]
        )
        hits = collection.query(
            query_embeddings=[embedding_response.data[0].embedding],
            n_results=5,
        )
        top_documents = hits.get("documents", [[]])[0]

        # Case-insensitive substring match against all retrieved text at once.
        haystack = " ".join(top_documents).lower()
        present, absent = [], []
        for keyword in keywords:
            bucket = present if keyword.lower() in haystack else absent
            bucket.append(keyword)

        if top_documents:
            preview = top_documents[0][:150]
        else:
            preview = "No results"

        per_test.append({
            "query": question,
            "found_keywords": present,
            "missing_keywords": absent,
            "coverage": len(present) / max(len(keywords), 1),
            "top_result_preview": preview,
        })

    coverage_total = sum(record["coverage"] for record in per_test)
    mean_coverage = coverage_total / max(len(per_test), 1)
    below_half = [record for record in per_test if record["coverage"] < 0.5]
    return {
        "overall_coverage": round(mean_coverage, 2),
        "total_tests": len(per_test),
        "failing_tests": below_half,
        "details": per_test,
    }
def check_embedding_model_consistency(
    collection,
    test_text: str = "warfarin drug interaction CYP2C9",
) -> dict:
    """Compare stored and query-time embedding dimensions.

    A dimension mismatch proves that ingestion and querying use different
    embedding models. A match is only a necessary condition: different
    models can share a dimension (two entries in the table below are both
    1536), so ``match`` being ``True`` does not prove the models are the same.

    Args:
        collection: Vector-store collection exposing
            ``get(limit=..., include=["embeddings"])``.
        test_text: Text embedded with the query-time model to obtain its
            dimension.

    Returns:
        ``{"status": "error", ...}`` when the collection has no embeddings;
        otherwise a dict with both dimensions, ``match``,
        ``likely_stored_model`` (first known model with the stored
        dimension, or ``"unknown"``), ``stored_model_candidates`` (every
        known model with that dimension), ``query_model`` and ``status``.
    """
    query_model_name = "text-embedding-3-small"

    # Get a sample document embedding from the collection.
    sample = collection.get(limit=1, include=["embeddings"])
    stored = sample["embeddings"]
    # Use an explicit None/len check: the embeddings may be returned as a
    # NumPy array, whose truth value is ambiguous and raises on `not`.
    if stored is None or len(stored) == 0:
        return {"status": "error", "message": "Collection is empty"}
    stored_dim = len(stored[0])

    # Check query embedding dimension.
    query_emb = client.embeddings.create(
        model=query_model_name, input=[test_text]
    )
    query_dim = len(query_emb.data[0].embedding)

    model_dims = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
        "BAAI/bge-small-en-v1.5": 384,
        "BAAI/bge-large-en-v1.5": 1024,
    }
    candidates = [name for name, dim in model_dims.items() if dim == stored_dim]
    stored_model = candidates[0] if candidates else "unknown"

    dims_match = stored_dim == query_dim
    return {
        "stored_embedding_dim": stored_dim,
        "query_embedding_dim": query_dim,
        "match": dims_match,
        "likely_stored_model": stored_model,
        "stored_model_candidates": candidates,
        "query_model": query_model_name,
        "status": "ok" if dims_match else "MISMATCH — re-ingest with consistent model",
    }

# RAG Debugging Runbook
When a query fails, follow this order:
- Is the knowledge base populated? Run `collection.count()` or equivalent.
- Is the right model used for both ingestion and querying? Check embedding dimensions.
- Does retrieval return anything? Log top-5 docs with scores.
- Are scores high enough? Cosine similarity below roughly 0.5–0.6 usually signals a poor semantic match.
- Are retrieved docs actually relevant? Read them. If they're irrelevant, the problem is retrieval.
- Is the answer in the context? If docs are relevant but answer is wrong, it's a generation problem.
- Is the context too long? Check token count — LLMs degrade with very long contexts.
- Is the system prompt conflicting? Test with a minimal prompt.
| Symptom | Likely Cause | Fix |
|---|---|---|
| Empty retrieval | Empty knowledge base or wrong collection | Verify ingestion ran |
| Low scores (below 0.5) | Embedding model mismatch | Re-ingest with same model |
| Irrelevant docs retrieved | Poor chunking strategy | Re-chunk with sections strategy |
| Correct docs retrieved, wrong answer | Hallucination or context overflow | Add faithfulness prompt, trim context |
| "I don't know" despite relevant docs | Context too large, lost-in-middle | Reorder docs (relevant first) |
| Very slow queries | LLM latency | Add caching, use faster model |
| Inconsistent answers | High LLM temperature | Set temperature=0 |
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.