GenAI & LLM Interviews · Lesson 15 of 30
RAG Evaluation: Recall, MRR, Golden Sets
The RAG Evaluation Problem
A RAG system has two failure modes:
- Retrieval failure: The right documents were not retrieved
- Generation failure: Documents were retrieved but the answer is wrong or hallucinated
Good evaluation covers both. The three core metrics:
- Faithfulness: Does the answer only use information from the retrieved context?
- Context Relevance: Were the retrieved chunks actually relevant to the question?
- Answer Relevance: Does the answer actually address the question asked?
RAGAS: Automated RAG Evaluation
# pip install ragas
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_recall,
context_precision,
answer_correctness,
answer_similarity,
)
from datasets import Dataset
# RAGAS requires a dataset with these columns:
# - question: the user query
# - answer: the generated response
# - contexts: list of retrieved document strings
# - ground_truth: the reference answer (for recall/correctness)
def build_ragas_dataset(
    rag_pipeline,
    test_cases: list[dict],
) -> Dataset:
    """Execute the pipeline once per test case and collect RAGAS-ready rows.

    Args:
        rag_pipeline: Object exposing ``query(question)``. The result is read
            as a mapping with a ``"response"`` entry and an optional
            ``"retrieved_docs"`` list whose items carry a ``"content"`` entry.
        test_cases: Items shaped like
            ``{"question": "...", "ground_truth": "..."}``. A missing
            ``ground_truth`` is recorded as an empty string.

    Returns:
        A ``Dataset`` with ``question``, ``answer``, ``contexts`` and
        ``ground_truth`` columns, one row per test case.
    """

    def _to_row(case: dict) -> dict:
        # One pipeline call per test case; the row mirrors the column layout
        # described in the comment block above this function.
        question = case["question"]
        output = rag_pipeline.query(question)
        return {
            "question": question,
            "answer": output["response"],
            "contexts": [
                doc["content"] for doc in output.get("retrieved_docs", [])
            ],
            "ground_truth": case.get("ground_truth", ""),
        }

    return Dataset.from_list([_to_row(case) for case in test_cases])
def run_ragas_evaluation(
    rag_pipeline,
    test_cases: list[dict],
    llm_judge_model: str = "gpt-4o",
) -> dict:
    """Score a RAG pipeline with the RAGAS metric suite.

    The pipeline is run over ``test_cases`` via ``build_ragas_dataset`` and
    the resulting dataset is passed to ``ragas.evaluate`` with an LLM judge.

    Args:
        rag_pipeline: Pipeline object forwarded to ``build_ragas_dataset``.
        test_cases: Question / ground-truth dicts forwarded unchanged.
        llm_judge_model: Chat model name used for the judge (temperature 0).

    Returns:
        A dict with one entry per metric (``faithfulness``,
        ``answer_relevancy``, ``context_precision``, ``context_recall``,
        ``answer_correctness``) plus ``summary``, the ``describe()`` table of
        the per-row results as a nested dict.
    """
    from ragas.llms import LangchainLLMWrapper
    from langchain_openai import ChatOpenAI

    # RAGAS relies on an LLM to judge faithfulness and relevance.
    chat_model = ChatOpenAI(model=llm_judge_model, temperature=0)
    judge_llm = LangchainLLMWrapper(chat_model)

    dataset = build_ragas_dataset(rag_pipeline, test_cases)
    selected_metrics = [
        faithfulness,
        answer_relevancy,
        context_precision,
        context_recall,
        answer_correctness,
    ]
    results = evaluate(dataset=dataset, metrics=selected_metrics, llm=judge_llm)

    metric_names = (
        "faithfulness",
        "answer_relevancy",
        "context_precision",
        "context_recall",
        "answer_correctness",
    )
    # NOTE(review): whether results[name] is an aggregate float or a list of
    # per-row scores depends on the installed ragas version — confirm.
    report = {name: results[name] for name in metric_names}
    report["summary"] = results.to_pandas().describe().to_dict()
    return report
# Clinical test cases: each entry pairs a question with the reference answer
# that is read as "ground_truth" when the evaluation dataset is built.
CLINICAL_TEST_CASES = [
    {
        "question": "What is the mechanism of action of warfarin?",
        "ground_truth": "Warfarin inhibits vitamin K epoxide reductase (VKORC1), blocking the regeneration of vitamin K, which is required for the synthesis of clotting factors II, VII, IX, and X.",
    },
    {
        "question": "What drugs are contraindicated with fluoxetine?",
        "ground_truth": "MAOIs are contraindicated with fluoxetine due to risk of serotonin syndrome. Pimozide and thioridazine are also contraindicated due to CYP2D6 inhibition.",
    },
    {
        "question": "What is the recommended starting dose of metformin for type 2 diabetes?",
        "ground_truth": "The typical starting dose is 500mg twice daily or 850mg once daily with meals, titrated gradually to reduce GI side effects.",
    },
]


# Custom Faithfulness Metric
Build your own LLM judge for domain-specific faithfulness:
from openai import OpenAI
import json
# Module-level OpenAI client used by the LLM-judge helpers in this file.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()
# Prompt template for the faithfulness judge. It is filled in with str.format
# using the {context}, {question} and {response} fields; the doubled braces
# ({{ and }}) are literal braces of the JSON example, escaped for str.format.
FAITHFULNESS_PROMPT = """You are evaluating whether an AI response is faithful to its source documents.
A response is FAITHFUL if every factual claim in the response can be directly supported by the provided context.
A response is UNFAITHFUL if it contains claims not present in or contradicted by the context.
Context:
{context}
Question: {question}
Response: {response}
For each claim in the response:
1. Identify the claim
2. Find supporting text in context (or mark as unsupported)
3. Score: supported or unsupported
Return JSON:
{{
"claims": [
{{"claim": "...", "supported": true/false, "evidence": "quote from context or null"}}
],
"faithfulness_score": 0.0-1.0,
"verdict": "faithful/partially_faithful/unfaithful",
"unsupported_claims": ["list of unsupported claims"]
}}"""
def evaluate_faithfulness(
    question: str,
    context: str,
    response: str,
    model: str = "gpt-4o",
) -> dict:
    """Ask an LLM judge whether ``response`` is supported by ``context``.

    Args:
        question: The user question that produced the response.
        context: Retrieved text the response should be grounded in.
        response: The generated answer under evaluation.
        model: Chat model used as the judge.

    Returns:
        The judge's reply parsed from JSON. The prompt asks for ``claims``,
        ``faithfulness_score``, ``verdict`` and ``unsupported_claims``, but
        the structure is whatever the model actually returns.
    """
    # Only the first 4000 characters of the context are shown to the judge.
    prompt = FAITHFULNESS_PROMPT.format(
        context=context[:4000],
        question=question,
        response=response,
    )
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)
def evaluate_context_relevance(
    question: str,
    contexts: list[str],
    model: str = "gpt-4o-mini",
) -> dict:
    """Score how relevant each retrieved chunk is to the question.

    One judge call is made per chunk; only the first 1000 characters of each
    chunk are sent.

    Args:
        question: The user question.
        contexts: Retrieved chunk texts, in retrieval order.
        model: Chat model used as the judge.

    Returns:
        A dict with ``chunks`` (one ``{"chunk_index", "relevance_score",
        "reason"}`` entry per chunk), ``average_relevance`` (mean score, or
        ``0.0`` when ``contexts`` is empty) and ``relevant_chunks`` (number of
        chunks scoring at least 0.7).

    Raises:
        KeyError: If the judge reply has no ``relevance_score`` field.
    """
    scored_contexts = []
    for i, ctx in enumerate(contexts):
        prompt = (
            "Rate how relevant this context chunk is for answering the question.\n"
            f"Question: {question}\n"
            "Context chunk:\n"
            f"{ctx[:1000]}\n"
            "Return JSON:\n"
            '{"relevance_score": 0.0-1.0, "reason": "brief explanation"}'
        )
        result = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
            temperature=0,
        )
        scored = json.loads(result.choices[0].message.content)
        scored_contexts.append({
            "chunk_index": i,
            # Coerce so a score returned as a JSON string ("0.8") still works
            # in the arithmetic and threshold comparison below.
            "relevance_score": float(scored["relevance_score"]),
            # The explanation is informational; tolerate a judge omitting it.
            "reason": scored.get("reason", ""),
        })

    # No retrieved chunks means nothing to average: report 0.0 rather than
    # dividing by zero.
    if scored_contexts:
        total = sum(c["relevance_score"] for c in scored_contexts)
        avg_relevance = total / len(scored_contexts)
    else:
        avg_relevance = 0.0

    return {
        "chunks": scored_contexts,
        "average_relevance": avg_relevance,
        "relevant_chunks": sum(
            1 for c in scored_contexts if c["relevance_score"] >= 0.7
        ),
    }


# Retrieval Metrics
Measure retrieval quality with labeled datasets:
from dataclasses import dataclass
@dataclass
class RetrievalTestCase:
    """One labelled retrieval example: a query and the doc IDs judged relevant."""

    # The query text to run through the retriever.
    query: str
    # Ground-truth IDs of the documents that are relevant to the query.
    relevant_doc_ids: list[str]
    # Optional label for the case; empty string when not supplied.
    topic: str = ""
def precision_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
    """Return the fraction of the top-``k`` retrieved docs that are relevant.

    The denominator is always ``k``, so a retriever that returns fewer than
    ``k`` documents is penalised for the missing slots.

    Args:
        retrieved_ids: Document IDs in ranked order, best first.
        relevant_ids: Ground-truth relevant document IDs.
        k: Cutoff rank.

    Returns:
        ``hits / k``, or ``0.0`` when ``k`` is not positive (no documents are
        considered, so nothing can be counted as a hit).
    """
    # Guard the degenerate cutoff: k == 0 would divide by zero and a negative
    # k would slice from the wrong end and yield a negative "precision".
    if k <= 0:
        return 0.0
    hits = sum(1 for doc_id in retrieved_ids[:k] if doc_id in relevant_ids)
    return hits / k
def recall_at_k(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
    """Return the share of relevant docs that appear in the top-``k`` results.

    Args:
        retrieved_ids: Document IDs in ranked order, best first.
        relevant_ids: Ground-truth relevant document IDs.
        k: Cutoff rank.

    Returns:
        Number of distinct relevant IDs within the first ``k`` results divided
        by the number of relevant IDs; ``0.0`` when ``relevant_ids`` is empty.
    """
    if not relevant_ids:
        return 0.0
    # A set comprehension so a doc ID repeated in the results counts once.
    found = {doc_id for doc_id in retrieved_ids[:k] if doc_id in relevant_ids}
    return len(found) / len(relevant_ids)
def mean_reciprocal_rank(retrieved_ids: list[str], relevant_ids: set[str]) -> float:
    """Return the reciprocal rank of the first relevant document.

    This is the per-query term; averaging it over queries gives MRR.

    Args:
        retrieved_ids: Document IDs in ranked order, best first.
        relevant_ids: Ground-truth relevant document IDs.

    Returns:
        ``1 / rank`` of the first relevant hit (ranks start at 1), or ``0.0``
        when no retrieved document is relevant.
    """
    first_hit_rank = next(
        (
            rank
            for rank, doc_id in enumerate(retrieved_ids, start=1)
            if doc_id in relevant_ids
        ),
        None,
    )
    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank
def normalized_dcg(retrieved_ids: list[str], relevant_ids: set[str], k: int) -> float:
    """Return NDCG@k with binary relevance.

    Each relevant document contributes ``1 / log2(rank + 1)`` at the rank
    where it first appears. The sum is normalised by the best achievable
    score, i.e. all relevant documents placed at the top ranks.

    Args:
        retrieved_ids: Document IDs in ranked order, best first.
        relevant_ids: Ground-truth relevant document IDs.
        k: Cutoff rank.

    Returns:
        A value in ``[0.0, 1.0]``; ``0.0`` when ``k`` is not positive or
        ``relevant_ids`` is empty.
    """
    import math

    if k <= 0 or not relevant_ids:
        return 0.0

    # Credit each relevant document only once. Counting a repeated ID again
    # would let the actual DCG exceed the ideal DCG and push NDCG above 1.0.
    credited = set()
    actual_dcg = 0.0
    for rank, doc_id in enumerate(retrieved_ids[:k], start=1):
        if doc_id in relevant_ids and doc_id not in credited:
            credited.add(doc_id)
            actual_dcg += 1.0 / math.log2(rank + 1)

    # Ideal ranking: every relevant document (up to k of them) comes first.
    ideal_hits = min(k, len(relevant_ids))
    ideal_dcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    return actual_dcg / ideal_dcg
def evaluate_retriever(
    retriever,
    test_cases: list[RetrievalTestCase],
    k_values: list[int] = None,
) -> dict:
    """Run a full retrieval evaluation suite over labelled queries.

    Each query is embedded with ``text-embedding-3-small``, passed to
    ``retriever.retrieve(query_emb, top_k=max(k_values))``, and the returned
    items' ``"id"`` values are scored against the labelled relevant IDs.

    Args:
        retriever: Object exposing ``retrieve(embedding, top_k=...)`` that
            returns items indexable by ``"id"``.
        test_cases: Labelled queries to evaluate.
        k_values: Cutoffs for precision / recall / NDCG. Defaults to
            ``[1, 3, 5, 10]``.

    Returns:
        A dict with ``mrr``, ``precision`` (``p@k``), ``recall`` (``r@k``),
        ``ndcg`` (``ndcg@k``) and ``n_queries``. All averages are ``0.0``
        when ``test_cases`` is empty.

    Raises:
        ValueError: If ``k_values`` is an empty list.
    """
    if k_values is None:
        k_values = [1, 3, 5, 10]
    if not k_values:
        raise ValueError("k_values must contain at least one cutoff")

    def _mean(values: list[float]) -> float:
        # Average that tolerates an empty evaluation set.
        return sum(values) / len(values) if values else 0.0

    all_mrr = []
    precision_by_k = {k: [] for k in k_values}
    recall_by_k = {k: [] for k in k_values}
    ndcg_by_k = {k: [] for k in k_values}

    embed_client = None
    if test_cases:
        # Build the embedding client once rather than once per query, and
        # only when there is at least one query to embed.
        from openai import OpenAI as OAI

        embed_client = OAI()
    max_k = max(k_values)

    for case in test_cases:
        emb_resp = embed_client.embeddings.create(
            model="text-embedding-3-small", input=[case.query]
        )
        query_emb = emb_resp.data[0].embedding
        # Retrieve once at the largest cutoff; smaller cutoffs reuse a prefix.
        retrieved = retriever.retrieve(query_emb, top_k=max_k)
        retrieved_ids = [r["id"] for r in retrieved]
        relevant_ids = set(case.relevant_doc_ids)

        all_mrr.append(mean_reciprocal_rank(retrieved_ids, relevant_ids))
        for k in k_values:
            precision_by_k[k].append(precision_at_k(retrieved_ids, relevant_ids, k))
            recall_by_k[k].append(recall_at_k(retrieved_ids, relevant_ids, k))
            ndcg_by_k[k].append(normalized_dcg(retrieved_ids, relevant_ids, k))

    return {
        "mrr": _mean(all_mrr),
        "precision": {f"p@{k}": _mean(precision_by_k[k]) for k in k_values},
        "recall": {f"r@{k}": _mean(recall_by_k[k]) for k in k_values},
        "ndcg": {f"ndcg@{k}": _mean(ndcg_by_k[k]) for k in k_values},
        "n_queries": len(test_cases),
    }


# TruLens Integration
TruLens provides automatic tracing and feedback for RAG pipelines:
# pip install trulens trulens-providers-openai
from trulens.core import TruSession
from trulens.apps.custom import TruCustomApp, instrument
from trulens.providers.openai import OpenAI as TruOpenAI
# Initialize the TruLens session. Both statements run at import time.
# NOTE(review): reset_database() presumably clears previously recorded
# apps/records — confirm this is wanted on every import of this module.
session = TruSession()
session.reset_database()
# Set up the OpenAI-backed feedback provider (gpt-4o acts as the judge model).
provider = TruOpenAI(model_engine="gpt-4o")
# Feedback functions: these names are bound to the provider's methods as-is
# and are later passed in the `feedbacks` list of the TruCustomApp recorder.
# NOTE(review): TruLens examples usually wrap provider methods in Feedback(...)
# with input/output selectors before passing them as feedbacks — confirm that
# bare methods are accepted by the installed TruLens version.
f_groundedness = provider.groundedness_measure_with_cot_reasons
f_qa_relevance = provider.relevance_with_cot_reasons
f_context_relevance = provider.context_relevance_with_cot_reasons
class InstrumentedRAGPipeline:
    """RAG pipeline whose retrieve / generate / query steps are instrumented.

    Each step is decorated with ``@instrument`` so TruLens can trace it.
    """

    def __init__(self, retriever, llm_client):
        """Store the retriever and the chat-completion client."""
        self.retriever = retriever
        self.llm = llm_client

    @instrument
    def retrieve(self, query: str) -> list[str]:
        """Embed the query and return the text of the top 5 retrieved docs."""
        from openai import OpenAI as OAI

        embedding_response = OAI().embeddings.create(
            model="text-embedding-3-small", input=[query]
        )
        query_vector = embedding_response.data[0].embedding
        hits = self.retriever.retrieve(query_vector, top_k=5)
        return [hit["content"] for hit in hits]

    @instrument
    def generate(self, query: str, contexts: list[str]) -> str:
        """Answer the query with gpt-4o, given the contexts joined by blank lines."""
        joined_context = "\n\n".join(contexts)
        completion = self.llm.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "Answer using only the provided context."},
                {"role": "user", "content": f"Context:\n{joined_context}\n\nQuestion: {query}"},
            ],
            temperature=0,
        )
        return completion.choices[0].message.content

    @instrument
    def query(self, query: str) -> str:
        """Run retrieval followed by generation and return the answer text."""
        retrieved_contexts = self.retrieve(query)
        return self.generate(query, retrieved_contexts)
def setup_trulens_recorder(pipeline: InstrumentedRAGPipeline) -> TruCustomApp:
    """Wrap ``pipeline`` in a ``TruCustomApp`` recorder.

    The recorder is registered as app ``"ClinicalRAG"``, version ``"v1"``,
    with the three module-level feedback functions attached.

    Args:
        pipeline: The instrumented pipeline to record.

    Returns:
        The configured ``TruCustomApp`` instance.
    """
    feedback_functions = [f_groundedness, f_qa_relevance, f_context_relevance]
    recorder = TruCustomApp(
        pipeline,
        app_name="ClinicalRAG",
        app_version="v1",
        feedbacks=feedback_functions,
    )
    return recorder


# End-to-End Evaluation Report
def run_full_evaluation(
    rag_pipeline,
    retriever,
    test_cases: list[dict],
    retrieval_cases: list[RetrievalTestCase] = None,
    audit_limit: int = 10,
) -> dict:
    """Comprehensive RAG evaluation: retrieval + generation + faithfulness.

    Steps, in order: the RAGAS suite over ``test_cases``; an optional
    retrieval evaluation over ``retrieval_cases``; a per-example faithfulness
    audit of the first ``audit_limit`` test cases; and a weighted summary
    score mapped to a letter grade.

    Args:
        rag_pipeline: Pipeline exposing ``query(question)``; its result is
            read for ``"response"`` and an optional ``"retrieved_docs"`` list.
        retriever: Retriever passed to ``evaluate_retriever``.
        test_cases: Dicts with at least a ``"question"`` entry.
        retrieval_cases: Labelled retrieval cases; the retrieval step is
            skipped when this is ``None`` or empty.
        audit_limit: Number of leading test cases to audit. Each audited case
            re-runs the pipeline and makes one judge call, so this bounds cost.

    Returns:
        A dict with ``ragas``, optionally ``retrieval``,
        ``faithfulness_audit`` (``mean``, ``min``, ``n_audited``; ``mean``
        and ``min`` are ``0.0`` when nothing was audited), ``overall_score``
        and ``grade`` (``"A"`` to ``"D"``).
    """
    results = {}

    # 1. RAGAS evaluation (faithfulness, relevance, correctness)
    print("Running RAGAS evaluation...")
    ragas_results = run_ragas_evaluation(rag_pipeline, test_cases)
    results["ragas"] = ragas_results

    # 2. Retrieval-level evaluation
    if retrieval_cases:
        print("Running retrieval evaluation...")
        results["retrieval"] = evaluate_retriever(retriever, retrieval_cases)

    # 3. Per-example faithfulness audit
    print("Running faithfulness audit...")
    faithfulness_scores = []
    for case in test_cases[:audit_limit]:
        rag_result = rag_pipeline.query(case["question"])
        context = "\n\n".join(
            d["content"] for d in rag_result.get("retrieved_docs", [])
        )
        faith = evaluate_faithfulness(
            case["question"], context, rag_result["response"]
        )
        faithfulness_scores.append(faith["faithfulness_score"])

    # With nothing audited there is no mean or minimum to compute; report 0.0
    # with n_audited == 0 instead of raising on the empty list.
    n_audited = len(faithfulness_scores)
    results["faithfulness_audit"] = {
        "mean": sum(faithfulness_scores) / n_audited if n_audited else 0.0,
        "min": min(faithfulness_scores, default=0.0),
        "n_audited": n_audited,
    }

    # 4. Summary grade: weighted blend of three RAGAS metrics.
    ragas_score = (
        ragas_results.get("faithfulness", 0) * 0.4
        + ragas_results.get("answer_relevancy", 0) * 0.3
        + ragas_results.get("context_precision", 0) * 0.3
    )
    results["overall_score"] = ragas_score
    results["grade"] = (
        "A" if ragas_score >= 0.85 else
        "B" if ragas_score >= 0.70 else
        "C" if ragas_score >= 0.55 else "D"
    )
    return results


# Evaluation Metric Reference
| Metric | What it measures | Range | Target |
|---|---|---|---|
| Faithfulness | Answer only uses context | 0-1 | Above 0.90 |
| Answer Relevancy | Answer addresses the question | 0-1 | Above 0.85 |
| Context Precision | Fraction of retrieved chunks that are relevant | 0-1 | Above 0.70 |
| Context Recall | Fraction of needed info retrieved | 0-1 | Above 0.80 |
| MRR | First relevant doc rank | 0-1 | Above 0.70 |
| NDCG@5 | Ranked retrieval quality | 0-1 | Above 0.75 |
| Precision@5 | Relevant docs in top 5 | 0-1 | Above 0.60 |