RAG Evaluation Metrics
The metrics used to evaluate RAG systems — retrieval quality (precision, recall, MRR, NDCG) and generation quality (faithfulness, answer relevance, context utilisation).
Two Evaluation Surfaces
RAG has two components that must be evaluated independently:
Retrieval evaluation:
Did we retrieve the right documents?
Metrics: Precision@k, Recall@k, MRR, NDCG@k
Generation evaluation:
Did the LLM generate a good answer given those documents?
Metrics: Faithfulness, Answer Relevance, Context Precision/Recall
End-to-end evaluation:
Is the final answer correct?
Metrics: Exact Match, F1, LLM-as-judge correctness score
Separating retrieval and generation eval lets you identify where the pipeline fails.
Retrieval Metrics
from typing import Sequence
def precision_at_k(retrieved: list[str], relevant: set[str], k: int) -> float:
    """Return Precision@k: the fraction of the top-k retrieved docs that are relevant.

    Args:
        retrieved: Retrieved document IDs, best-ranked first.
        relevant: IDs of the documents judged relevant for the query.
        k: Cut-off rank.

    Returns:
        The number of relevant IDs in ``retrieved[:k]`` divided by ``k``.
        Returns 0.0 when ``k`` is not positive.
    """
    if k <= 0:
        # k == 0 would divide by zero; a negative k would slice from the
        # wrong end of the ranking and produce a negative score.
        return 0.0
    hits = sum(1 for doc_id in retrieved[:k] if doc_id in relevant)
    # The denominator is k itself, so returning fewer than k docs lowers the score.
    return hits / k
def recall_at_k(retrieved: list[str], relevant: set[str], k: int) -> float:
    """Return Recall@k: the fraction of relevant docs found in the top-k retrieved.

    Args:
        retrieved: Retrieved document IDs, best-ranked first.
        relevant: IDs of the documents judged relevant for the query.
        k: Cut-off rank.

    Returns:
        The number of distinct relevant IDs in ``retrieved[:k]`` divided by
        ``len(relevant)``. Returns 0.0 when ``relevant`` is empty or ``k`` is
        not positive.
    """
    if not relevant or k <= 0:
        return 0.0
    # Collect hits in a set so an ID that appears twice in the ranking is
    # counted once; otherwise recall could exceed 1.0.
    found = {doc_id for doc_id in retrieved[:k] if doc_id in relevant}
    return len(found) / len(relevant)
def mean_reciprocal_rank(
    retrieved_lists: list[list[str]],
    relevant_sets: list[set[str]],
) -> float:
    """Return the mean reciprocal rank (MRR) over a batch of queries.

    Each query contributes ``1 / rank`` of the first relevant document in its
    ranking (ranks start at 1), or 0.0 when no retrieved document is relevant.
    Rankings and relevance sets are paired positionally; an empty batch
    yields 0.0.
    """
    reciprocal_ranks = [
        # First matching position wins; the default covers "no relevant doc".
        next(
            (1.0 / position
             for position, candidate in enumerate(ranking, start=1)
             if candidate in gold),
            0.0,
        )
        for ranking, gold in zip(retrieved_lists, relevant_sets)
    ]
    if not reciprocal_ranks:
        return 0.0
    return sum(reciprocal_ranks) / len(reciprocal_ranks)
def ndcg_at_k(
    retrieved: list[str],
    relevance_scores: dict[str, float],  # doc_id → relevance (0, 1, 2, ...)
    k: int,
) -> float:
    """Return NDCG@k: normalised discounted cumulative gain.

    Uses linear gain: each document contributes ``relevance / log2(rank + 1)``
    with ranks starting at 1. The ideal ordering is the ``k`` highest values
    in ``relevance_scores``.

    Args:
        retrieved: Retrieved document IDs, best-ranked first.
        relevance_scores: Graded relevance per document ID; IDs that are
            missing are treated as relevance 0.
        k: Cut-off rank.

    Returns:
        DCG of ``retrieved[:k]`` divided by the ideal DCG. Returns 0.0 when
        ``k`` is not positive or when the ideal DCG is zero.
    """
    import math

    if k <= 0:
        # A negative k would slice both rankings from the wrong end.
        return 0.0

    def dcg(gains) -> float:
        # Index 0 is rank 1, so the discount is log2(index + 2).
        return sum(gain / math.log2(index + 2) for index, gain in enumerate(gains))

    seen = set()
    actual_gains = []
    for doc_id in retrieved[:k]:
        if doc_id in seen:
            # A repeated ID keeps its slot but earns nothing; crediting it
            # again could push the score above 1.0.
            actual_gains.append(0.0)
            continue
        seen.add(doc_id)
        actual_gains.append(relevance_scores.get(doc_id, 0))

    ideal_gains = sorted(relevance_scores.values(), reverse=True)[:k]
    ideal_dcg = dcg(ideal_gains)
    if ideal_dcg == 0:
        return 0.0
    return dcg(actual_gains) / ideal_dcg


# Generation Metrics
from anthropic import Anthropic
# Shared Anthropic client; the LLM-based metric functions in this file call
# client.messages.create on it.
client = Anthropic()
# Judge prompt for faithfulness scoring. It is filled with str.format using the
# `context` and `answer` fields; the doubled braces around the JSON example are
# escapes that become literal braces after formatting.
FAITHFULNESS_PROMPT = """You are evaluating whether a generated answer is faithful to the retrieved context.
Context:
{context}
Generated answer:
{answer}
Task: Identify every factual claim in the answer. For each claim, determine whether it is directly supported by the context.
Faithfulness score = (supported claims) / (total claims)
Respond with JSON:
{{"total_claims": N, "supported_claims": M, "faithfulness_score": M/N, "unsupported": ["claim1", ...]}}"""
def compute_faithfulness(
    answer: str,
    context_chunks: list[str],
) -> dict:
    """Ask an LLM judge how much of ``answer`` is supported by ``context_chunks``.

    The chunks are joined with blank lines, inserted into
    ``FAITHFULNESS_PROMPT`` together with the answer, and sent as a single
    user message.

    Args:
        answer: The generated answer to evaluate.
        context_chunks: Retrieved text chunks the answer should be grounded in.

    Returns:
        The JSON object parsed from the model's reply. If the reply is empty,
        is not valid JSON, or is not a JSON object, returns
        ``{"faithfulness_score": None, "error": "parse_error"}``.
    """
    import json

    context = "\n\n".join(context_chunks)
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=500,
        messages=[{"role": "user", "content": FAITHFULNESS_PROMPT.format(
            context=context, answer=answer
        )}],
    )

    parse_failure = {"faithfulness_score": None, "error": "parse_error"}
    if not response.content:
        # No content blocks came back, so there is nothing to parse.
        return parse_failure

    raw = response.content[0].text
    # The reply may surround the JSON with prose or a ``` fence, so parse only
    # the outermost {...} span when one exists.
    start, end = raw.find("{"), raw.rfind("}")
    candidate = raw[start:end + 1] if 0 <= start < end else raw
    try:
        verdict = json.loads(candidate)
    except json.JSONDecodeError:
        return parse_failure
    if not isinstance(verdict, dict):
        # A bare list/number/string is valid JSON but not the promised object.
        return parse_failure
    return verdict
# Prompt asking the model to invent 3 questions that the given answer would best
# respond to. It is filled with str.format using the `answer` field; the doubled
# braces become literal braces after formatting.
ANSWER_RELEVANCE_PROMPT = """Given this answer, generate 3 questions that this answer would be the best response to.
Answer: {answer}
Return JSON: {{"questions": ["q1", "q2", "q3"]}}"""
def compute_answer_relevance(
    query: str,
    answer: str,
    embedder,
) -> float:
    """Answer relevance: mean cosine similarity of the query to reverse-questions.

    The model is asked (via ``ANSWER_RELEVANCE_PROMPT``) to generate questions
    that ``answer`` would best respond to. The query and those questions are
    embedded, and the mean cosine similarity between the query embedding and
    each question embedding is returned.

    Args:
        query: The original user question.
        answer: The generated answer being evaluated.
        embedder: Object with an ``encode`` method. NOTE(review): assumed to
            take a list of strings and return one vector per string — confirm
            against the embedder actually used.

    Returns:
        Mean cosine similarity as a float. Returns 0.0 when the model reply is
        empty, cannot be parsed, or contains no usable question list.
    """
    import json
    import numpy as np

    response = client.messages.create(
        model="claude-haiku-4-5-20251001",
        max_tokens=200,
        messages=[{"role": "user", "content": ANSWER_RELEVANCE_PROMPT.format(answer=answer)}],
    )
    if not response.content:
        return 0.0

    raw = response.content[0].text
    # The reply may surround the JSON with prose or a ``` fence, so parse only
    # the outermost {...} span when one exists.
    start, end = raw.find("{"), raw.rfind("}")
    candidate = raw[start:end + 1] if 0 <= start < end else raw
    try:
        result = json.loads(candidate)
        reverse_questions = result["questions"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers valid JSON that is not an object (e.g. a bare list).
        return 0.0
    if not isinstance(reverse_questions, list) or not reverse_questions:
        # Averaging over zero questions would produce NaN.
        return 0.0

    query_emb = embedder.encode([query])[0]
    reverse_embs = embedder.encode(reverse_questions)
    query_norm = np.linalg.norm(query_emb)
    sims = []
    for rq_emb in reverse_embs:
        denom = query_norm * np.linalg.norm(rq_emb)
        # A zero-length vector has no direction; score it 0.0 instead of NaN.
        sims.append(float(np.dot(query_emb, rq_emb) / denom) if denom else 0.0)
    return float(np.mean(sims))


# Evaluation Dataset Construction
from dataclasses import dataclass
@dataclass
class RAGEvalCase:
    """A single labelled example used to evaluate a RAG pipeline."""

    # Question text for this case.
    query: str
    # Chunk IDs treated as relevant when scoring retrieval.
    relevant_chunk_ids: set[str]
    # Reference answer used when scoring generation.
    reference_answer: str
    # Names of the documents that should be cited.
    ground_truth_sources: list[str]
def run_retrieval_eval(
    eval_cases: list[RAGEvalCase],
    retriever,
    k: int = 5,
) -> dict:
    """Run the retriever over every eval case and average the retrieval metrics.

    Each case's query is retrieved exactly once with ``top_k=k``; the chunk ID
    is read from ``result["metadata"]["chunk_id"]`` for every returned result.

    Args:
        eval_cases: Labelled cases providing ``query`` and ``relevant_chunk_ids``.
        retriever: Object exposing ``retrieve(query, top_k=...)``.
        k: Cut-off rank used for retrieval and for Precision@k / Recall@k.

    Returns:
        A dict with keys ``precision@{k}``, ``recall@{k}`` and ``mrr``, each
        averaged over all cases. All three are 0.0 when ``eval_cases`` is empty.
    """
    if not eval_cases:
        # Avoid dividing by zero when there is nothing to evaluate.
        return {f"precision@{k}": 0.0, f"recall@{k}": 0.0, "mrr": 0.0}

    precisions, recalls, retrieved_lists = [], [], []
    for case in eval_cases:
        results = retriever.retrieve(case.query, top_k=k)
        retrieved_ids = [r["metadata"]["chunk_id"] for r in results]
        # Keep the ranking so MRR can reuse it instead of re-querying the
        # retriever for every case on every iteration.
        retrieved_lists.append(retrieved_ids)
        precisions.append(precision_at_k(retrieved_ids, case.relevant_chunk_ids, k))
        recalls.append(recall_at_k(retrieved_ids, case.relevant_chunk_ids, k))

    mrr = mean_reciprocal_rank(
        retrieved_lists=retrieved_lists,
        relevant_sets=[case.relevant_chunk_ids for case in eval_cases],
    )
    return {
        f"precision@{k}": sum(precisions) / len(precisions),
        f"recall@{k}": sum(recalls) / len(recalls),
        "mrr": mrr,
    }


# Target Scores
Metric | Poor | Acceptable | Good | Excellent
-------------------|-------|------------|-------|----------
Precision@5 | <0.40 | 0.40–0.60 | 0.60–0.80 | >0.80
Recall@5 | <0.50 | 0.50–0.70 | 0.70–0.85 | >0.85
MRR | <0.50 | 0.50–0.70 | 0.70–0.85 | >0.85
Faithfulness | <0.70 | 0.70–0.85 | 0.85–0.95 | >0.95
Answer Relevance | <0.70 | 0.70–0.85 | 0.85–0.95 | >0.95
Clinical RAG minimum bar: Faithfulness > 0.90
(Unfaithful clinical answers are safety risks)
Interview Answer
"RAG evaluation splits into retrieval and generation. Retrieval: Precision@k (fraction of retrieved docs that are relevant), Recall@k (fraction of relevant docs retrieved), MRR (rank of first relevant result), NDCG@k (discounted gain for graded relevance). Generation: faithfulness (fraction of answer claims supported by retrieved context — the most critical metric for clinical use), answer relevance (similarity of query to reverse-questions generated from the answer), and context precision/recall (from RAGAS). I prioritise faithfulness above all for clinical RAG — a faithful answer based on imperfect context is safe; an unfaithful answer is a safety risk regardless of retrieval quality."
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.