Evaluating Agentic AI Systems
Why agent evaluation is hard, and how to do it anyway. Task completion rate, step efficiency, trajectory evaluation, and human review sampling with Python examples.
Why Agent Evaluation Is Harder Than LLM Evaluation
Evaluating a single LLM response is hard. Evaluating an agent is harder because:
- Multi-step: the agent takes N actions to produce one output. Which step was wrong?
- Non-deterministic: the same goal produces different tool call sequences each run
- No single ground truth: there are many valid paths to the correct answer
- Side effects: the agent may take actions (API calls, writes) that can't be undone
You cannot just compare the output to a golden string. You need multiple evaluation lenses.
Evaluation Dimensions
| Dimension | Question | How to Measure |
|---|---|---|
| Task completion | Did the agent answer correctly? | Human rating or LLM judge |
| Step efficiency | Did it take too many steps? | Steps used vs optimal steps |
| Tool accuracy | Did it use the right tools? | Tool call log analysis |
| Faithfulness | Is the answer grounded in tool results? | NLI check vs retrieved data |
| Safety | Did it refuse harmful requests? | Red-team test set |
1. Task Completion Rate
The most important metric. Did the agent successfully complete the task?
# evaluation/task_completion.py
from openai import AsyncAzureOpenAI
from pydantic import BaseModel
class TaskEvalResult(BaseModel):
    """Judge verdict for one evaluated task, together with the inputs it was based on."""

    # Identifier of the task that was evaluated.
    task_id: str
    # The goal text the agent was asked to accomplish.
    goal: str
    # What the agent actually answered.
    agent_answer: str
    # What a correct response should cover.
    reference_answer: str
    # Whether the task counts as completed.
    completed: bool
    # Judge rating on a 0.0-1.0 scale.
    score: float
    # Short justification for the rating.
    reason: str
async def evaluate_task_completion(
    task_id: str,
    goal: str,
    agent_answer: str,
    reference_answer: str,
    client: AsyncAzureOpenAI,
    model: str = "gpt-4o",
) -> TaskEvalResult:
    """LLM-as-judge evaluation of task completion.

    Sends the goal, the reference answer and the agent's answer to a judge
    model and converts its JSON verdict into a ``TaskEvalResult``.

    Args:
        task_id: Identifier copied into the result.
        goal: The task the agent was asked to perform.
        agent_answer: The agent's final answer.
        reference_answer: What a correct response should cover.
        client: Async client used to call the judge model.
        model: Model (deployment) name passed to the chat completion call.

    Returns:
        A ``TaskEvalResult`` whose ``score`` is the judge's 0-10 rating scaled
        to 0.0-1.0 and whose ``completed`` flag is true for ratings of 7+.

    Raises:
        ValueError: If the judge reply is not valid JSON or the score is not
            numeric.
        KeyError: If the judge reply has no ``score`` field.
    """
    # Local import keeps this snippet self-contained.
    import json

    judge_prompt = f"""You are evaluating an AI agent's answer.
Task: {goal}
Reference answer (what a correct response should cover):
{reference_answer}
Agent's answer:
{agent_answer}
Rate the agent's answer on a scale of 0-10:
- 10: Fully correct, complete, covers all key points
- 7-9: Mostly correct with minor gaps
- 4-6: Partially correct, missing key information
- 1-3: Mostly wrong but contains some relevant information
- 0: Completely wrong or refused to answer
Respond as JSON: {{"score": <0-10>, "completed": <true/false>, "reason": "<one sentence>"}}"""
    response = await client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": judge_prompt}],
        response_format={"type": "json_object"},
        temperature=0,
    )
    raw = response.choices[0].message.content
    # Parse as JSON, never eval(): the reply is model-generated text, and JSON
    # literals such as true/false/null are not valid Python expressions.
    try:
        data = json.loads(raw or "")
    except json.JSONDecodeError as err:
        raise ValueError(f"Judge returned non-JSON output: {raw!r}") from err

    # Coerce and clamp so a stray "8" (string) or 11 cannot break the 0.0-1.0 contract.
    rating = max(0.0, min(10.0, float(data["score"])))
    return TaskEvalResult(
        task_id=task_id,
        goal=goal,
        agent_answer=agent_answer,
        reference_answer=reference_answer,
        # Derived from the rating rather than the judge's own "completed"
        # field, so the pass bar is defined in exactly one place.
        completed=rating >= 7,
        score=rating / 10.0,
        reason=str(data.get("reason", "")),
    )

# 2. Step Efficiency
An agent that takes 8 steps to answer a question that should take 2 is inefficient and expensive.
# evaluation/step_efficiency.py
from dataclasses import dataclass
@dataclass
class StepMetrics:
    """Step-count statistics recorded for a single agent run."""

    task_id: str
    # Number of steps the agent took.
    steps_used: int
    # Number of steps the task is expected to need.
    optimal_steps: int
    # Ratio in 0.0-1.0; higher is better.
    efficiency: float
    # Tool names for the run.
    tool_calls: list[str]
def compute_step_efficiency(
    steps_used: int,
    optimal_steps: int,
) -> float:
    """Return optimal/actual step ratio, capped at 1.0.

    A run that used no steps scores 0.0; a run that used no more than the
    optimal number of steps scores 1.0; anything longer scores below 1.0.
    """
    if not steps_used:
        # No steps taken: avoid dividing by zero and report the worst score.
        return 0.0
    ratio = optimal_steps / steps_used
    return ratio if ratio < 1.0 else 1.0
# Golden dataset: each task has an expected step count.
# Rows are (task_id, goal, optimal_steps).
_GOLDEN_ROWS = (
    # Simple lookup: one tool call
    ("drug-info-ibuprofen", "What is ibuprofen used for?", 1),
    # Lookup both drugs, then check interaction
    (
        "interaction-warfarin-ibuprofen",
        "Check interaction between warfarin and ibuprofen",
        2,
    ),
    # Lookup drug, find pediatric section, compute by weight
    (
        "dosage-pediatric-amoxicillin",
        "What is the pediatric dosage for amoxicillin by weight?",
        3,
    ),
)
GOLDEN_TASKS = [
    {"task_id": task_id, "goal": goal, "optimal_steps": optimal_steps}
    for task_id, goal, optimal_steps in _GOLDEN_ROWS
]
def analyze_efficiency(run_log: dict, golden: dict) -> StepMetrics:
    """Build the StepMetrics for one run by comparing it to its golden task.

    The number of entries in ``run_log["tool_calls"]`` is the step count; it is
    scored against ``golden["optimal_steps"]``.
    """
    task_id = golden["task_id"]
    calls = run_log["tool_calls"]
    step_count = len(calls)
    target = golden["optimal_steps"]
    score = compute_step_efficiency(step_count, target)
    tool_names = [call["tool_name"] for call in calls]
    return StepMetrics(
        task_id=task_id,
        steps_used=step_count,
        optimal_steps=target,
        efficiency=score,
        tool_calls=tool_names,
    )

# 3. Golden Trajectory Testing
Compare the agent's actual steps to a reference trajectory:
# evaluation/trajectory.py
REFERENCE_TRAJECTORIES = {
    "interaction-warfarin-ibuprofen": {
        "expected_tools": ["search_drug_database", "check_interaction"],
        "expected_tool_order": True,  # Must be in this order
        "required_content": ["bleeding risk", "NSAID", "anticoagulant"],
    }
}


def _appears_in_order(expected_tools: list[str], actual_tool_calls: list[str]) -> bool:
    """Return True if expected_tools is an ordered subsequence of actual_tool_calls."""
    remaining = iter(actual_tool_calls)
    # `tool in remaining` consumes the iterator up to and including the match,
    # so each expected tool has to be found after the previous one.
    return all(tool in remaining for tool in expected_tools)


def evaluate_trajectory(
    task_id: str,
    actual_tool_calls: list[str],
    final_answer: str,
) -> dict:
    """Compare a run's tool calls and final answer to its reference trajectory.

    Args:
        task_id: Key into ``REFERENCE_TRAJECTORIES``.
        actual_tool_calls: Tool names in the order the agent called them.
        final_answer: The agent's final answer text.

    Returns:
        ``{"evaluated": False}`` when no reference exists for ``task_id``.
        Otherwise a dict with ``task_id``, ``missing_tools``, ``extra_tools``
        (both sorted), ``missing_content``, ``order_ok`` and ``passed``.
        ``passed`` requires that no expected tool and no required phrase is
        missing and, when the reference sets ``expected_tool_order``, that the
        expected tools were called in that order. Extra tools do not fail the
        check.
    """
    if task_id not in REFERENCE_TRAJECTORIES:
        return {"evaluated": False}
    ref = REFERENCE_TRAJECTORIES[task_id]

    # Check all expected tools were called
    expected = set(ref["expected_tools"])
    actual_set = set(actual_tool_calls)
    # Sorted so the report is stable from run to run (set order is not).
    missing_tools = sorted(expected - actual_set)
    extra_tools = sorted(actual_set - expected)

    # Enforce call order only when the reference asks for it. Other tool calls
    # may be interleaved between the expected ones.
    order_ok = True
    if ref.get("expected_tool_order", False):
        order_ok = _appears_in_order(ref["expected_tools"], actual_tool_calls)

    # Check required content in final answer (case-insensitive substring match)
    answer_lower = final_answer.lower()
    missing_content = [
        phrase for phrase in ref["required_content"]
        if phrase.lower() not in answer_lower
    ]
    return {
        "task_id": task_id,
        "missing_tools": missing_tools,
        "extra_tools": extra_tools,
        "missing_content": missing_content,
        "order_ok": order_ok,
        "passed": not missing_tools and not missing_content and order_ok,
    }

# 4. Human Review Sampling
Automated metrics catch regressions but miss nuanced quality issues. Sample 5% of production runs for human review, and automatically flag runs that show a known warning sign:
# evaluation/sampling.py
import random
import structlog
# Module-level structlog logger; flag_for_review writes its events through it.
log = structlog.get_logger()
def should_sample_for_review(run_id: str, rate: float = 0.05) -> bool:
    """Randomly decide whether a run goes to human review.

    Each call draws a fresh random number, so a run is selected with
    probability ``rate`` (5% by default). ``run_id`` does not influence the
    draw.
    """
    draw = random.random()
    return rate > draw
def flag_for_review(run_id: str, reason: str, priority: str = "normal"):
    """Record that a run should be looked at by a human reviewer.

    Emits a ``flagged_for_review`` log event carrying the run id, the reason
    and the priority.
    """
    event_fields = {
        "run_id": run_id,
        "reason": reason,
        "priority": priority,
    }
    log.info("flagged_for_review", **event_fields)
    # In production: add to review queue (DB table, Slack message, etc.)
# Besides random sampling, some runs should always be flagged:
# - Agent hit max iterations
# - Output safety check triggered
# - Step efficiency below 0.3
# - User rated the response poorly (thumbs down) -- not checked by the function below
async def post_run_review_check(run_result: dict):
    """Flag a finished run for human review for every rule it triggers.

    Rules are evaluated in order and are independent, so one run can be
    flagged more than once. The random sample is drawn last.
    """
    # (predicate, reason, priority), evaluated top to bottom.
    rules = (
        (lambda r: r.get("hit_max_iterations"), "hit_max_iterations", "high"),
        (lambda r: r.get("output_blocked"), "output_blocked", "high"),
        # A missing step_efficiency is treated as 1.0, i.e. never flagged.
        (lambda r: r.get("step_efficiency", 1.0) < 0.3, "low_efficiency", "normal"),
        (lambda r: should_sample_for_review(r["run_id"]), "random_sample", "low"),
    )
    for triggered, reason, priority in rules:
        if triggered(run_result):
            flag_for_review(run_result["run_id"], reason, priority=priority)

# 5. Running the Evaluation Suite in CI
# evaluation/run_eval.py
import asyncio
from evaluation.task_completion import evaluate_task_completion
from evaluation.trajectory import evaluate_trajectory
EVAL_TASKS = [
    # (task_id, goal, reference_answer)
    # task_id must match the key in REFERENCE_TRAJECTORIES for the trajectory
    # check to run; evaluate_trajectory reports unknown ids as not evaluated.
    ("drug-info-ibuprofen", "What is ibuprofen?", "NSAID, pain reliever, COX inhibitor"),
    ("interaction-warfarin-ibuprofen", "Is it safe to combine warfarin and ibuprofen?", "Major interaction, increased bleeding risk, consult pharmacist"),
    # ... 50 more tasks
]
async def run_evaluation_suite(agent, client, min_completion_rate: float = 0.80):
    """Run every task in EVAL_TASKS through the agent and score the results.

    Args:
        agent: Object exposing ``await run_with_logging(goal)``, which returns
            a run log with ``"answer"`` and ``"tool_calls"`` entries.
        client: Client forwarded to ``evaluate_task_completion`` for the judge.
        min_completion_rate: Lowest acceptable share of completed tasks.

    Returns:
        One dict per task with ``task_id``, ``completed``, ``score`` and
        ``trajectory_passed``.

    Raises:
        ValueError: If EVAL_TASKS is empty.
        AssertionError: If the completion rate is below ``min_completion_rate``.
    """
    if not EVAL_TASKS:
        # Fail loudly instead of dividing by zero when computing the averages.
        raise ValueError("EVAL_TASKS is empty; nothing to evaluate")

    results = []
    for task_id, goal, reference in EVAL_TASKS:
        # Run the agent
        run_log = await agent.run_with_logging(goal)

        # Evaluate task completion
        completion = await evaluate_task_completion(
            task_id, goal, run_log["answer"], reference, client
        )

        # Evaluate trajectory. evaluate_trajectory works on tool *names*, while
        # the run log may hold one dict per call, so reduce each entry to its name.
        tool_names = [
            call if isinstance(call, str) else call["tool_name"]
            for call in run_log["tool_calls"]
        ]
        trajectory = evaluate_trajectory(task_id, tool_names, run_log["answer"])

        results.append({
            "task_id": task_id,
            "completed": completion.completed,
            "score": completion.score,
            # Tasks without a reference trajectory count as passed.
            "trajectory_passed": trajectory.get("passed", True),
        })

    # Compute aggregate metrics
    total = len(results)
    completion_rate = sum(r["completed"] for r in results) / total
    avg_score = sum(r["score"] for r in results) / total
    print(f"Task completion rate: {completion_rate:.1%}")
    print(f"Average score: {avg_score:.2f}")

    # Fail CI if quality dropped
    if completion_rate < min_completion_rate:
        raise AssertionError(
            f"Completion rate {completion_rate:.1%} below {min_completion_rate:.0%} threshold"
        )
    return results
if __name__ == "__main__":
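asyncio.run(run_evaluation_suite(agent=..., client=...))
Run this in CI on every PR that changes the agent's tools, prompts, or logic. The script above fails the build when the completion rate falls below an absolute 80% floor. For a stricter gate, also set a relative threshold: if task completion drops more than 5 percentage points from the baseline, block the merge.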
Evaluation Dashboard Metrics
Track these week-over-week:
- Task completion rate: target above 85%
- Average step efficiency: target above 0.6
- Human review pass rate: target above 90%
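- Safety block rate: target under 0.5%, but not zero (too high = the agent is producing unsafe output too often or the guardrails are over-triggering; near zero = the guardrails may not be triggering at all)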
- Mean steps per task: watch for upward trend (agent becoming less efficient)
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.