LLM Evaluation Q&A · Lesson 15 of 16
A/B Testing Prompts and Model Versions
Why A/B Test LLM Applications?
Offline evaluation (golden datasets, benchmarks) tells you a change should improve quality. A/B testing in production tells you whether it actually does with real users.
LLM A/B tests are subtle: users may prefer a response that's longer (verbosity effect), not necessarily better. You need objective metrics alongside user preference signals.
What to A/B Test
| Change | Test |
|---|---|
| System prompt rewrite | Response quality score, user satisfaction |
| Model upgrade (gpt-4o-mini → gpt-4o) | Quality improvement vs cost increase |
| RAG retrieval parameter change | RAGAS metrics on production queries |
| Chunk size change | Context recall, answer relevancy |
| Temperature change | Response variety vs consistency |
Traffic Splitting
Route a percentage of requests to the new variant:
import hashlib
import random
def assign_variant(user_id: str, experiment_name: str, split_pct: float = 0.5) -> str:
    """Pick ``"treatment"`` or ``"control"`` for a user in one experiment.

    The result is a pure function of ``experiment_name`` and ``user_id``, so
    repeated calls for the same pair always agree.

    Args:
        user_id: Identifier of the user being bucketed.
        experiment_name: Experiment label; it is part of the hashed key.
        split_pct: Fraction of the bucket range routed to treatment.

    Returns:
        ``"treatment"`` when the user's bucket is below ``split_pct``,
        otherwise ``"control"``.
    """
    key = f"{experiment_name}:{user_id}".encode()
    digest = hashlib.md5(key).digest()
    # Reduce the 128-bit digest to one of 1000 buckets: 0.000, 0.001, ..., 0.999.
    bucket = (int.from_bytes(digest, "big") % 1000) / 1000.0
    if bucket < split_pct:
        return "treatment"
    return "control"
# Usage in request handler
def handle_request(user_id: str, query: str) -> dict:
    """Answer ``query`` under the prompt experiment and log the exposure.

    The user is bucketed with ``assign_variant`` using ``split_pct=0.10``.
    Treatment users are served with ``SYSTEM_PROMPT_V2``; all others with
    ``SYSTEM_PROMPT_V1``.

    Args:
        user_id: Identifier used for variant assignment and logging.
        query: Text passed to ``run_llm``.

    Returns:
        A dict with the model ``"response"`` and the assigned ``"variant"``.
    """
    experiment = "prompt_v2_experiment"
    variant = assign_variant(user_id, experiment, split_pct=0.10)

    # New prompt for treatment, existing prompt for control.
    system_prompt = SYSTEM_PROMPT_V2 if variant == "treatment" else SYSTEM_PROMPT_V1
    response = run_llm(query, system_prompt)

    # Log for analysis; the query is recorded as an MD5 digest, not raw text.
    exposure = {
        "user_id": user_id,
        "experiment": experiment,
        "variant": variant,
        "query_hash": hashlib.md5(query.encode()).hexdigest(),
        "response_length": len(response),
    }
    log_event(exposure)
    return {"response": response, "variant": variant}


# The hash-based assignment ensures:
- Each user consistently gets the same variant (no switching mid-conversation)
- Traffic splits correctly across many users
Metrics to Collect
Primary metric: The goal of the experiment (quality score, task completion rate)
Guardrail metrics: Things that must not get worse (latency, cost, safety)
import time
from dataclasses import dataclass
@dataclass
class ExperimentEvent:
user_id: str
session_id: str
variant: str
experiment: str
timestamp: float
response_latency_ms: float
response_length: int
cost_usd: float
quality_score: float | None # From LLM judge (async)
thumbs_up: bool | None # From user feedback
def log_experiment_event(event: "ExperimentEvent", storage) -> None:
    """Append a flat dict record of ``event`` to ``storage``.

    Args:
        event: The event to record; every field is copied into the record.
        storage: Any object with an ``append`` method (e.g. a list).

    The record includes ``session_id`` so results can be grouped per session
    as well as per user. ``quality`` and ``thumbs_up`` are stored as-is and
    may be ``None``.
    """
    storage.append({
        "user_id": event.user_id,
        "session_id": event.session_id,
        "variant": event.variant,
        "experiment": event.experiment,
        "latency_ms": event.response_latency_ms,
        "length": event.response_length,
        "cost": event.cost_usd,
        "quality": event.quality_score,
        "thumbs_up": event.thumbs_up,
        "ts": event.timestamp,
    })


# Statistical Analysis
Determine if the difference between variants is statistically significant:
from scipy import stats
import numpy as np
def analyze_experiment(
    control_scores: list[float],
    treatment_scores: list[float],
    alpha: float = 0.05,
) -> dict:
    """Two-sided t-test for quality score difference.

    Runs ``scipy.stats.ttest_ind`` with its default settings (pooled-variance
    Student's t-test) and summarises the comparison.

    Args:
        control_scores: Per-sample scores for the control variant.
        treatment_scores: Per-sample scores for the treatment variant.
        alpha: Significance threshold applied to the p-value.

    Returns:
        A dict of plain Python values with keys ``control_n``,
        ``treatment_n``, ``control_mean``, ``treatment_mean``,
        ``relative_change_pct``, ``p_value``, ``significant``, ``cohens_d``
        and ``recommendation`` (``"deploy"`` or ``"reject"``).
        ``relative_change_pct`` is NaN when the control mean is exactly 0.
    """
    control = np.asarray(control_scores, dtype=float)
    treatment = np.asarray(treatment_scores, dtype=float)

    _, p_value = stats.ttest_ind(treatment, control)
    control_mean = float(np.mean(control))
    treatment_mean = float(np.mean(treatment))
    mean_diff = treatment_mean - control_mean

    # Divide by the magnitude of the baseline so the sign of the relative
    # change always matches the direction of the raw difference, even when
    # the baseline is negative. A zero baseline has no defined relative change.
    if control_mean != 0:
        relative_change = mean_diff / abs(control_mean)
    else:
        relative_change = float("nan")

    # Effect size (Cohen's d) with the pooled *sample* standard deviation:
    # unbiased variances (ddof=1), weighted by each group's degrees of freedom.
    n_control, n_treatment = control.size, treatment.size
    if n_control > 1 and n_treatment > 1:
        pooled_var = (
            (n_control - 1) * np.var(control, ddof=1)
            + (n_treatment - 1) * np.var(treatment, ddof=1)
        ) / (n_control + n_treatment - 2)
        pooled_std = float(np.sqrt(pooled_var))
    else:
        pooled_std = 0.0
    cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0.0

    # bool()/float() strip NumPy scalar types so the result is JSON-friendly.
    significant = bool(p_value < alpha)
    return {
        "control_n": len(control_scores),
        "treatment_n": len(treatment_scores),
        "control_mean": control_mean,
        "treatment_mean": treatment_mean,
        "relative_change_pct": relative_change * 100,
        "p_value": float(p_value),
        "significant": significant,
        "cohens_d": cohens_d,
        # Decide on the raw difference so a zero or negative baseline cannot
        # flip the outcome.
        "recommendation": "deploy" if (significant and mean_diff > 0) else "reject",
    }
# Example: ten quality scores per variant, compared with analyze_experiment.
control_quality = [3.8, 4.1, 3.9, 4.0, 3.7, 4.2, 3.8, 4.0, 3.9, 4.1]
treatment_quality = [4.2, 4.4, 4.1, 4.3, 4.0, 4.5, 4.2, 4.4, 4.3, 4.1]
result = analyze_experiment(control_quality, treatment_quality)

# One report line per field.
print(
    f"Relative change: {result['relative_change_pct']:+.1f}%",
    f"p-value: {result['p_value']:.4f}",
    f"Significant: {result['significant']}",
    f"Recommendation: {result['recommendation']}",
    sep="\n",
)


# Sample Size Planning
Decide the sample size up front and run the experiment until you reach it, rather than stopping as soon as a result looks significant. Too short = noisy results; too long = unnecessarily exposing users to a potentially worse experience.
from scipy.stats import norm
import math
def required_sample_size(
    baseline_mean: float,
    baseline_std: float,
    min_detectable_effect: float,  # Absolute change you want to detect
    alpha: float = 0.05,
    power: float = 0.80,
) -> int:
    """Compute required n per variant.

    Uses the normal-approximation formula for a two-sided comparison of two
    means: ``n = 2 * ((z_alpha + z_beta) * std / effect) ** 2``, rounded up.

    Args:
        baseline_mean: Accepted for interface compatibility; the formula
            does not use it.
        baseline_std: Standard deviation of the metric.
        min_detectable_effect: Absolute difference in means to detect. Only
            its magnitude matters because it is squared.
        alpha: Two-sided significance level, strictly between 0 and 1.
        power: Desired power, strictly between 0 and 1.

    Returns:
        The required number of samples in each variant.

    Raises:
        ValueError: If ``min_detectable_effect`` is 0, or ``alpha`` or
            ``power`` is not strictly between 0 and 1.
    """
    # Fail early with a clear message; otherwise these inputs surface as an
    # OverflowError or NaN-conversion error from math.ceil below.
    if min_detectable_effect == 0:
        raise ValueError("min_detectable_effect must be non-zero")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    if not 0 < power < 1:
        raise ValueError("power must be strictly between 0 and 1")

    z_alpha = norm.ppf(1 - alpha / 2)  # Two-sided
    z_beta = norm.ppf(power)
    n = 2 * ((z_alpha + z_beta) * baseline_std / min_detectable_effect) ** 2
    return math.ceil(n)
# Example: detect a 0.2 point improvement in 1-5 quality score
n = required_sample_size(
    baseline_mean=3.8,
    baseline_std=0.3,
    min_detectable_effect=0.2,
    alpha=0.05,
    power=0.80,
)
# 2 * ((1.960 + 0.842) * 0.3 / 0.2) ** 2 = 35.3, rounded up to 36.
print(f"Required n per variant: {n}") # 36 for this example; need 72 total
# Guardrail Checks
Before declaring a winner, verify guardrail metrics haven't degraded:
def check_guardrails(
    control_data: list[dict],
    treatment_data: list[dict],
    guardrails: dict,
) -> list[str]:
    """Return list of violated guardrails.

    For each metric in ``guardrails``, compares the treatment mean with the
    control mean. A metric is violated when the treatment mean is more than
    ``max_regression_pct`` percent (default 5) above the control mean. Only
    increases are flagged, so this suits metrics where higher is worse.

    Args:
        control_data: Records for the control variant; each must contain
            every guardrail metric as a key.
        treatment_data: Records for the treatment variant, same shape.
        guardrails: Maps metric name to a config dict that may contain
            ``"max_regression_pct"``.

    Returns:
        One human-readable message per violated metric; empty if none.

    Raises:
        ValueError: If a metric is checked while either variant has no data.
        KeyError: If a record lacks a guardrail metric.
    """
    violations = []
    for metric, config in guardrails.items():
        control_values = [d[metric] for d in control_data]
        treatment_values = [d[metric] for d in treatment_data]
        # A mean over zero records is undefined; say so explicitly instead
        # of failing with a bare ZeroDivisionError.
        if not control_values or not treatment_values:
            raise ValueError(
                f"cannot check guardrail {metric!r}: "
                "control and treatment data must both be non-empty"
            )
        control_mean = sum(control_values) / len(control_values)
        treatment_mean = sum(treatment_values) / len(treatment_values)
        max_regression = config.get("max_regression_pct", 5) / 100
        if treatment_mean > control_mean * (1 + max_regression):
            violations.append(
                f"{metric}: treatment={treatment_mean:.3f} exceeded control={control_mean:.3f} "
                f"by more than {max_regression*100:.0f}%"
            )
    return violations
# Per-metric guardrail thresholds: the maximum allowed increase of the
# treatment mean over the control mean, in percent.
guardrails = {
    "latency_ms": {"max_regression_pct": 20}, # Allow up to 20% latency increase
    "cost_usd": {"max_regression_pct": 10}, # Allow up to 10% cost increase
}
# NOTE(review): control_logs and treatment_logs are not defined in this
# snippet; presumably lists of per-request dicts keyed by the metric names
# above. Confirm against the logging code.
violations = check_guardrails(control_logs, treatment_logs, guardrails)
if violations:
    print("Guardrail violations — do not deploy:")
    for v in violations:
        print(f" {v}")
# A treatment that improves quality but doubles cost may still be worth deploying — but that's a business decision, not an automatic deploy. Guardrails make the cost explicit and require deliberate sign-off.
Decision Framework
When the experiment concludes:
| Primary metric | Guardrails | Recommendation |
|---|---|---|
| Significant improvement | All pass | Deploy |
| Significant improvement | Some fail | Business decision (explicit trade-off) |
| No significant difference | All pass | Deploy if other benefits (cheaper/faster) |
| Significant degradation | Any | Reject |
Always document the experiment results, sample sizes, and decision rationale. This becomes institutional knowledge for future decisions.