AI-Powered Call Quality Scoring for Contact Centers
Automatically score call quality using LLMs: build a scoring rubric, send transcripts to Claude or GPT-4, extract structured scores, store them in DynamoDB, and surface insights in an analytics dashboard.
Why Automate Quality Scoring?
Traditional call quality review: a QA supervisor listens to 2–5% of calls and scores them against a rubric. This is slow, inconsistent, and misses 95%+ of calls.
AI quality scoring: every call gets scored automatically within minutes of ending, at near-human accuracy for the criteria that matter most.
What you can score automatically:
- Did the agent greet the patient by name?
- Was the hold time explained?
- Were scheduling options clearly communicated?
- Was the call wrapped up professionally?
- Were there signs of patient frustration (sentiment)?
- Did the agent follow HIPAA protocol (never repeating DOB or SSN aloud)?
Architecture
S3: call recording uploaded
  ↓
DeepGram Lambda: transcript stored in DynamoDB
  ↓
DynamoDB Stream event
  ↓
Quality Scoring Lambda
  ↓
Claude/GPT-4 API: score against rubric
  ↓
DynamoDB: store scores
  ↓
API → Dashboard: surface to supervisors
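To make the stream hookup concrete, here is a minimal sketch in AWS CDK (Python). The names table and scoring_fn are assumptions standing in for your transcript table and scoring Lambda constructs; the rest of the stack is omitted.

from aws_cdk import aws_lambda as lambda_
from aws_cdk.aws_lambda_event_sources import DynamoEventSource

# Invoke the scoring Lambda for each batch of new transcript items.
# The table must be created with a stream (e.g. NEW_IMAGE) enabled.
scoring_fn.add_event_source(
    DynamoEventSource(
        table,
        starting_position=lambda_.StartingPosition.LATEST,
        batch_size=10,
        retry_attempts=2,
    )
)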
Defining a Scoring Rubric

Document your rubric as a structured schema; this becomes your prompt context:
# lib/scoring_rubric.py
RUBRIC = {
    "version": "1.2",
    "categories": [
        {
            "id": "greeting",
            "name": "Professional Greeting",
            "weight": 10,
            "criteria": [
                "Agent identified themselves and the practice name",
                "Agent greeted the caller warmly",
                "Agent asked how they could help"
            ]
        },
        {
            "id": "scheduling",
            "name": "Scheduling Effectiveness",
            "weight": 25,
            "criteria": [
                "Confirmed patient name and date of birth",
                "Offered at least two appointment options",
                "Confirmed appointment details before ending",
                "Provided pre-appointment instructions if needed"
            ]
        },
        {
            "id": "hold_protocol",
            "name": "Hold Protocol",
            "weight": 15,
            "criteria": [
                "Asked permission before placing on hold",
                "Explained reason for hold",
                "Thanked patient for holding when returning",
                "Hold time under 2 minutes"
            ]
        },
        {
            "id": "hipaa_compliance",
            "name": "HIPAA Compliance",
            "weight": 30,
            "criteria": [
                "Verified patient identity before discussing records",
                "Did not read SSN, full DOB, or insurance ID aloud unnecessarily",
                "Used proper verification questions"
            ]
        },
        {
            "id": "resolution",
            "name": "Call Resolution",
            "weight": 20,
            "criteria": [
                "Patient's request was fully addressed",
                "Next steps were clearly communicated",
                "Call closed professionally"
            ]
        }
    ]
}
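The weights sum to 100, so you can also recompute the overall score deterministically from the per-category scores rather than trusting the model's overall_score field. A small helper that could live alongside RUBRIC (a sketch, not part of the original module):

def weighted_overall(category_scores: dict, rubric: dict = RUBRIC) -> int:
    """Combine 0-100 category scores using rubric weights (weights sum to 100)."""
    total = sum(
        cat["weight"] * category_scores[cat["id"]]["score"]
        for cat in rubric["categories"]
    )
    return round(total / 100)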
Quality Scoring Lambda

# lambda/quality_scoring/handler.py
import json
import os
from datetime import datetime

import anthropic
import boto3

from lib.scoring_rubric import RUBRIC

dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table(os.environ["TABLE_NAME"])
secrets = boto3.client("secretsmanager")

_claude = None

def get_claude():
    # Lazily create the client so the secret is fetched once per container
    global _claude
    if _claude is None:
        key = json.loads(
            secrets.get_secret_value(SecretId="prod/anthropic/api-key")["SecretString"]
        )["api_key"]
        _claude = anthropic.Anthropic(api_key=key)
    return _claude

def build_scoring_prompt(transcript: str, utterances: list, rubric: dict) -> str:
    # Format the transcript with speaker labels; fall back to the raw
    # transcript when no diarized utterances are available
    if utterances:
        formatted = "\n".join(
            f"[{u['role'].upper()}]: {u['text']}" for u in utterances
        )
    else:
        formatted = transcript
    rubric_text = "\n".join(
        f"\n{cat['id'].upper()} ({cat['name']}, weight: {cat['weight']}pts):\n"
        + "\n".join(f"  - {c}" for c in cat["criteria"])
        for cat in rubric["categories"]
    )
    return f"""You are a call quality analyst for an optometry practice contact center.
Score this call transcript against the quality rubric below.
Return ONLY valid JSON matching the exact schema provided.

## RUBRIC
{rubric_text}

## TRANSCRIPT
{formatted}

## REQUIRED JSON SCHEMA
{{
  "overall_score": <0-100 integer>,
  "categories": {{
    "<category_id>": {{
      "score": <0-100 integer for this category>,
      "met": [<list of criteria that were clearly met>],
      "missed": [<list of criteria that were not met or unclear>],
      "notes": "<brief observation>"
    }}
  }},
  "highlights": ["<positive observation>", ...],
  "coaching_points": ["<actionable improvement>", ...],
  "hipaa_flags": ["<any HIPAA concern>"],
  "sentiment_summary": "<overall patient sentiment: positive/neutral/frustrated/angry>",
  "auto_fail": <true if any critical HIPAA violation, false otherwise>
}}"""
def score_call(call_id: str, clinic_id: str) -> dict:
    # Fetch the transcript item written by the DeepGram Lambda
    response = table.query(
        IndexName="GSI1",
        KeyConditionExpression="GSI1PK = :pk AND GSI1SK = :sk",
        ExpressionAttributeValues={
            ":pk": f"CALL#{call_id}",
            ":sk": "TRANSCRIPT",
        },
    )
    if not response["Items"]:
        raise ValueError(f"No transcript found for call {call_id}")
    transcript_item = response["Items"][0]
    prompt = build_scoring_prompt(
        transcript_item["full_transcript"],
        transcript_item.get("utterances", []),
        RUBRIC,
    )
    claude = get_claude()
    message = claude.messages.create(
        model="claude-opus-4-6",
        max_tokens=1500,
        messages=[{"role": "user", "content": prompt}],
    )
    raw = message.content[0].text.strip()
    # Strip markdown code fences if the model wrapped its JSON in them
    if raw.startswith("```"):
        raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
    scores = json.loads(raw)
    return scores
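# Optional hardening (a sketch, not in the original handler): LLM output can
# drift from the requested schema, so validate the parsed JSON before storing.
def validate_scores(scores: dict, rubric: dict = RUBRIC) -> dict:
    expected = {cat["id"] for cat in rubric["categories"]}
    returned = set(scores.get("categories", {}).keys())
    if returned != expected:
        raise ValueError(f"Category mismatch: expected {expected}, got {returned}")
    # Clamp scores into the 0-100 range the prompt asked for
    scores["overall_score"] = max(0, min(100, int(scores["overall_score"])))
    for data in scores["categories"].values():
        data["score"] = max(0, min(100, int(data["score"])))
    return scores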
def handler(event, context):
    """Triggered by the DynamoDB Stream when a transcript item is inserted."""
    for record in event["Records"]:
        if record["eventName"] != "INSERT":
            continue
        new_image = record["dynamodb"]["NewImage"]
        item_type = new_image.get("type", {}).get("S", "")
        if item_type != "TRANSCRIPT":
            continue
        call_id = new_image["call_id"]["S"]
        clinic_id = new_image["clinic_id"]["S"]
        try:
            scores = score_call(call_id, clinic_id)
            store_scores(call_id, clinic_id, scores)
            # Alert the compliance team on any auto-fail (HIPAA violation)
            if scores.get("auto_fail"):
                notify_compliance_team(call_id, clinic_id, scores["hipaa_flags"])
        except Exception as e:
            print(f"Scoring failed for call {call_id}: {e}")
            raise
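# notify_compliance_team is referenced above but not defined in this post.
# A minimal sketch using SNS, assuming a (hypothetical) COMPLIANCE_TOPIC_ARN
# environment variable points at your compliance team's topic:
sns = boto3.client("sns")

def notify_compliance_team(call_id: str, clinic_id: str, flags: list) -> None:
    """Publish HIPAA auto-fail flags to a compliance topic for human review."""
    sns.publish(
        TopicArn=os.environ["COMPLIANCE_TOPIC_ARN"],
        Subject=f"HIPAA auto-fail on call {call_id}",
        Message=json.dumps({
            "call_id": call_id,
            "clinic_id": clinic_id,
            "hipaa_flags": flags,
        }),
    )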
def store_scores(call_id: str, clinic_id: str, scores: dict) -> None:
    table.put_item(Item={
        "PK": f"CLINIC#{clinic_id}",
        "SK": f"SCORE#{datetime.utcnow().strftime('%Y-%m-%d')}#{call_id}",
        "GSI1PK": f"CALL#{call_id}",
        "GSI1SK": "SCORE",
        "type": "CALL_SCORE",
        "call_id": call_id,
        "clinic_id": clinic_id,
        "overall_score": scores["overall_score"],
        "categories": json.dumps(scores["categories"]),
        "highlights": scores.get("highlights", []),
        "coaching_points": scores.get("coaching_points", []),
        "hipaa_flags": scores.get("hipaa_flags", []),
        "sentiment_summary": scores.get("sentiment_summary", "neutral"),
        "auto_fail": scores.get("auto_fail", False),
        "scored_at": datetime.utcnow().isoformat(),
    })
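One schema note: get_agent_performance below queries an AgentScoreIndex keyed by agent_id and scored_date, but store_scores above writes neither attribute, so add both to the item (the agent id can ride along on the transcript record). A minimal sketch of that GSI in AWS CDK (Python), assuming table is your table construct:

from aws_cdk import aws_dynamodb as dynamodb

# Secondary index so scores can be queried per agent over a date range
table.add_global_secondary_index(
    index_name="AgentScoreIndex",
    partition_key=dynamodb.Attribute(name="agent_id", type=dynamodb.AttributeType.STRING),
    sort_key=dynamodb.Attribute(name="scored_date", type=dynamodb.AttributeType.STRING),
)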
Aggregating Scores for Dashboard

def get_agent_performance(agent_id: str, start_date: str, end_date: str) -> dict:
    """Get aggregated quality scores for an agent over a date range."""
    response = table.query(
        IndexName="AgentScoreIndex",
        KeyConditionExpression="agent_id = :agent AND scored_date BETWEEN :start AND :end",
        ExpressionAttributeValues={
            ":agent": agent_id,
            ":start": start_date,
            ":end": end_date,
        },
    )
    scores = response["Items"]
    if not scores:
        return {"agent_id": agent_id, "call_count": 0}
    overall_scores = [int(s["overall_score"]) for s in scores]
    # Aggregate per-category scores
    category_scores = {}
    for score in scores:
        cats = json.loads(score.get("categories", "{}"))
        for cat_id, data in cats.items():
            category_scores.setdefault(cat_id, []).append(data["score"])
    # Count how often each coaching point recurs across calls
    coaching_freq = {}
    for score in scores:
        for point in score.get("coaching_points", []):
            coaching_freq[point] = coaching_freq.get(point, 0) + 1
    top_coaching = sorted(coaching_freq.items(), key=lambda x: -x[1])[:5]
    return {
        "agent_id": agent_id,
        "call_count": len(scores),
        "avg_overall_score": round(sum(overall_scores) / len(overall_scores), 1),
        "min_score": min(overall_scores),
        "max_score": max(overall_scores),
        "auto_fail_count": sum(1 for s in scores if s.get("auto_fail")),
        "category_averages": {
            cat: round(sum(vals) / len(vals), 1)
            for cat, vals in category_scores.items()
        },
        "top_coaching_areas": [point for point, _ in top_coaching],
    }
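For example, a supervisor view of one agent's month (the id and dates here are illustrative) would call:

# Pull a month of scores for one agent and surface the headline numbers
perf = get_agent_performance("agent-42", "2025-01-01", "2025-01-31")
print(perf["avg_overall_score"], perf["top_coaching_areas"])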
React Dashboard Component

// src/components/QualityScoreCard.tsx
import { Badge } from "@/components/ui/badge";

interface ScoreCardProps {
  callId: string;
  overallScore: number;
  categories: Record<string, { score: number; notes: string }>;
  coachingPoints: string[];
  hipaaFlags: string[];
  autoFail: boolean;
  sentimentSummary: string;
}

export function QualityScoreCard({
  overallScore, categories, coachingPoints, hipaaFlags, autoFail, sentimentSummary
}: ScoreCardProps) {
  const scoreColor = overallScore >= 90 ? "text-green-500"
    : overallScore >= 75 ? "text-yellow-500"
    : "text-red-500";
  return (
    <div className="rounded-xl border border-border bg-card p-6 space-y-4">
      {autoFail && (
        <div className="flex items-center gap-2 p-3 bg-red-500/10 border border-red-500/30 rounded-lg">
          <span className="text-red-500 font-bold">⚠ Auto-Fail: HIPAA Concern</span>
        </div>
      )}
      <div className="flex items-center justify-between">
        <h3 className="text-lg font-semibold">Quality Score</h3>
        <span className={`text-4xl font-bold ${scoreColor}`}>{overallScore}</span>
      </div>
      <div className="grid grid-cols-2 gap-3">
        {Object.entries(categories).map(([id, data]) => (
          <div key={id} className="p-3 rounded-lg bg-muted/50">
            <div className="flex justify-between items-center mb-1">
              {/* Replace every underscore, not just the first */}
              <span className="text-xs font-medium capitalize">{id.replace(/_/g, " ")}</span>
              <span className={`text-sm font-bold ${
                data.score >= 90 ? "text-green-500"
                : data.score >= 70 ? "text-yellow-500"
                : "text-red-500"
              }`}>{data.score}</span>
            </div>
            <p className="text-xs text-muted-foreground">{data.notes}</p>
          </div>
        ))}
      </div>
      {coachingPoints.length > 0 && (
        <div>
          <h4 className="text-sm font-medium mb-2">Coaching Points</h4>
          <ul className="space-y-1">
            {coachingPoints.map((point, i) => (
              <li key={i} className="text-sm text-muted-foreground flex gap-2">
                <span className="text-amber-500">→</span>{point}
              </li>
            ))}
          </ul>
        </div>
      )}
      <div className="flex items-center gap-2">
        <span className="text-xs text-muted-foreground">Patient sentiment:</span>
        <Badge variant={sentimentSummary === "positive" ? "default" : "secondary"}>
          {sentimentSummary}
        </Badge>
      </div>
    </div>
  );
}
Cost Estimation

For 500 calls/day averaging 400 words of transcript:
- Claude Haiku (cheaper, still accurate for scoring): ~$0.0003 per call
- Claude Sonnet: ~$0.003 per call
- Claude Opus: ~$0.015 per call
Recommendation: use Sonnet for production scoring. At 500 calls/day: 500 × $0.003 = $1.50/day, or about $45/month. That's a fraction of what a human QA team costs.
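To sanity-check the arithmetic (the per-call figures are this section's estimates, not live API prices):

# Per-call cost estimates from the list above
PER_CALL = {"haiku": 0.0003, "sonnet": 0.003, "opus": 0.015}

def monthly_cost(calls_per_day: int, model: str, days: int = 30) -> float:
    return calls_per_day * PER_CALL[model] * days

print(monthly_cost(500, "sonnet"))  # 45.0 -> about $45/month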