Real-Time AI Transcription with DeepGram on AWS
Integrate DeepGram speech-to-text into a serverless AWS pipeline — real-time WebSocket streaming, batch transcription of S3 recordings, speaker diarization, custom vocabulary, and storing transcripts in DynamoDB.
Why DeepGram?
DeepGram is the leading AI speech-to-text API for contact centers and call analytics. Compared to AWS Transcribe:
| Feature | DeepGram | AWS Transcribe |
|---------|----------|----------------|
| Real-time WER | ~5–8% | ~10–15% |
| Latency (streaming) | ~300ms | ~1–2s |
| Custom vocabulary | Yes | Yes |
| Speaker diarization | Yes (2+ speakers) | Yes |
| Medical models | Yes | Yes (separate) |
| Price per minute | ~$0.0043 | ~$0.024 |
For a patient call center, accuracy and latency matter — DeepGram wins on both.
Architecture: Batch Transcription Pipeline
Amazon Connect recording
↓ saved to S3
S3 event → Lambda trigger
↓
Download audio from S3
↓
POST to DeepGram API
↓
Store transcript + metadata in DynamoDB
↓
Trigger quality scoring Lambda

Setting Up DeepGram
pip install deepgram-sdk

Store the API key in AWS Secrets Manager:
import boto3
import json
def get_deepgram_key() -> str:
    """Read the DeepGram API key out of AWS Secrets Manager.

    The secret value is a JSON document; only its "api_key" field is returned.
    """
    sm = boto3.client("secretsmanager")
    payload = sm.get_secret_value(SecretId="prod/deepgram/api-key")
    return json.loads(payload["SecretString"])["api_key"]

# Batch Transcription: S3 Recording → DynamoDB
# lambda/transcribe_call/handler.py
import json
import os
from datetime import datetime, timezone
from decimal import Decimal
from urllib.parse import unquote_plus

import boto3

from deepgram import DeepgramClient, PrerecordedOptions
# AWS clients are created at module scope so warm Lambda invocations reuse
# the same connections instead of re-initializing boto3 on every event.
s3 = boto3.client("s3")
dynamodb = boto3.resource("dynamodb")
# Target table name is injected via environment by the deployment stack.
table = dynamodb.Table(os.environ["TABLE_NAME"])
secrets = boto3.client("secretsmanager")
# Cached DeepGram client — created lazily on first use (see get_deepgram()).
_deepgram = None
def get_deepgram() -> DeepgramClient:
    """Return a process-wide DeepgramClient, creating it on first call.

    The API key is fetched from Secrets Manager exactly once per container;
    subsequent warm invocations reuse the cached client.
    """
    global _deepgram
    if _deepgram is not None:
        return _deepgram
    raw = secrets.get_secret_value(SecretId="prod/deepgram/api-key")["SecretString"]
    _deepgram = DeepgramClient(json.loads(raw)["api_key"])
    return _deepgram
def handler(event, context):
    """Triggered by an S3 event when a call recording is uploaded.

    Expects object keys shaped like
    ``recordings/<clinic_id>/<date>/<call_id>.wav``. Any failure is re-raised
    so Lambda's retry policy / DLQ picks the event up.
    """
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        # BUG FIX: S3 event notifications URL-encode the object key (spaces
        # arrive as '+', non-ASCII is percent-encoded). Decode it before
        # using it as a real S3 key, or downloads of such files 404.
        key = unquote_plus(record["s3"]["object"]["key"])
        # e.g. key = "recordings/CLN-001/2026-04-16/call-abc123.wav"
        parts = key.split("/")
        if len(parts) < 2:
            # Guard against stray objects (e.g. test uploads at bucket root)
            # instead of crashing on an IndexError.
            print(f"Skipping object with unexpected key layout: {key}")
            continue
        clinic_id = parts[1]
        call_id = parts[-1].replace(".wav", "").replace(".mp3", "")
        try:
            transcript = transcribe_recording(bucket, key)
            store_transcript(clinic_id, call_id, key, transcript)
            trigger_quality_scoring(clinic_id, call_id)
        except Exception as e:
            print(f"Transcription failed for {key}: {e}")
            raise  # re-raise to trigger DLQ
def transcribe_recording(bucket: str, key: str) -> dict:
    """Transcribe an S3 recording with DeepGram; return the raw response dict.

    A presigned URL is generated so DeepGram fetches the audio directly from
    S3 — the file never passes through the Lambda, keeping memory and runtime
    flat regardless of recording size.
    """
    url = s3.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": key},
        ExpiresIn=3600,  # DeepGram must fetch the file within the hour
    )
    dg = get_deepgram()
    options = PrerecordedOptions(
        model="nova-2",      # best accuracy model
        language="en-US",
        smart_format=True,   # proper punctuation and formatting
        diarize=True,        # separate agent vs patient speech
        punctuate=True,
        utterances=True,     # split into utterances with timestamps
        sentiment=True,      # sentiment per utterance
        topics=True,         # auto-detect conversation topics
        intents=True,        # detect caller intents
        # BUG FIX: PrerecordedOptions has no `custom_vocabulary` field —
        # DeepGram's term-boosting option is `keywords` (entries may be
        # "term" or "term:boost").
        keywords=[           # optometry-specific terms
            "refraction", "visual acuity", "ophthalmologist",
            "optometrist", "diopter", "astigmatism", "glaucoma",
            "macular degeneration", "retinal exam", "slit lamp",
        ],
    )
    response = dg.listen.prerecorded.v("1").transcribe_url(
        {"url": url}, options
    )
    return response.to_dict()
def _dynamo_safe(value):
    """Recursively convert floats to Decimal — boto3 rejects Python floats."""
    if isinstance(value, float):
        return Decimal(str(value))
    if isinstance(value, list):
        return [_dynamo_safe(v) for v in value]
    if isinstance(value, dict):
        return {k: _dynamo_safe(v) for k, v in value.items()}
    return value


def store_transcript(clinic_id: str, call_id: str, s3_key: str, result: dict) -> None:
    """Store structured transcript data in DynamoDB.

    `result` is the raw DeepGram prerecorded response as a dict. Raises
    KeyError/IndexError if the expected channels/alternatives structure is
    missing — deliberately, so the handler's DLQ path catches bad responses.
    """
    channel = result["results"]["channels"][0]
    alternative = channel["alternatives"][0]

    # Extract utterances with speaker labels
    utterances = []
    for utterance in result["results"].get("utterances", []):
        utterances.append({
            "speaker": utterance["speaker"],  # 0 = agent, 1 = patient (heuristic)
            "text": utterance["transcript"],
            # Numeric fields stored as strings — DynamoDB rejects floats.
            "start": str(utterance["start"]),
            "end": str(utterance["end"]),
            "confidence": str(utterance["confidence"]),
            "sentiment": utterance.get("sentiment", {}).get("sentiment", "neutral"),
        })

    # Overall metrics
    words = alternative.get("words", [])
    word_count = len(words)
    duration = result["metadata"]["duration"]

    # Timezone-aware "now": datetime.utcnow() is deprecated and naive;
    # created_at therefore carries an explicit +00:00 offset.
    now = datetime.now(timezone.utc)

    table.put_item(Item={
        "PK": f"CLINIC#{clinic_id}",
        "SK": f"TRANSCRIPT#{now.strftime('%Y-%m-%d')}#{call_id}",
        "GSI1PK": f"CALL#{call_id}",
        "GSI1SK": "TRANSCRIPT",
        "type": "TRANSCRIPT",
        "call_id": call_id,
        "clinic_id": clinic_id,
        "s3_key": s3_key,
        "full_transcript": alternative["transcript"],
        "utterances": utterances,
        "word_count": word_count,
        "duration_seconds": str(duration),
        "confidence": str(alternative["confidence"]),
        # BUG FIX: DeepGram topic/intent segments carry float confidence
        # scores; put_item raises "Float types are not supported" on them,
        # so sanitize the nested structures to Decimal first.
        "topics": _dynamo_safe(result["results"].get("topics", {}).get("segments", [])),
        "intents": _dynamo_safe(result["results"].get("intents", {}).get("segments", [])),
        "created_at": now.isoformat(),
    })

# Real-Time Streaming Transcription
For live call monitoring, use DeepGram's WebSocket streaming API. Because Lambda's 15-minute timeout rules out long-lived WebSocket connections, run this as a persistent service (EC2/ECS) fed from the call's audio stream:
# For real-time streaming — runs as a persistent process (e.g., EC2/ECS)
# Not Lambda (which has a 15-min timeout and doesn't support long-lived WebSockets)
import asyncio
from deepgram import DeepgramClient, LiveOptions, LiveTranscriptionEvents
async def stream_call(audio_stream, call_id: str, on_transcript):
    """Pipe live audio chunks to DeepGram and forward transcripts.

    audio_stream: async iterable yielding raw audio chunks (e.g. read from
        Kinesis Video Streams). Chunk encoding/sample-rate must match what
        the DeepGram connection expects — TODO confirm against the KVS reader.
    on_transcript: async callback invoked for every interim and final result.
    NOTE(review): DEEPGRAM_API_KEY is assumed to be defined at module scope.
    """
    dg = DeepgramClient(DEEPGRAM_API_KEY)
    connection = dg.listen.asynclive.v("1")

    # Handler signature: the SDK passes the connection itself as first arg.
    async def on_message(self, result, **kwargs):
        sentence = result.channel.alternatives[0].transcript
        if sentence:
            await on_transcript({
                "call_id": call_id,
                "transcript": sentence,
                "is_final": result.is_final,  # False for interim results
                # Speaker label of the first word; defaults to 0 when the
                # alternative carries no word-level data.
                "speaker": result.channel.alternatives[0].words[0].speaker if result.channel.alternatives[0].words else 0,
                "timestamp": result.start
            })

    # Register the handler BEFORE starting the connection so no early
    # transcript events are dropped.
    connection.on(LiveTranscriptionEvents.Transcript, on_message)
    options = LiveOptions(
        model="nova-2",
        language="en-US",
        smart_format=True,
        diarize=True,
        interim_results=True,
        utterance_end_ms=1000,
        vad_events=True,  # voice activity detection
        endpointing=300  # ms of silence = end of utterance
    )
    await connection.start(options)
    # Feed audio chunks from Amazon Connect media stream
    async for chunk in audio_stream:
        await connection.send(chunk)
    # Flush buffered audio and close the WebSocket cleanly.
    await connection.finish()

# Connecting Amazon Connect Media Streaming
Enable real-time audio streaming in your contact flow:
{
"Type": "StartMediaStreaming",
"Parameters": {
"MediaStreamingStartCondition": "BeforeTransfer",
"MediaStreams": [{
"Type": "AUDIO",
"Participants": ["CUSTOMER", "AGENT"]
}]
}
}

Amazon Connect streams audio to Kinesis Video Streams. Your processing service reads from KVS and pipes chunks to DeepGram.
Speaker Diarization in Practice
DeepGram labels speakers as 0, 1, 2, etc. For a two-party call:
- Speaker `0` is typically the first voice detected (usually the IVR/agent greeting)
- Speaker `1` is the patient
Post-process to map speaker numbers to roles:
def assign_speaker_roles(utterances: list, agent_phone: str) -> list:
    """
    Label each utterance with a "role" of "agent" or "patient".

    Heuristic: whoever speaks first is treated as the agent (the greeting
    side of the call). A production system would resolve roles from the
    Connect ANI/DNIS metadata instead.
    """
    if not utterances:
        return utterances
    agent_id = utterances[0]["speaker"]  # first voice heard = agent
    labeled = []
    for u in utterances:
        role = "agent" if u["speaker"] == agent_id else "patient"
        labeled.append({**u, "role": role})
    return labeled

# Extracting Key Information
Use the transcript to extract structured data:
def extract_call_summary(transcript: str, intents: list) -> dict:
    """
    Extract structured boolean flags from a call transcript.

    Combines simple keyword matching over the transcript text with
    DeepGram's intent-detection segments. In production, pass this to an
    LLM for better extraction.

    Returns a dict that always contains the same keys (stable schema).
    """
    summary = {
        "appointment_requested": False,
        "insurance_mentioned": False,
        "callback_requested": False,
        "complaint": False,
        # BUG FIX: previously only added when triggered, which made the
        # result schema vary between calls — initialize it up front.
        "cancellation_intent": False,
    }
    lower = transcript.lower()
    if any(w in lower for w in ["appointment", "schedule", "book", "available"]):
        summary["appointment_requested"] = True
    if any(w in lower for w in ["insurance", "coverage", "deductible", "copay"]):
        summary["insurance_mentioned"] = True
    # BUG FIX: these two flags were declared but never set in the original.
    if any(w in lower for w in ["call me back", "call back", "callback"]):
        summary["callback_requested"] = True
    if any(w in lower for w in ["complaint", "complain", "unhappy", "frustrated"]):
        summary["complaint"] = True
    # Use DeepGram's intent detection
    for segment in intents:
        for intent in segment.get("intents", []):
            if intent["intent"] == "cancel":
                summary["cancellation_intent"] = True
    return summary

# Cost Optimization
DeepGram charges per minute of audio. For a call center handling 500 calls/day averaging 4 minutes:
500 × 4 × $0.0043 = $8.60/day = ~$258/month
Compare to AWS Transcribe: 500 × 4 × $0.024 = $48/day = ~$1,440/month
Tip: Store transcripts in DynamoDB and S3. Never re-transcribe a recording — check for an existing transcript first.
WebSocket & Real-Time Knowledge Check
5 questions · Test what you just learned · Instant explanations
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.