Real-Time AI Transcription with DeepGram on AWS
Integrate DeepGram speech-to-text into a serverless AWS pipeline — real-time WebSocket streaming, batch transcription of S3 recordings, speaker diarization, custom vocabulary, and storing transcripts in DynamoDB.
Why DeepGram?
DeepGram is the leading AI speech-to-text API for contact centers and call analytics. Compared to AWS Transcribe:
| Feature | DeepGram | AWS Transcribe |
|---------|----------|----------------|
| Real-time WER | ~5–8% | ~10–15% |
| Latency (streaming) | ~300ms | ~1–2s |
| Custom vocabulary | Yes | Yes |
| Speaker diarization | Yes (2+ speakers) | Yes |
| Medical models | Yes | Yes (separate) |
| Price per minute | ~$0.0043 | ~$0.024 |
For a patient call center, accuracy and latency matter — DeepGram wins on both.
Architecture: Batch Transcription Pipeline
Amazon Connect recording
↓ saved to S3
S3 event → Lambda trigger
↓
Download audio from S3
↓
POST to DeepGram API
↓
Store transcript + metadata in DynamoDB
↓
Trigger quality scoring Lambda

Setting Up DeepGram
pip install deepgram-sdk

Store the API key in AWS Secrets Manager:
import boto3
import json
def get_deepgram_key() -> str:
    """Read the DeepGram API key out of AWS Secrets Manager.

    The secret value is a JSON document; only its "api_key" field is returned.
    """
    sm = boto3.client("secretsmanager")
    payload = sm.get_secret_value(SecretId="prod/deepgram/api-key")
    return json.loads(payload["SecretString"])["api_key"]

# Batch Transcription: S3 Recording → DynamoDB
# lambda/transcribe_call/handler.py
import json
import os
from datetime import datetime, timezone
from decimal import Decimal
from urllib.parse import unquote_plus

import boto3

from deepgram import DeepgramClient, PrerecordedOptions
# AWS clients are created at module scope so warm Lambda invocations reuse
# the same connections instead of re-initializing boto3 on every event.
s3 = boto3.client("s3")
dynamodb = boto3.resource("dynamodb")
# Target table name is injected via environment by the deployment stack.
table = dynamodb.Table(os.environ["TABLE_NAME"])
secrets = boto3.client("secretsmanager")
# Cached DeepGram client — created lazily on first use (see get_deepgram()).
_deepgram = None
def get_deepgram() -> DeepgramClient:
    """Return a process-wide DeepgramClient, creating it on first call.

    The API key is fetched from Secrets Manager exactly once per container;
    subsequent warm invocations reuse the cached client.
    """
    global _deepgram
    if _deepgram is not None:
        return _deepgram
    raw = secrets.get_secret_value(SecretId="prod/deepgram/api-key")["SecretString"]
    _deepgram = DeepgramClient(json.loads(raw)["api_key"])
    return _deepgram
def handler(event, context):
    """Triggered by an S3 event when a call recording is uploaded.

    Expects object keys shaped like
    ``recordings/<clinic_id>/<date>/<call_id>.wav``. Any failure is re-raised
    so Lambda's retry policy / DLQ picks the event up.
    """
    for record in event["Records"]:
        bucket = record["s3"]["bucket"]["name"]
        # BUG FIX: S3 event notifications URL-encode the object key (spaces
        # arrive as '+', non-ASCII is percent-encoded). Decode it before
        # using it as a real S3 key, or downloads of such files 404.
        key = unquote_plus(record["s3"]["object"]["key"])
        # e.g. key = "recordings/CLN-001/2026-04-16/call-abc123.wav"
        parts = key.split("/")
        if len(parts) < 2:
            # Guard against stray objects (e.g. test uploads at bucket root)
            # instead of crashing on an IndexError.
            print(f"Skipping object with unexpected key layout: {key}")
            continue
        clinic_id = parts[1]
        call_id = parts[-1].replace(".wav", "").replace(".mp3", "")
        try:
            transcript = transcribe_recording(bucket, key)
            store_transcript(clinic_id, call_id, key, transcript)
            trigger_quality_scoring(clinic_id, call_id)
        except Exception as e:
            print(f"Transcription failed for {key}: {e}")
            raise  # re-raise to trigger DLQ
def transcribe_recording(bucket: str, key: str) -> dict:
    """Transcribe an S3 recording with DeepGram; return the raw response dict.

    A presigned URL is generated so DeepGram fetches the audio directly from
    S3 — the file never passes through the Lambda, keeping memory and runtime
    flat regardless of recording size.
    """
    url = s3.generate_presigned_url(
        "get_object",
        Params={"Bucket": bucket, "Key": key},
        ExpiresIn=3600,  # DeepGram must fetch the file within the hour
    )
    dg = get_deepgram()
    options = PrerecordedOptions(
        model="nova-2",      # best accuracy model
        language="en-US",
        smart_format=True,   # proper punctuation and formatting
        diarize=True,        # separate agent vs patient speech
        punctuate=True,
        utterances=True,     # split into utterances with timestamps
        sentiment=True,      # sentiment per utterance
        topics=True,         # auto-detect conversation topics
        intents=True,        # detect caller intents
        # BUG FIX: PrerecordedOptions has no `custom_vocabulary` field —
        # DeepGram's term-boosting option is `keywords` (entries may be
        # "term" or "term:boost").
        keywords=[           # optometry-specific terms
            "refraction", "visual acuity", "ophthalmologist",
            "optometrist", "diopter", "astigmatism", "glaucoma",
            "macular degeneration", "retinal exam", "slit lamp",
        ],
    )
    response = dg.listen.prerecorded.v("1").transcribe_url(
        {"url": url}, options
    )
    return response.to_dict()
def _dynamo_safe(value):
    """Recursively convert floats to Decimal — boto3 rejects Python floats."""
    if isinstance(value, float):
        return Decimal(str(value))
    if isinstance(value, list):
        return [_dynamo_safe(v) for v in value]
    if isinstance(value, dict):
        return {k: _dynamo_safe(v) for k, v in value.items()}
    return value


def store_transcript(clinic_id: str, call_id: str, s3_key: str, result: dict) -> None:
    """Store structured transcript data in DynamoDB.

    `result` is the raw DeepGram prerecorded response as a dict. Raises
    KeyError/IndexError if the expected channels/alternatives structure is
    missing — deliberately, so the handler's DLQ path catches bad responses.
    """
    channel = result["results"]["channels"][0]
    alternative = channel["alternatives"][0]

    # Extract utterances with speaker labels
    utterances = []
    for utterance in result["results"].get("utterances", []):
        utterances.append({
            "speaker": utterance["speaker"],  # 0 = agent, 1 = patient (heuristic)
            "text": utterance["transcript"],
            # Numeric fields stored as strings — DynamoDB rejects floats.
            "start": str(utterance["start"]),
            "end": str(utterance["end"]),
            "confidence": str(utterance["confidence"]),
            "sentiment": utterance.get("sentiment", {}).get("sentiment", "neutral"),
        })

    # Overall metrics
    words = alternative.get("words", [])
    word_count = len(words)
    duration = result["metadata"]["duration"]

    # Timezone-aware "now": datetime.utcnow() is deprecated and naive;
    # created_at therefore carries an explicit +00:00 offset.
    now = datetime.now(timezone.utc)

    table.put_item(Item={
        "PK": f"CLINIC#{clinic_id}",
        "SK": f"TRANSCRIPT#{now.strftime('%Y-%m-%d')}#{call_id}",
        "GSI1PK": f"CALL#{call_id}",
        "GSI1SK": "TRANSCRIPT",
        "type": "TRANSCRIPT",
        "call_id": call_id,
        "clinic_id": clinic_id,
        "s3_key": s3_key,
        "full_transcript": alternative["transcript"],
        "utterances": utterances,
        "word_count": word_count,
        "duration_seconds": str(duration),
        "confidence": str(alternative["confidence"]),
        # BUG FIX: DeepGram topic/intent segments carry float confidence
        # scores; put_item raises "Float types are not supported" on them,
        # so sanitize the nested structures to Decimal first.
        "topics": _dynamo_safe(result["results"].get("topics", {}).get("segments", [])),
        "intents": _dynamo_safe(result["results"].get("intents", {}).get("segments", [])),
        "created_at": now.isoformat(),
    })

# Real-Time Streaming Transcription
For live call monitoring, use DeepGram's WebSocket streaming API. Because Lambda's 15-minute timeout rules out long-lived WebSocket connections, run this as a persistent service (EC2/ECS) fed from the call's audio stream:
# For real-time streaming — runs as a persistent process (e.g., EC2/ECS)
# Not Lambda (which has a 15-min timeout and doesn't support long-lived WebSockets)
import asyncio
from deepgram import DeepgramClient, LiveOptions, LiveTranscriptionEvents
async def stream_call(audio_stream, call_id: str, on_transcript):
    """Pipe live audio chunks to DeepGram and forward transcripts.

    audio_stream: async iterable yielding raw audio chunks (e.g. read from
        Kinesis Video Streams). Chunk encoding/sample-rate must match what
        the DeepGram connection expects — TODO confirm against the KVS reader.
    on_transcript: async callback invoked for every interim and final result.
    NOTE(review): DEEPGRAM_API_KEY is assumed to be defined at module scope.
    """
    dg = DeepgramClient(DEEPGRAM_API_KEY)
    connection = dg.listen.asynclive.v("1")

    # Handler signature: the SDK passes the connection itself as first arg.
    async def on_message(self, result, **kwargs):
        sentence = result.channel.alternatives[0].transcript
        if sentence:
            await on_transcript({
                "call_id": call_id,
                "transcript": sentence,
                "is_final": result.is_final,  # False for interim results
                # Speaker label of the first word; defaults to 0 when the
                # alternative carries no word-level data.
                "speaker": result.channel.alternatives[0].words[0].speaker if result.channel.alternatives[0].words else 0,
                "timestamp": result.start
            })

    # Register the handler BEFORE starting the connection so no early
    # transcript events are dropped.
    connection.on(LiveTranscriptionEvents.Transcript, on_message)
    options = LiveOptions(
        model="nova-2",
        language="en-US",
        smart_format=True,
        diarize=True,
        interim_results=True,
        utterance_end_ms=1000,
        vad_events=True,  # voice activity detection
        endpointing=300  # ms of silence = end of utterance
    )
    await connection.start(options)
    # Feed audio chunks from Amazon Connect media stream
    async for chunk in audio_stream:
        await connection.send(chunk)
    # Flush buffered audio and close the WebSocket cleanly.
    await connection.finish()

# Connecting Amazon Connect Media Streaming
Enable real-time audio streaming in your contact flow:
{
"Type": "StartMediaStreaming",
"Parameters": {
"MediaStreamingStartCondition": "BeforeTransfer",
"MediaStreams": [{
"Type": "AUDIO",
"Participants": ["CUSTOMER", "AGENT"]
}]
}
}

Amazon Connect streams audio to Kinesis Video Streams. Your processing service reads from KVS and pipes chunks to DeepGram.
Speaker Diarization in Practice
DeepGram labels speakers as 0, 1, 2, etc. For a two-party call:
- Speaker `0` is typically the first voice detected (usually the IVR/agent greeting)
- Speaker `1` is the patient
Post-process to map speaker numbers to roles:
def assign_speaker_roles(utterances: list, agent_phone: str) -> list:
    """
    Label each utterance with a "role" of "agent" or "patient".

    Heuristic: whoever speaks first is treated as the agent (the greeting
    side of the call). A production system would resolve roles from the
    Connect ANI/DNIS metadata instead.
    """
    if not utterances:
        return utterances
    agent_id = utterances[0]["speaker"]  # first voice heard = agent
    labeled = []
    for u in utterances:
        role = "agent" if u["speaker"] == agent_id else "patient"
        labeled.append({**u, "role": role})
    return labeled

# Extracting Key Information
Use the transcript to extract structured data:
def extract_call_summary(transcript: str, intents: list) -> dict:
    """
    Extract structured boolean flags from a call transcript.

    Combines simple keyword matching over the transcript text with
    DeepGram's intent-detection segments. In production, pass this to an
    LLM for better extraction.

    Returns a dict that always contains the same keys (stable schema).
    """
    summary = {
        "appointment_requested": False,
        "insurance_mentioned": False,
        "callback_requested": False,
        "complaint": False,
        # BUG FIX: previously only added when triggered, which made the
        # result schema vary between calls — initialize it up front.
        "cancellation_intent": False,
    }
    lower = transcript.lower()
    if any(w in lower for w in ["appointment", "schedule", "book", "available"]):
        summary["appointment_requested"] = True
    if any(w in lower for w in ["insurance", "coverage", "deductible", "copay"]):
        summary["insurance_mentioned"] = True
    # BUG FIX: these two flags were declared but never set in the original.
    if any(w in lower for w in ["call me back", "call back", "callback"]):
        summary["callback_requested"] = True
    if any(w in lower for w in ["complaint", "complain", "unhappy", "frustrated"]):
        summary["complaint"] = True
    # Use DeepGram's intent detection
    for segment in intents:
        for intent in segment.get("intents", []):
            if intent["intent"] == "cancel":
                summary["cancellation_intent"] = True
    return summary

# Cost Optimization
DeepGram charges per minute of audio. For a call center handling 500 calls/day averaging 4 minutes:
500 × 4 × $0.0043 = $8.60/day = ~$258/month
Compare to AWS Transcribe: 500 × 4 × $0.024 = $48/day = ~$1,440/month
Tip: Store transcripts in DynamoDB and S3. Never re-transcribe a recording — check for an existing transcript first.
WebSocket & Real-Time Knowledge Check
5 questions · Test what you just learned · Instant explanations
Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.