GenAI & LLM Interviews · Lesson 6 of 30
LLM Cost Breakdown: Tokens, Caching, Routing
How LLM Costs Work
API-based LLMs charge per token. Understanding the token economics is essential for production planning:
Python
# Approximate pricing (2025 — verify current pricing).
# "input" and "output" are $ per million tokens; "context_window" is the
# model's context size (presumably in tokens — confirm against provider docs).
LLM_PRICING = {
    "gpt-4o": {"input": 2.50, "output": 10.00, "context_window": 128_000},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60, "context_window": 128_000},
    "claude-opus-4-7": {"input": 15.00, "output": 75.00, "context_window": 200_000},
    "claude-sonnet-4-6": {"input": 3.00, "output": 15.00, "context_window": 200_000},
    "claude-haiku-4-5": {"input": 0.80, "output": 4.00, "context_window": 200_000},
    "gemini-1.5-pro": {"input": 3.50, "output": 10.50, "context_window": 1_000_000},
}
def estimate_api_cost(
    model: str,
    input_tokens_per_request: int,
    output_tokens_per_request: int,
    requests_per_month: int,
) -> dict:
    """Estimate monthly API cost for a given workload.

    Args:
        model: Key into ``LLM_PRICING``.
        input_tokens_per_request: Prompt tokens sent with each request.
        output_tokens_per_request: Completion tokens returned per request.
        requests_per_month: Number of requests in the month.

    Returns:
        A dict with the model name, the request count, the monthly input and
        output token totals (ints), and ``input_cost`` / ``output_cost`` /
        ``total_monthly_cost`` as ``"$X.XX"`` strings.

    Raises:
        ValueError: If any count is negative, or if ``model`` has no entry in
            ``LLM_PRICING``.
    """
    # A negative count would silently yield a negative "cost"; reject it early.
    counts = (
        ("input_tokens_per_request", input_tokens_per_request),
        ("output_tokens_per_request", output_tokens_per_request),
        ("requests_per_month", requests_per_month),
    )
    for label, value in counts:
        if value < 0:
            raise ValueError(f"{label} must be non-negative, got {value}")

    pricing = LLM_PRICING.get(model)
    if not pricing:
        raise ValueError(f"Unknown model: {model}")

    monthly_input_tokens = input_tokens_per_request * requests_per_month
    monthly_output_tokens = output_tokens_per_request * requests_per_month

    # Prices are quoted per million tokens.
    input_cost = (monthly_input_tokens / 1_000_000) * pricing["input"]
    output_cost = (monthly_output_tokens / 1_000_000) * pricing["output"]
    total_cost = input_cost + output_cost

    return {
        "model": model,
        "monthly_requests": requests_per_month,
        "monthly_input_tokens": monthly_input_tokens,
        "monthly_output_tokens": monthly_output_tokens,
        "input_cost": f"${input_cost:.2f}",
        "output_cost": f"${output_cost:.2f}",
        "total_monthly_cost": f"${total_cost:.2f}",
    }
# Example: drug interaction chatbot
# Each request sends 800 input tokens and receives 300 output tokens,
# at 100,000 requests per month on gpt-4o.
estimate = estimate_api_cost(
    model="gpt-4o",
    input_tokens_per_request=800,  # System prompt + user message
    output_tokens_per_request=300,  # Average response
    requests_per_month=100_000,
)
# total_monthly_cost is already a formatted "$..." string, so it is printed as-is.
print(f"Drug interaction chatbot (gpt-4o): {estimate['total_monthly_cost']}/month")
# ≈ $500/month for 100K requests (80M input tokens × $2.50/M = $200, plus 30M output tokens × $10.00/M = $300)
Token Counting
Accurate token counting before API calls helps predict costs:
Python
import tiktoken
from transformers import AutoTokenizer
def count_tokens_openai(text: str, model: str = "gpt-4o") -> int:
    """Count tokens for OpenAI models using tiktoken.

    Args:
        text: The text to tokenize.
        model: OpenAI model name used to pick the encoding.

    Returns:
        The number of tokens ``text`` encodes to.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken has no mapping for this model name (e.g. a model newer than
        # the installed tiktoken). Fall back to the gpt-4o encoding so the
        # caller still gets an estimate instead of a crash.
        encoding = tiktoken.get_encoding("o200k_base")
    return len(encoding.encode(text))
# Tokenizers keyed by model id. Loading one reads from disk or the network,
# so each is loaded once and reused.
_HF_TOKENIZER_CACHE: dict = {}


def count_tokens_huggingface(text: str, model_id: str) -> int:
    """Count tokens for HuggingFace models.

    Args:
        text: The text to tokenize.
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The length of ``tokenizer.encode(text)``.
        NOTE(review): this presumably includes any special tokens the
        tokenizer adds by default — confirm if an exact content-only count
        is needed.
    """
    tokenizer = _HF_TOKENIZER_CACHE.get(model_id)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        _HF_TOKENIZER_CACHE[model_id] = tokenizer
    return len(tokenizer.encode(text))
def estimate_conversation_tokens(messages: list[dict], model: str = "gpt-4o") -> dict:
    """Estimate the token count of a full conversation, including format overhead.

    OpenAI adds overhead tokens for each message in the chat format, plus a
    fixed primer for the assistant's reply.

    Args:
        messages: Chat messages as dicts with ``role`` and ``content`` keys and
            an optional ``name`` key. ``content`` is expected to be a string
            or ``None``.
        model: OpenAI model name used to pick the tiktoken encoding.

    Returns:
        A dict with ``total_tokens`` (int) and ``estimated_cost_gpt4o``, the
        input cost of that many tokens at $2.50 per million as a ``"$X.XXXX"``
        string.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the gpt-4o encoding.
        encoding = tiktoken.get_encoding("o200k_base")

    # 3 tokens overhead per message + 3 for the reply primer;
    # a "name" field costs 1 extra token on top of its own text.
    overhead_per_message = 3
    overhead_per_name = 1
    reply_primer = 3

    total = reply_primer
    for message in messages:
        total += overhead_per_message
        # `or ""` also covers an explicit None (e.g. tool-call messages carry
        # content=None), which encode() would otherwise reject.
        total += len(encoding.encode(message.get("content") or ""))
        total += len(encoding.encode(message.get("role") or ""))
        name = message.get("name")
        if name:
            total += overhead_per_name + len(encoding.encode(name))

    return {
        "total_tokens": total,
        "estimated_cost_gpt4o": f"${total / 1_000_000 * 2.50:.4f}",
    }


# Strategy 1: Model Routing
Route requests to cheaper models when they're sufficient:
Python
from openai import OpenAI
from enum import Enum
# Shared OpenAI client reused by the chat-completion helpers in this lesson.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment — confirm before running.
client = OpenAI()
class ComplexityLevel(str, Enum):
    """Complexity tiers used to pick a model for a request.

    Members are also ``str`` instances, so they compare equal to their value.
    """

    SIMPLE = "simple"  # Factual lookup, simple Q&A
    MODERATE = "moderate"  # Multi-step reasoning
    COMPLEX = "complex"  # Clinical judgment, nuanced analysis
def classify_request_complexity(query: str) -> ComplexityLevel:
    """Classify request complexity using a cheap model.

    Args:
        query: The user's pharmacology question.

    Returns:
        The ``ComplexityLevel`` named by the classifier's reply, or
        ``ComplexityLevel.MODERATE`` when the reply is empty or is not one of
        the three expected labels.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheap classifier
        messages=[
            {
                "role": "system",
                "content": """Classify the complexity of this pharmacology question.
simple: Single factual lookup (drug name, basic interaction yes/no, simple definition)
moderate: Requires multi-step reasoning or synthesis of multiple facts
complex: Requires clinical judgment, patient-specific analysis, or complex pharmacokinetics
Return ONLY: simple, moderate, or complex""",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=10,
        temperature=0,
    )
    # content may be None; the model may also decorate the label ("Simple.",
    # '"complex"'), so strip whitespace, quotes and a trailing period first.
    raw = response.choices[0].message.content or ""
    label = raw.lower().strip(" \t\r\n.\"'`")
    try:
        return ComplexityLevel(label)
    except ValueError:
        # Unrecognised reply: default to the middle tier.
        return ComplexityLevel.MODERATE
# Routing table: complexity tier -> model name used for that tier.
# NOTE(review): the COMPLEX tier names a Claude model while the other tiers name
# OpenAI models, so whoever consumes this table must be able to reach both
# providers.
MODEL_FOR_COMPLEXITY = {
    ComplexityLevel.SIMPLE: "gpt-4o-mini",  # $0.15/M input
    ComplexityLevel.MODERATE: "gpt-4o",  # $2.50/M input
    ComplexityLevel.COMPLEX: "claude-sonnet-4-6",  # $3.00/M input
}
# Lazily created Anthropic client, used only when a Claude model is routed to.
_routing_anthropic_client = None


def _ask_openai(model: str, query: str) -> tuple:
    """Send ``query`` to an OpenAI chat model; return (answer text, total tokens)."""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
        temperature=0,
    )
    return response.choices[0].message.content, response.usage.total_tokens


def _ask_anthropic(model: str, query: str) -> tuple:
    """Send ``query`` to a Claude model; return (answer text, total tokens)."""
    global _routing_anthropic_client
    if _routing_anthropic_client is None:
        # Local import keeps this block self-contained.
        import anthropic

        _routing_anthropic_client = anthropic.Anthropic()
    response = _routing_anthropic_client.messages.create(
        model=model,
        max_tokens=1024,  # The Messages API requires an explicit output cap.
        temperature=0,
        messages=[{"role": "user", "content": query}],
    )
    usage = response.usage
    # Anthropic reports input and output separately; sum them to match the
    # OpenAI total_tokens figure.
    return response.content[0].text, usage.input_tokens + usage.output_tokens


def routed_query(query: str) -> dict:
    """Route query to appropriate model based on complexity.

    The query is classified first, the tier is mapped to a model through
    ``MODEL_FOR_COMPLEXITY``, and the request goes to the provider that serves
    that model. Claude model names cannot be served by the OpenAI client, so
    they are sent through the Anthropic client instead.

    Args:
        query: The user's question.

    Returns:
        A dict with ``complexity`` (the tier's string value), ``model_used``,
        ``response`` (the answer text) and ``tokens_used`` (input + output).
    """
    complexity = classify_request_complexity(query)
    model = MODEL_FOR_COMPLEXITY[complexity]

    if model.startswith("claude"):
        answer, tokens_used = _ask_anthropic(model, query)
    else:
        answer, tokens_used = _ask_openai(model, query)

    return {
        "complexity": complexity.value,
        "model_used": model,
        "response": answer,
        "tokens_used": tokens_used,
    }


# Strategy 2: Prompt Caching
Prefix caching stores the computed KV cache for a repeated prompt prefix (such as a long system prompt), so later requests that share that prefix do not recompute it:
Python
# OpenAI prompt caching (automatically applied for prompts over 1024 tokens)
# Claude supports explicit cache control
import anthropic
# Anthropic client used by cached_clinical_query.
claude_client = anthropic.Anthropic()
# Large system prompt that stays constant across many requests.
# `*` binds tighter than `+`, so only the bracketed placeholder block is
# repeated 50 times; the "You are a clinical pharmacist..." opener appears once.
LARGE_CLINICAL_SYSTEM = """You are a clinical pharmacist...""" + """
[Extensive clinical knowledge base, drug monographs, dosing tables, etc.
This would be thousands of tokens in production]
""" * 50 # Simulating a large static system prompt
def cached_clinical_query(question: str) -> str:
    """Use Claude's cache_control to cache the system prompt.

    Sends ``question`` with ``LARGE_CLINICAL_SYSTEM`` as a cacheable system
    block and prints how many input tokens were read from, and written to,
    the prompt cache.

    Args:
        question: The user's question.

    Returns:
        The text of the first content block in the response.
    """
    response = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=[
            {
                "type": "text",
                "text": LARGE_CLINICAL_SYSTEM,
                "cache_control": {"type": "ephemeral"},  # Cache this block
            }
        ],
        messages=[{"role": "user", "content": question}],
    )

    # Check cache usage in response metadata. The cache counters may be
    # missing or None, so report those cases as 0 rather than "None tokens".
    usage = response.usage
    cache_read = getattr(usage, "cache_read_input_tokens", None) or 0
    cache_write = getattr(usage, "cache_creation_input_tokens", None) or 0
    print(f"Cache read: {cache_read} tokens (saved!)")
    print(f"Cache write: {cache_write} tokens (new cache)")

    return response.content[0].text
# First call: pays to write the cache (cache writes are billed at a premium over normal input tokens)
# Subsequent calls within 5 minutes: cache_read_input_tokens priced at 10% of normal
# Savings: up to 90% on cached tokens for repeated requests with the same system prompt
Strategy 3: Response Caching
Cache LLM responses for identical or semantically similar queries:
Python
import hashlib
import json
import redis
from datetime import timedelta
class LLMResponseCache:
"""Cache LLM responses to avoid redundant API calls."""
def __init__(self, redis_url: str = "redis://localhost:6379", ttl_hours: int = 24):
self.cache = redis.from_url(redis_url)
self.ttl = timedelta(hours=ttl_hours)
self.hits = 0
self.misses = 0
def _cache_key(self, model: str, messages: list[dict], temperature: float) -> str:
"""Create deterministic cache key from request parameters."""
content = json.dumps({
"model": model,
"messages": messages,
"temperature": temperature,
}, sort_keys=True)
return f"llm_cache:{hashlib.sha256(content.encode()).hexdigest()}"
def get(self, model: str, messages: list[dict], temperature: float = 0) -> str | None:
"""Retrieve cached response if available."""
if temperature > 0:
return None # Don't cache non-deterministic responses
key = self._cache_key(model, messages, temperature)
cached = self.cache.get(key)
if cached:
self.hits += 1
return json.loads(cached)["response"]
self.misses += 1
return None
def set(self, model: str, messages: list[dict], temperature: float, response: str) -> None:
"""Cache a response."""
if temperature > 0:
return # Don't cache non-deterministic responses
key = self._cache_key(model, messages, temperature)
value = json.dumps({"response": response, "model": model})
self.cache.setex(key, self.ttl, value)
def get_stats(self) -> dict:
total = self.hits + self.misses
return {
"hit_rate": self.hits / total if total > 0 else 0,
"total_requests": total,
"estimated_savings": f"${self.hits * 0.003:.2f}", # Rough estimate
}
# Module-level cache instance shared by cached_api_call, built with the default
# Redis URL and 24-hour TTL from LLMResponseCache.__init__.
cache = LLMResponseCache()
def cached_api_call(model: str, messages: list[dict], temperature: float = 0) -> str:
    """LLM call with caching.

    Args:
        model: Model name passed to the chat completions API.
        messages: Chat messages for the request.
        temperature: Sampling temperature. The cache only stores and serves
            responses when this is not greater than 0.

    Returns:
        The cached response when one exists, otherwise the fresh response
        text from the API.
    """
    # Check cache first. Compare against None explicitly: a cached empty
    # string is a valid hit and must not trigger a second API call.
    cached_response = cache.get(model, messages, temperature)
    if cached_response is not None:
        return cached_response

    # Make API call
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    result = response.choices[0].message.content

    # Cache the result
    cache.set(model, messages, temperature, result)
    return result


# Strategy 4: Output Truncation and Streaming
Pay for fewer output tokens by limiting response length:
Python
def cost_optimized_response(
    query: str,
    max_tokens: int = 150,  # Limit response length
    model: str = "gpt-4o-mini",
) -> str:
    """Get a cost-optimized response with shorter output.

    The model is asked to stay under a word budget derived from ``max_tokens``
    and is also hard-capped at ``max_tokens`` output tokens.

    Args:
        query: The user's question.
        max_tokens: Hard cap on output tokens for the completion.
        model: Model name passed to the chat completions API.

    Returns:
        The response text.
    """
    # The word budget is a quarter of the token cap. Keep it at least 1 so a
    # very small max_tokens never produces "under 0 words".
    word_limit = max(1, max_tokens // 4)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": f"Answer concisely in under {word_limit} words.",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=max_tokens,
        temperature=0,
    )
    return response.choices[0].message.content
def stream_response(query: str, model: str = "gpt-4o") -> None:
    """Stream response to reduce time-to-first-token (not a cost saving, but better UX).

    Prints each piece of text as it arrives, then a final newline.

    Args:
        query: The user's question.
        model: Model name passed to the chat completions API.
    """
    # create(stream=True) yields raw chunks exposing choices[0].delta.content.
    # The .stream() helper yields event objects instead, which do not have
    # that shape.
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
        stream=True,
    )
    for chunk in stream:
        # Some chunks carry no choices at all; skip them.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            print(piece, end="", flush=True)
    print()  # Newline at end


# Cost Monitoring Dashboard
Python
from datetime import datetime
class CostMonitor:
    """Track and alert on LLM API spending.

    Spend and the request log cover the current local calendar day only; both
    are cleared the first time the monitor is used on a new day.
    """

    def __init__(self, daily_budget_usd: float = 100.0):
        """Create a monitor with the given daily budget in US dollars."""
        self.daily_budget = daily_budget_usd
        self.daily_spend = 0.0
        self.request_log = []
        self._current_day = datetime.now().date()

    def _roll_over_if_new_day(self) -> None:
        """Reset the daily totals when the local date has changed."""
        today = datetime.now().date()
        if today != self._current_day:
            self._current_day = today
            self.daily_spend = 0.0
            self.request_log = []

    def record_request(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        feature: str = "unknown",
    ) -> dict:
        """Record one request and add its cost to today's spend.

        Args:
            model: Key into ``LLM_PRICING``. An unknown model is recorded at
                $0 and a warning is printed.
            input_tokens: Prompt tokens used by the request.
            output_tokens: Completion tokens used by the request.
            feature: Label used to group costs in ``get_daily_breakdown``.

        Returns:
            A dict with this request's ``cost`` and the running ``daily_total``.
        """
        self._roll_over_if_new_day()

        pricing = LLM_PRICING.get(model)
        if pricing is None:
            # Keep recording rather than failing, but say so: a silent $0
            # would hide real spend.
            print(f"WARNING: No pricing for model '{model}'; recording request at $0.00")
            pricing = {"input": 0, "output": 0}

        # Prices are quoted per million tokens.
        cost = (
            input_tokens / 1_000_000 * pricing["input"]
            + output_tokens / 1_000_000 * pricing["output"]
        )
        self.daily_spend += cost
        self.request_log.append({
            "timestamp": datetime.now().isoformat(),
            "model": model,
            "feature": feature,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost": cost,
        })

        if self.daily_spend > self.daily_budget * 0.8:
            print(f"WARNING: Daily spend ${self.daily_spend:.2f} exceeds 80% of ${self.daily_budget} budget")

        return {"cost": cost, "daily_total": self.daily_spend}

    def get_daily_breakdown(self) -> dict:
        """Breakdown costs by feature.

        Returns:
            A dict with ``total_daily_spend`` and ``budget_remaining`` as
            ``"$X.XXXX"`` strings, and ``by_feature`` mapping each feature to
            its summed ``cost``, ``requests`` count and ``tokens``
            (input + output).
        """
        self._roll_over_if_new_day()

        by_feature = {}
        for req in self.request_log:
            bucket = by_feature.setdefault(
                req["feature"], {"cost": 0.0, "requests": 0, "tokens": 0}
            )
            bucket["cost"] += req["cost"]
            bucket["requests"] += 1
            bucket["tokens"] += req["input_tokens"] + req["output_tokens"]

        return {
            "total_daily_spend": f"${self.daily_spend:.4f}",
            "budget_remaining": f"${self.daily_budget - self.daily_spend:.4f}",
            "by_feature": by_feature,
        }