AI Systems · advanced
LLM Cost Breakdown and Optimization
How LLM costs are structured, how to estimate them, and practical strategies for reducing API costs without sacrificing quality in production systems.
Asma Hafeez Khan · May 16, 2026 · 7 min read
LLM · Cost Optimization · Caching · Model Selection · Production
How LLM Costs Work
API-based LLMs charge per token. Understanding the token economics is essential for production planning:
Python
# Approximate pricing (2025 — verify current pricing)
# One row per model:
#   (model name, $ per million input tokens, $ per million output tokens,
#    context window in tokens)
_PRICING_ROWS = (
    ("gpt-4o", 2.50, 10.00, 128_000),
    ("gpt-4o-mini", 0.15, 0.60, 128_000),
    ("claude-opus-4-7", 15.00, 75.00, 200_000),
    ("claude-sonnet-4-6", 3.00, 15.00, 200_000),
    ("claude-haiku-4-5", 0.80, 4.00, 200_000),
    ("gemini-1.5-pro", 3.50, 10.50, 1_000_000),
)

# Lookup table: model name -> {"input", "output", "context_window"}.
LLM_PRICING = {
    name: {"input": input_price, "output": output_price, "context_window": window}
    for name, input_price, output_price, window in _PRICING_ROWS
}
def estimate_api_cost(
    model: str,
    input_tokens_per_request: int,
    output_tokens_per_request: int,
    requests_per_month: int,
) -> dict:
    """Project the monthly API bill for one model and a fixed per-request token profile.

    Prices are looked up in ``LLM_PRICING`` (dollars per million tokens).

    Args:
        model: Key into ``LLM_PRICING``.
        input_tokens_per_request: Prompt tokens sent with each request.
        output_tokens_per_request: Completion tokens returned by each request.
        requests_per_month: Number of requests in the month.

    Returns:
        A dict with the model name, the request count, the monthly input and
        output token totals, and the input / output / total cost, each cost
        formatted as a ``"$x.xx"`` string.

    Raises:
        ValueError: If ``model`` has no entry in ``LLM_PRICING``.
    """
    rates = LLM_PRICING.get(model)
    if not rates:
        raise ValueError(f"Unknown model: {model}")

    def _dollars(token_count: int, rate_per_million: float) -> float:
        # Rates are quoted per million tokens.
        return (token_count / 1_000_000) * rate_per_million

    tokens_in = input_tokens_per_request * requests_per_month
    tokens_out = output_tokens_per_request * requests_per_month
    cost_in = _dollars(tokens_in, rates["input"])
    cost_out = _dollars(tokens_out, rates["output"])
    cost_all = cost_in + cost_out

    return {
        "model": model,
        "monthly_requests": requests_per_month,
        "monthly_input_tokens": tokens_in,
        "monthly_output_tokens": tokens_out,
        "input_cost": f"${cost_in:.2f}",
        "output_cost": f"${cost_out:.2f}",
        "total_monthly_cost": f"${cost_all:.2f}",
    }
# Example: drug interaction chatbot
# Workload: 100,000 requests per month, each sending 800 input tokens and
# receiving 300 output tokens, priced at the gpt-4o rates.
estimate = estimate_api_cost(
    model="gpt-4o",
    input_tokens_per_request=800, # System prompt + user message
    output_tokens_per_request=300, # Average response
    requests_per_month=100_000,
)
# total_monthly_cost is returned already formatted as a "$x.xx" string.
print(f"Drug interaction chatbot (gpt-4o): {estimate['total_monthly_cost']}/month")
# ≈ $500/month for 100K requests (80M input tokens × $2.50/M + 30M output tokens × $10.00/M)

Token Counting
Accurate token counting before API calls helps predict costs:
Python
from functools import lru_cache

import tiktoken
from transformers import AutoTokenizer
def count_tokens_openai(text: str, model: str = "gpt-4o") -> int:
    """Count the tokens ``text`` encodes to for an OpenAI model, using tiktoken.

    Args:
        text: The text to tokenize.
        model: OpenAI model name used to select the encoding.

    Returns:
        The number of tokens in ``text``.

    Model names tiktoken has no mapping for fall back to the ``o200k_base``
    encoding (the one used by the gpt-4o family), so the count for such a
    model is an approximation rather than an error.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # encoding_for_model raises KeyError for unrecognised model names
        # (for example a model newer than the installed tiktoken release).
        encoding = tiktoken.get_encoding("o200k_base")
    return len(encoding.encode(text))
@lru_cache(maxsize=8)
def _load_hf_tokenizer(model_id: str):
    """Load the tokenizer for ``model_id`` once and reuse it on later calls."""
    return AutoTokenizer.from_pretrained(model_id)


def count_tokens_huggingface(text: str, model_id: str) -> int:
    """Count tokens for HuggingFace models.

    Args:
        text: The text to tokenize.
        model_id: Model identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The length of ``tokenizer.encode(text)``. ``encode`` is called with its
        default arguments, so any special tokens the tokenizer adds by default
        are presumably included in the count (verify per tokenizer).

    The tokenizer is loaded through a small LRU cache, so repeated calls for
    the same ``model_id`` do not reload it each time.
    """
    return len(_load_hf_tokenizer(model_id).encode(text))
def estimate_conversation_tokens(messages: list[dict], model: str = "gpt-4o") -> dict:
"""
Estimate token count for a full conversation, including message formatting overhead.
OpenAI adds overhead tokens for each message in the format.
"""
encoding = tiktoken.encoding_for_model(model)
# 3 tokens overhead per message + 3 for the reply primer
overhead_per_message = 3
reply_primer = 3
total = reply_primer
for message in messages:
total += overhead_per_message
total += len(encoding.encode(message.get("content", "")))
total += len(encoding.encode(message.get("role", "")))
return {
"total_tokens": total,
"estimated_cost_gpt4o": f"${total / 1_000_000 * 2.50:.4f}",
}Strategy 1: Model Routing
Route requests to cheaper models when they're sufficient:
Python
from openai import OpenAI
from enum import Enum
client = OpenAI()
class ComplexityLevel(str, Enum):
    """Difficulty tiers for a request; each member is also its string value."""

    # Factual lookup, simple Q&A
    SIMPLE = "simple"
    # Multi-step reasoning
    MODERATE = "moderate"
    # Clinical judgment, nuanced analysis
    COMPLEX = "complex"
def classify_request_complexity(query: str) -> ComplexityLevel:
    """Classify request complexity using a cheap model.

    Sends ``query`` to ``gpt-4o-mini`` with a system prompt asking for exactly
    one of ``simple``, ``moderate`` or ``complex``.

    Args:
        query: The user's question.

    Returns:
        The matching ``ComplexityLevel``. Falls back to
        ``ComplexityLevel.MODERATE`` when the reply is empty or is not one of
        the three labels.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheap classifier
        messages=[
            {
                "role": "system",
                "content": """Classify the complexity of this pharmacology question.
simple: Single factual lookup (drug name, basic interaction yes/no, simple definition)
moderate: Requires multi-step reasoning or synthesis of multiple facts
complex: Requires clinical judgment, patient-specific analysis, or complex pharmacokinetics
Return ONLY: simple, moderate, or complex""",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=10,
        temperature=0,
    )
    # content can be None; treat that as an unrecognised reply.
    raw = response.choices[0].message.content or ""
    # Tolerate surrounding whitespace, quotes and a trailing period ("Simple.").
    label = raw.strip().strip(".!\"'").lower()
    try:
        return ComplexityLevel(label)
    except ValueError:
        # Anything that is not one of the three labels takes the middle tier.
        return ComplexityLevel.MODERATE
# Model chosen for each complexity tier; the trailing comments give the
# input price per million tokens.
MODEL_FOR_COMPLEXITY = {}
MODEL_FOR_COMPLEXITY[ComplexityLevel.SIMPLE] = "gpt-4o-mini"  # $0.15/M input
MODEL_FOR_COMPLEXITY[ComplexityLevel.MODERATE] = "gpt-4o"  # $2.50/M input
MODEL_FOR_COMPLEXITY[ComplexityLevel.COMPLEX] = "claude-sonnet-4-6"  # $3.00/M input
def routed_query(query: str) -> dict:
"""Route query to appropriate model based on complexity."""
complexity = classify_request_complexity(query)
model = MODEL_FOR_COMPLEXITY[complexity]
response = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": query}],
temperature=0,
)
return {
"complexity": complexity.value,
"model_used": model,
"response": response.choices[0].message.content,
"tokens_used": response.usage.total_tokens,
}Strategy 2: Prompt Caching
Prefix caching stores computed KV cache for repeated system prompts:
Python
# OpenAI prompt caching (automatically applied for prompts over 1024 tokens)
# Claude supports explicit cache control
import anthropic
claude_client = anthropic.Anthropic()
# Large system prompt that stays constant across many requests.
# The placeholder paragraph is repeated 50 times to simulate a large static
# prompt; the one-line role statement appears once, at the start.
_KNOWLEDGE_BASE_PLACEHOLDER = """
[Extensive clinical knowledge base, drug monographs, dosing tables, etc.
This would be thousands of tokens in production]
"""
LARGE_CLINICAL_SYSTEM = """You are a clinical pharmacist...""" + (
    _KNOWLEDGE_BASE_PLACEHOLDER * 50
)
def cached_clinical_query(question: str) -> str:
    """Ask Claude a question with the large system prompt marked for caching.

    The system prompt is sent as one text block carrying an ``ephemeral``
    ``cache_control`` marker. After the call, the cache-read and cache-write
    token counts from the response usage are printed.

    Args:
        question: The user's question, sent as a single user message.

    Returns:
        The text of the first content block of the response.
    """
    cached_system_block = {
        "type": "text",
        "text": LARGE_CLINICAL_SYSTEM,
        "cache_control": {"type": "ephemeral"},  # Cache this block
    }
    reply = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=[cached_system_block],
        messages=[{"role": "user", "content": question}],
    )

    # Report cache usage from the response metadata.
    stats = reply.usage
    print(f"Cache read: {stats.cache_read_input_tokens} tokens (saved!)")
    print(f"Cache write: {stats.cache_creation_input_tokens} tokens (new cache)")
    return reply.content[0].text
# First call: writes the cache (cache-write tokens are billed at a premium, about 25% above the normal input price)
# Subsequent calls within 5 minutes: cache_read_input_tokens priced at 10% of normal
# Savings: up to 90% on cached tokens for repeated requests with the same system prompt

Strategy 3: Response Caching
Cache LLM responses for identical or semantically similar queries:
Python
import hashlib
import json
import redis
from datetime import timedelta
class LLMResponseCache:
"""Cache LLM responses to avoid redundant API calls."""
def __init__(self, redis_url: str = "redis://localhost:6379", ttl_hours: int = 24):
self.cache = redis.from_url(redis_url)
self.ttl = timedelta(hours=ttl_hours)
self.hits = 0
self.misses = 0
def _cache_key(self, model: str, messages: list[dict], temperature: float) -> str:
"""Create deterministic cache key from request parameters."""
content = json.dumps({
"model": model,
"messages": messages,
"temperature": temperature,
}, sort_keys=True)
return f"llm_cache:{hashlib.sha256(content.encode()).hexdigest()}"
def get(self, model: str, messages: list[dict], temperature: float = 0) -> str | None:
"""Retrieve cached response if available."""
if temperature > 0:
return None # Don't cache non-deterministic responses
key = self._cache_key(model, messages, temperature)
cached = self.cache.get(key)
if cached:
self.hits += 1
return json.loads(cached)["response"]
self.misses += 1
return None
def set(self, model: str, messages: list[dict], temperature: float, response: str) -> None:
"""Cache a response."""
if temperature > 0:
return # Don't cache non-deterministic responses
key = self._cache_key(model, messages, temperature)
value = json.dumps({"response": response, "model": model})
self.cache.setex(key, self.ttl, value)
def get_stats(self) -> dict:
total = self.hits + self.misses
return {
"hit_rate": self.hits / total if total > 0 else 0,
"total_requests": total,
"estimated_savings": f"${self.hits * 0.003:.2f}", # Rough estimate
}
# Module-level cache instance used by cached_api_call; built with the
# constructor defaults (Redis at redis://localhost:6379, 24-hour TTL).
cache = LLMResponseCache()
def cached_api_call(model: str, messages: list[dict], temperature: float = 0) -> str:
"""LLM call with caching."""
# Check cache first
cached_response = cache.get(model, messages, temperature)
if cached_response:
return cached_response
# Make API call
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
)
result = response.choices[0].message.content
# Cache the result
cache.set(model, messages, temperature, result)
return resultStrategy 4: Output Truncation and Streaming
Pay for fewer output tokens by limiting response length:
Python
def cost_optimized_response(
    query: str,
    max_tokens: int = 150,  # Limit response length
    model: str = "gpt-4o-mini",
) -> str:
    """Answer ``query`` with a capped, deliberately short completion.

    The output is limited twice: the system prompt asks for fewer than
    ``max_tokens // 4`` words, and ``max_tokens`` is passed to the API as the
    hard cap.

    Args:
        query: The user's question.
        max_tokens: Hard cap on completion tokens.
        model: Model to call.

    Returns:
        The content of the first choice's message.
    """
    word_budget = max_tokens // 4
    prompt = [
        {
            "role": "system",
            "content": f"Answer concisely in under {word_budget} words.",
        },
        {"role": "user", "content": query},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=prompt,
        max_tokens=max_tokens,
        temperature=0,
    )
    return completion.choices[0].message.content
def stream_response(query: str, model: str = "gpt-4o") -> None:
"""Stream response to reduce time-to-first-token (not a cost saving, but better UX)."""
with client.chat.completions.stream(
model=model,
messages=[{"role": "user", "content": query}],
) as stream:
for chunk in stream:
if chunk.choices[0].delta.content:
print(chunk.choices[0].delta.content, end="", flush=True)
print() # Newline at endCost Monitoring Dashboard
Python
from datetime import datetime
class CostMonitor:
"""Track and alert on LLM API spending."""
def __init__(self, daily_budget_usd: float = 100.0):
self.daily_budget = daily_budget_usd
self.daily_spend = 0.0
self.request_log = []
def record_request(
self,
model: str,
input_tokens: int,
output_tokens: int,
feature: str = "unknown",
) -> dict:
pricing = LLM_PRICING.get(model, {"input": 0, "output": 0})
cost = (
input_tokens / 1_000_000 * pricing["input"] +
output_tokens / 1_000_000 * pricing["output"]
)
self.daily_spend += cost
self.request_log.append({
"timestamp": datetime.now().isoformat(),
"model": model,
"feature": feature,
"input_tokens": input_tokens,
"output_tokens": output_tokens,
"cost": cost,
})
if self.daily_spend > self.daily_budget * 0.8:
print(f"WARNING: Daily spend ${self.daily_spend:.2f} exceeds 80% of ${self.daily_budget} budget")
return {"cost": cost, "daily_total": self.daily_spend}
def get_daily_breakdown(self) -> dict:
"""Breakdown costs by feature."""
by_feature = {}
for req in self.request_log:
feature = req["feature"]
if feature not in by_feature:
by_feature[feature] = {"cost": 0.0, "requests": 0, "tokens": 0}
by_feature[feature]["cost"] += req["cost"]
by_feature[feature]["requests"] += 1
by_feature[feature]["tokens"] += req["input_tokens"] + req["output_tokens"]
return {
"total_daily_spend": f"${self.daily_spend:.4f}",
"budget_remaining": f"${self.daily_budget - self.daily_spend:.4f}",
"by_feature": by_feature,
}Found this helpful?
Leave a comment
Have a question, correction, or just found this helpful? Leave a note below.