Scenario Based Questions · Lesson 5 of 13
Scenario: Reduce LLM API Costs by 60%
The Scenario
The CTO sends an email: "Our Azure OpenAI bill is $8,000 this month. We need to get it under $3,200. Engineering, fix this by end of quarter."
This is a real conversation happening at companies everywhere. LLM costs scale with usage in a way that surprises teams who estimated based on early pilot traffic. The good news: there is typically 60-80% in savings available without degrading user experience. The bad news: you have to measure before you cut.
Step 1: Understand Where the Money Goes
Before touching a single line of code, instrument your token usage. The biggest mistake teams make is optimizing blindly.
import logging
from dataclasses import dataclass
from datetime import datetime
from openai import AzureOpenAI
# Module-level logger; log_usage() emits one record per LLM call through it.
logger = logging.getLogger("cost.tracking")
# Azure OpenAI pricing (check current pricing at azure.microsoft.com/pricing)
# Keyed by model name. Each entry holds the rate per 1,000 input (prompt)
# tokens and per 1,000 output (completion) tokens. log_usage() multiplies
# these rates by token counts to fill estimated_cost_usd, so values are USD.
# These numbers are a snapshot and must be kept in sync with the price sheet.
PRICING = {
"gpt-4o": {
"input_per_1k": 0.005,
"output_per_1k": 0.015,
},
"gpt-4o-mini": {
"input_per_1k": 0.00015,
"output_per_1k": 0.0006,
},
# Embedding models produce no completion tokens, hence the zero output rate.
"text-embedding-3-large": {
"input_per_1k": 0.00013,
"output_per_1k": 0.0,
},
}
@dataclass
class TokenUsageRecord:
    """Token counts and estimated cost for a single LLM API call.

    Field order defines the positional constructor signature, so new fields
    should be appended rather than inserted.
    """

    # Identifier of the request this usage belongs to.
    request_id: str
    # Logical feature that issued the call,
    # e.g. "rag_query", "summarization", "classification".
    endpoint: str
    # Model name; log_usage() uses it as the lookup key into PRICING.
    model: str
    # Tokens sent to the model.
    prompt_tokens: int
    # Tokens generated by the model.
    completion_tokens: int
    # Total token count as reported alongside the two counts above.
    total_tokens: int
    # Estimated cost of the call in US dollars.
    estimated_cost_usd: float
    # Time the record was created, stored as a string.
    timestamp: str
def log_usage(
    request_id: str,
    endpoint: str,
    model: str,
    usage,  # openai.types.CompletionUsage
) -> TokenUsageRecord:
    """Build, log, and return the cost record for one completed LLM call.

    Args:
        request_id: Identifier to attach to the record.
        endpoint: Logical feature that issued the call (e.g. "rag_query").
        model: Model name used to look up per-1k-token rates in PRICING.
        usage: Object exposing ``prompt_tokens``, ``completion_tokens`` and
            ``total_tokens`` attributes.

    Returns:
        The populated TokenUsageRecord, which has also been written to the
        module logger at INFO level as a dict.
    """
    # Local import: the module-level import only brings in `datetime`.
    from datetime import timezone

    pricing = PRICING.get(model)
    if pricing is None:
        # Fall back to gpt-4o rates (the most expensive chat rates in PRICING)
        # but say so, otherwise a misnamed deployment silently skews the report.
        logger.warning(
            "No pricing entry for model %r; assuming gpt-4o rates", model
        )
        pricing = {"input_per_1k": 0.005, "output_per_1k": 0.015}

    cost = (
        usage.prompt_tokens / 1000 * pricing["input_per_1k"]
        + usage.completion_tokens / 1000 * pricing["output_per_1k"]
    )
    # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the stored string keeps the same naive-UTC ISO format as before.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    record = TokenUsageRecord(
        request_id=request_id,
        endpoint=endpoint,
        model=model,
        prompt_tokens=usage.prompt_tokens,
        completion_tokens=usage.completion_tokens,
        total_tokens=usage.total_tokens,
        estimated_cost_usd=cost,
        timestamp=created_at,
    )
    logger.info(vars(record))
    return record


# After one week of logging, aggregate by endpoint:
import sqlite3
import pandas as pd
def analyze_cost_by_endpoint(db_path: str, days: int = 7) -> pd.DataFrame:
    """Aggregate logged token usage per (endpoint, model) over a recent window.

    Args:
        db_path: Path to a SQLite database containing a ``token_usage`` table.
        days: Size of the look-back window in days, counted back from the
            current UTC time.

    Returns:
        A DataFrame with columns ``endpoint``, ``model``, ``request_count``,
        ``avg_prompt_tokens``, ``avg_completion_tokens`` and
        ``total_cost_usd``, ordered by ``total_cost_usd`` descending.
    """
    # Both sides of the comparison go through datetime(): stored timestamps
    # are ISO strings with a 'T' separator while datetime('now', ...) yields a
    # space separator, and a raw string comparison would treat every row from
    # the cutoff day as newer than the cutoff regardless of its time.
    query = """
        SELECT
            endpoint,
            model,
            COUNT(*) AS request_count,
            AVG(prompt_tokens) AS avg_prompt_tokens,
            AVG(completion_tokens) AS avg_completion_tokens,
            SUM(estimated_cost_usd) AS total_cost_usd
        FROM token_usage
        WHERE datetime(timestamp) >= datetime('now', ?)
        GROUP BY endpoint, model
        ORDER BY total_cost_usd DESC
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(query, conn, params=(f"-{days} days",))
    finally:
        # Close even when the query fails (e.g. missing table).
        conn.close()
# Example output you might see:
# endpoint model requests avg_prompt avg_completion total_cost
# rag_query gpt-4o 45,200 1,850 320 $4,100
# document_summary gpt-4o 12,800 3,200 800 $2,200
# intent_classifier gpt-4o 58,000 120 15 $950
# embedding text-emb-3L 58,000 85 0 $640
This breakdown reveals two key insights:
- intent_classifier calls GPT-4o 58,000 times with only 120 prompt tokens — a massive waste. This should run on GPT-4o mini.
- document_summary has huge prompts (3,200 tokens). These need prompt compression.
Lever 1: Semantic Cache (40-70% cost reduction potential)
The semantic cache is covered in detail in the RAG latency lesson. From a cost perspective, every cache hit eliminates one LLM call. At $0.005/1k input tokens with 1,850-token prompts, each RAG query costs approximately $0.009. At 45,200 queries/week, that is approximately $410/week or $1,800/month just for RAG queries.
A 60% cache hit rate eliminates $1,080/month from that endpoint alone.
from prometheus_client import Counter, Gauge, Histogram
# Counts every LLM request attempt, labelled by endpoint, model, and whether
# it was served from cache ("hit") or went to the model ("miss").
llm_calls_total = Counter("llm_calls_total", "Total LLM calls", ["endpoint", "model", "cache_status"])
# Running total of estimated LLM spend in USD, labelled by endpoint and model.
llm_cost_total = Counter("llm_cost_usd_total", "Total LLM cost in USD", ["endpoint", "model"])
async def cost_tracked_llm_call(
    prompt: str,
    endpoint_name: str,
    model: str,
    cache,
) -> str:
    """Answer ``prompt`` from cache if possible, otherwise call the LLM.

    Every call increments ``llm_calls_total`` with a ``cache_status`` of
    "hit" or "miss". On a miss the usage is logged via ``log_usage``, the
    estimated cost is added to ``llm_cost_total``, and the answer is cached.

    Args:
        prompt: User prompt; also used as the cache key.
        endpoint_name: Label for metrics and the usage record.
        model: Model name passed to the chat completions API.
        cache: Object exposing ``get(key)`` and ``set(key, value)``.
            A falsy ``get`` result is treated as a miss.

    Returns:
        The cached or freshly generated answer text.
    """
    # Local import: asyncio is not among the module-level imports in view.
    import asyncio

    # Check cache first
    cached_answer = cache.get(prompt)
    if cached_answer:
        llm_calls_total.labels(
            endpoint=endpoint_name, model=model, cache_status="hit"
        ).inc()
        return cached_answer

    # Cache miss — call LLM
    llm_calls_total.labels(
        endpoint=endpoint_name, model=model, cache_status="miss"
    ).inc()
    # The client call is synchronous; run it in a worker thread so this
    # coroutine does not block the event loop for the whole request.
    response = await asyncio.to_thread(
        openai_client.chat.completions.create,
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    answer = response.choices[0].message.content

    # Track cost. Prefer the API's completion id so log lines can be traced
    # to a specific request; keep "auto" as the fallback.
    request_id = getattr(response, "id", None) or "auto"
    record = log_usage(request_id, endpoint_name, model, response.usage)
    llm_cost_total.labels(endpoint=endpoint_name, model=model).inc(
        record.estimated_cost_usd
    )

    # Store in cache
    cache.set(prompt, answer)
    return answer


# Lever 2: Model Routing (the biggest single win)
GPT-4o mini is approximately 30x cheaper than GPT-4o and handles simple queries just as well. The strategy: classify each query's complexity and route accordingly.
from enum import Enum
from openai import AzureOpenAI
class QueryComplexity(Enum):
    """Routing label assigned to a user query by the classifier."""

    # Cheap-model candidates.
    SIMPLE = "simple"
    # Everything else; handled by the larger model.
    COMPLEX = "complex"
# Prompt template for the complexity classifier. It has a single {query}
# placeholder, filled with str.format() in classify_query(), and instructs the
# model to reply with exactly one word: SIMPLE or COMPLEX.
CLASSIFIER_PROMPT = """Classify the following user query as SIMPLE or COMPLEX.
SIMPLE: factual lookup, yes/no question, short definition, greeting
COMPLEX: multi-step reasoning, comparison, synthesis across multiple topics, ambiguous intent
Respond with only the word SIMPLE or COMPLEX.
Query: {query}"""
def classify_query(query: str) -> QueryComplexity:
    """Classify ``query`` as SIMPLE or COMPLEX using GPT-4o mini.

    The classifier runs on the cheap model with ``max_tokens=5`` and
    ``temperature=0`` so the routing decision itself costs very little.

    Args:
        query: Raw user query inserted into CLASSIFIER_PROMPT.

    Returns:
        QueryComplexity.SIMPLE when the model's reply starts with "SIMPLE"
        (case-insensitive, surrounding whitespace ignored); otherwise
        QueryComplexity.COMPLEX, which is the safe default because it routes
        to the more capable model.
    """
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": CLASSIFIER_PROMPT.format(query=query)}
        ],
        max_tokens=5,
        temperature=0,
    )
    # `content` is optional in the API response; treat a missing reply as
    # unclassifiable instead of crashing on None.strip().
    reply = response.choices[0].message.content or ""
    label = reply.strip().upper()
    # Prefix match tolerates trailing punctuation such as "SIMPLE.", which an
    # exact comparison would silently send to the expensive model.
    if label.startswith("SIMPLE"):
        return QueryComplexity.SIMPLE
    return QueryComplexity.COMPLEX
def route_model(query: str) -> str:
    """Pick the model name that should serve ``query``.

    Queries classified as SIMPLE go to "gpt-4o-mini"; all others go to
    "gpt-4o".
    """
    is_simple = classify_query(query) == QueryComplexity.SIMPLE
    return "gpt-4o-mini" if is_simple else "gpt-4o"
async def routed_llm_call(query: str, context: str) -> dict:
    """Answer ``query`` with the cheapest model suited to its complexity.

    Args:
        query: User question, sent as the user message.
        context: Supporting text, sent as the system message.

    Returns:
        A dict with keys ``answer`` (model reply text), ``model_used``
        (the routed model name) and ``cost_usd``.
    """
    # Local import: asyncio is not among the module-level imports in view.
    import asyncio

    # Classification and completion both use the synchronous client; run
    # them in worker threads so the event loop stays responsive.
    model = await asyncio.to_thread(route_model, query)
    response = await asyncio.to_thread(
        openai_client.chat.completions.create,
        model=model,
        messages=[
            {"role": "system", "content": f"Context:\n{context}"},
            {"role": "user", "content": query},
        ],
    )
    return {
        "answer": response.choices[0].message.content,
        "model_used": model,
        # NOTE(review): calculate_cost is not defined in the visible source;
        # confirm it exists, or derive the cost from log_usage() instead.
        "cost_usd": calculate_cost(model, response.usage),
    }


# Expected savings assuming 60% of queries are simple:
- Before: 100% of queries at $0.009 each = $9/1,000 queries
- After: 60% at $0.0003 (mini) + 40% at $0.009 (gpt-4o) = $0.00018 + $0.0036 = $0.00378 per query, or about $3.78/1,000 queries
- Savings: approximately 58%
Lever 3: Prompt Compression
Long prompts cost money. For the document_summary endpoint with 3,200-token prompts, compressing to 1,500 tokens saves 53% of input cost on that endpoint.
import tiktoken
def compress_context(context: str, max_tokens: int = 1500) -> str:
    """Trim ``context`` so it fits within a token budget.

    The text is tokenized with the gpt-4o encoding and, if it exceeds the
    budget, cut to its first ``max_tokens`` tokens. Because only the head is
    kept, callers should place the most relevant chunks first.

    Args:
        context: Text to trim.
        max_tokens: Maximum number of tokens to keep; must be >= 0.

    Returns:
        ``context`` unchanged when already within budget, otherwise the
        decoded prefix of ``max_tokens`` tokens.

    Raises:
        ValueError: If ``max_tokens`` is negative. A negative slice bound
            would drop tokens from the end instead of enforcing a budget.
    """
    if max_tokens < 0:
        raise ValueError(f"max_tokens must be >= 0, got {max_tokens}")

    encoder = tiktoken.encoding_for_model("gpt-4o")
    token_ids = encoder.encode(context)
    if len(token_ids) <= max_tokens:
        return context
    # Truncate and decode
    return encoder.decode(token_ids[:max_tokens])
def compress_system_prompt(verbose_prompt: str) -> str:
    """Shorten a system prompt by swapping wordy phrases for terse ones.

    Each known verbose phrase is replaced, in the order listed below, by its
    concise equivalent. Matching is exact and case-sensitive; text that
    contains none of the phrases is returned unchanged.
    """
    phrase_swaps = (
        ("Please make sure that you always", "Always"),
        ("It is very important that you", "You must"),
        ("In your response, please ensure", "Ensure"),
        ("When providing your answer, you should", "Answer by"),
        ("You are a helpful AI assistant that", "You"),
    )
    compact = verbose_prompt
    for wordy, terse in phrase_swaps:
        compact = compact.replace(wordy, terse)
    return compact
def measure_compression_ratio(
    original: str,
    compressed: str,
    monthly_input_cost: "float | None" = None,
) -> dict:
    """Report how many tokens a compression step saved.

    Args:
        original: Text before compression.
        compressed: Text after compression.
        monthly_input_cost: Current monthly spend on input tokens in USD,
            used to turn the ratio into a dollar estimate. When omitted, a
            module-level ``monthly_input_cost`` is used if one exists,
            otherwise 0.0.

    Returns:
        A dict with ``original_tokens``, ``compressed_tokens``,
        ``savings_percent`` (a fraction: 0.5 means half the tokens were
        removed) and ``estimated_monthly_savings_usd``.
    """
    if monthly_input_cost is None:
        # The parameter shadows any module-level name, so look the global up
        # explicitly. This keeps old call sites working without raising
        # NameError when no such global was ever defined.
        monthly_input_cost = globals().get("monthly_input_cost", 0.0)

    encoder = tiktoken.encoding_for_model("gpt-4o")
    original_tokens = len(encoder.encode(original))
    compressed_tokens = len(encoder.encode(compressed))
    # An empty original has nothing to save; avoid dividing by zero.
    if original_tokens:
        savings_pct = (original_tokens - compressed_tokens) / original_tokens
    else:
        savings_pct = 0.0
    return {
        "original_tokens": original_tokens,
        "compressed_tokens": compressed_tokens,
        "savings_percent": savings_pct,
        "estimated_monthly_savings_usd": savings_pct * monthly_input_cost,
    }


# Lever 4: Batching Embedding Calls
Many teams embed documents one at a time. Azure OpenAI embedding API accepts up to 2,048 inputs per call. Batching reduces API overhead:
async def batch_embed_documents(texts: list[str], batch_size: int = 100) -> list[list[float]]:
    """Embed ``texts`` in batches to cut the number of API calls.

    With the default batch size, 58,000 single-item calls become 580
    batched calls (100x fewer).

    Args:
        texts: Documents to embed.
        batch_size: Number of texts sent per API call; must be >= 1.

    Returns:
        One embedding per input text, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is less than 1. A negative size would
            otherwise yield an empty result without embedding anything.
    """
    # Local import: asyncio is not among the module-level imports in view.
    import asyncio

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # The client call is synchronous; run it in a worker thread so this
        # coroutine does not block the event loop.
        response = await asyncio.to_thread(
            openai_client.embeddings.create,
            model="text-embedding-3-large",
            input=batch,
        )
        # Sort by `index` so each vector lines up with its input text.
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return all_embeddings


# Lever 5: Use Smaller Embedding Models for Less Critical Tasks
text-embedding-3-small costs about 6.5x less than text-embedding-3-large ($0.00002 vs. $0.00013 per 1k tokens) and performs well for most RAG workloads. Only use large embeddings when precision is critical.
# Maps a precision tier to the embedding model used for it.
# select_embedding_model() picks the tier from the document use case.
EMBEDDING_MODELS = {
"high_precision": "text-embedding-3-large", # $0.00013/1k tokens
"standard": "text-embedding-3-small", # $0.00002/1k tokens
}
def select_embedding_model(use_case: str) -> str:
    """Return the embedding model name appropriate for ``use_case``.

    Financial, legal and medical document use cases get the high-precision
    model; every other use case gets the standard one.
    """
    precision_critical = ("financial_reports", "legal_documents", "medical_records")
    tier = "high_precision" if use_case in precision_critical else "standard"
    return EMBEDDING_MODELS[tier]


# Putting It Together: Cost Dashboard
from datetime import datetime, timedelta
import sqlite3
def generate_cost_report(db_path: str) -> dict:
    """Summarize month-to-date LLM spend and savings from ``token_usage``.

    Args:
        db_path: Path to a SQLite database whose ``token_usage`` table has
            at least the columns ``timestamp``, ``estimated_cost_usd``,
            ``cache_hit``, ``model`` and ``estimated_cost_usd_if_gpt4o``.

    Returns:
        A dict with ``current_month_usd``, ``last_month_usd``,
        ``mom_change_pct``, ``cache_savings_usd``,
        ``model_routing_savings_usd`` and ``projected_monthly_usd``.
        Dollar amounts are rounded to 2 decimals, the percentage to 1.
    """
    # Local imports keep the block self-contained; the module-level imports
    # in view do not include calendar or timezone.
    import calendar
    from datetime import datetime, timezone

    def _scalar(connection, sql):
        # SUM()/AVG() over zero rows yield NULL; report that as 0.
        return connection.execute(sql).fetchone()[0] or 0

    conn = sqlite3.connect(db_path)
    try:
        this_month = _scalar(conn, """
            SELECT SUM(estimated_cost_usd) FROM token_usage
            WHERE timestamp >= date('now', 'start of month')
        """)
        last_month = _scalar(conn, """
            SELECT SUM(estimated_cost_usd) FROM token_usage
            WHERE timestamp >= date('now', 'start of month', '-1 month')
            AND timestamp < date('now', 'start of month')
        """)
        cache_savings = _scalar(conn, """
            SELECT COUNT(*) * AVG(estimated_cost_usd)
            FROM token_usage WHERE cache_hit = 1
        """)
        model_routing_savings = _scalar(conn, """
            SELECT SUM(estimated_cost_usd_if_gpt4o - estimated_cost_usd)
            FROM token_usage WHERE model = 'gpt-4o-mini'
        """)
    finally:
        # Release the connection even if a query fails (e.g. missing column).
        conn.close()

    # Only substitute a denominator when there is no prior spend at all;
    # clamping with max(last_month, 1) distorted the percentage whenever
    # last month's total was below one dollar.
    baseline = last_month if last_month > 0 else 1

    # SQLite's 'now' is UTC, so take the day of month in UTC as well, and
    # scale by the real month length instead of a fixed 30 days.
    today = datetime.now(timezone.utc)
    days_in_month = calendar.monthrange(today.year, today.month)[1]

    return {
        "current_month_usd": round(this_month, 2),
        "last_month_usd": round(last_month, 2),
        "mom_change_pct": round((this_month - last_month) / baseline * 100, 1),
        "cache_savings_usd": round(cache_savings, 2),
        "model_routing_savings_usd": round(model_routing_savings, 2),
        "projected_monthly_usd": round(this_month / today.day * days_in_month, 2),
    }


# Projected Savings Summary
Starting from $8,000/month:
| Lever | Estimated Savings | Confidence |
|---|---|---|
| Semantic cache (60% hit rate on RAG queries) | $1,800/month | High |
| Model routing (60% simple queries to mini) | $1,600/month | High |
| Prompt compression (document summary endpoint) | $700/month | Medium |
| Batch embedding calls | $200/month | High |
| Switch to text-embedding-3-small for standard use cases | $300/month | Medium |
Total projected savings: approximately $4,600/month, bringing the bill from $8,000 to approximately $3,400 — a 57.5% reduction, close to the target 60%.
The key principle: measure first, cut second. Without the token usage breakdown, you might compress prompts that account for 5% of costs while ignoring the model routing opportunity that accounts for 40%.