Learnixo
Back to blog
AI Systems · advanced

LLM Cost Breakdown and Optimization

How LLM costs are structured, how to estimate them, and practical strategies for reducing API costs without sacrificing quality in production systems.

Asma Hafeez Khan · May 16, 2026 · 7 min read
LLM · Cost Optimization · Caching · Model Selection · Production
Share:𝕏

How LLM Costs Work

API-based LLMs charge per token. Understanding the token economics is essential for production planning:

Python
# Approximate list prices (2025 — verify against each provider's current pricing).
# "input" and "output" are USD per million tokens; "context_window" is the
# model's context size (presumably in tokens).
LLM_PRICING = {
    "gpt-4o": {"input": 2.50, "output": 10.00, "context_window": 128_000},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60, "context_window": 128_000},
    "claude-opus-4-7": {"input": 15.00, "output": 75.00, "context_window": 200_000},
    "claude-sonnet-4-6": {"input": 3.00, "output": 15.00, "context_window": 200_000},
    "claude-haiku-4-5": {"input": 0.80, "output": 4.00, "context_window": 200_000},
    "gemini-1.5-pro": {"input": 3.50, "output": 10.50, "context_window": 1_000_000},
}

def estimate_api_cost(
    model: str,
    input_tokens_per_request: int,
    output_tokens_per_request: int,
    requests_per_month: int,
) -> dict:
    """Project one month of API spend for a model and a fixed per-request token profile.

    The model is looked up in ``LLM_PRICING`` (USD per million tokens). The
    per-request token counts are scaled by the monthly request volume, and
    input and output tokens are priced separately.

    Args:
        model: Key into ``LLM_PRICING``.
        input_tokens_per_request: Prompt tokens sent with each request.
        output_tokens_per_request: Completion tokens returned per request.
        requests_per_month: Number of requests in the month.

    Returns:
        A dict holding the model name, the monthly request and token totals,
        and the input, output and total cost formatted as ``"$x.xx"`` strings.

    Raises:
        ValueError: If ``model`` has no (non-empty) entry in ``LLM_PRICING``.
    """
    rates = LLM_PRICING.get(model)
    if not rates:
        raise ValueError(f"Unknown model: {model}")

    def as_dollars(amount: float) -> str:
        # Costs are reported as display strings rounded to cents.
        return f"${amount:.2f}"

    tokens_in = input_tokens_per_request * requests_per_month
    tokens_out = output_tokens_per_request * requests_per_month

    # Prices are quoted per million tokens, so convert the volume first.
    spend_in = (tokens_in / 1_000_000) * rates["input"]
    spend_out = (tokens_out / 1_000_000) * rates["output"]

    return {
        "model": model,
        "monthly_requests": requests_per_month,
        "monthly_input_tokens": tokens_in,
        "monthly_output_tokens": tokens_out,
        "input_cost": as_dollars(spend_in),
        "output_cost": as_dollars(spend_out),
        "total_monthly_cost": as_dollars(spend_in + spend_out),
    }


# Example: drug interaction chatbot
estimate = estimate_api_cost(
    model="gpt-4o",
    input_tokens_per_request=800,    # System prompt + user message
    output_tokens_per_request=300,   # Average response
    requests_per_month=100_000,
)
print(f"Drug interaction chatbot (gpt-4o): {estimate['total_monthly_cost']}/month")
# Prints $500.00/month for 100K requests:
#   input:  800 x 100K = 80M tokens x $2.50/M  = $200
#   output: 300 x 100K = 30M tokens x $10.00/M = $300

Token Counting

Accurate token counting before API calls helps predict costs:

Python
import tiktoken
from transformers import AutoTokenizer

def count_tokens_openai(text: str, model: str = "gpt-4o") -> int:
    """Count the tokens in ``text`` for an OpenAI model using tiktoken.

    Args:
        text: The string to tokenize.
        model: OpenAI model name used to select the encoding.

    Returns:
        The number of tokens the selected encoding produces for ``text``.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for
        # (for example a model newer than the installed tiktoken release).
        # Fall back to the gpt-4o encoding so the count degrades to an
        # approximation instead of crashing a cost estimate.
        encoding = tiktoken.get_encoding("o200k_base")
    return len(encoding.encode(text))

# Tokenizers that have already been loaded, keyed by model id. Loading a
# tokenizer is far more expensive than encoding one string, so each model id
# is loaded at most once per process.
_HF_TOKENIZERS: dict = {}


def count_tokens_huggingface(text: str, model_id: str) -> int:
    """Count the tokens in ``text`` using a HuggingFace tokenizer.

    The tokenizer for ``model_id`` is loaded on first use and reused on later
    calls instead of being re-created for every string.

    Args:
        text: The string to tokenize.
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Length of the id sequence returned by ``tokenizer.encode(text)``.
        NOTE(review): ``encode`` is called with its defaults, which for many
        tokenizers also adds special tokens — confirm that is the count wanted.
    """
    tokenizer = _HF_TOKENIZERS.get(model_id)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        _HF_TOKENIZERS[model_id] = tokenizer
    return len(tokenizer.encode(text))

def estimate_conversation_tokens(messages: list[dict], model: str = "gpt-4o") -> dict:
    """Estimate the prompt token count of a chat conversation.

    Adds a fixed formatting overhead per message and a reply primer on top of
    the encoded ``role`` and ``content`` of every message.

    Args:
        messages: Chat messages as dicts with ``"role"`` and ``"content"`` keys.
            A missing or ``None`` value for either key counts as zero tokens.
        model: OpenAI model name used to select the tiktoken encoding.

    Returns:
        A dict with ``"total_tokens"`` and ``"estimated_cost_gpt4o"``. The cost
        is always priced at $2.50 per million tokens, whatever ``model`` is.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Model name unknown to the installed tiktoken: approximate with the
        # gpt-4o encoding rather than failing the whole estimate.
        encoding = tiktoken.get_encoding("o200k_base")

    # 3 tokens overhead per message + 3 for the reply primer
    overhead_per_message = 3
    reply_primer = 3

    total = reply_primer
    for message in messages:
        total += overhead_per_message
        # `content` can be present but None (e.g. an assistant message that
        # only carries tool calls), so `.get(key, "")` is not enough here.
        total += len(encoding.encode(message.get("content") or ""))
        total += len(encoding.encode(message.get("role") or ""))

    return {
        "total_tokens": total,
        "estimated_cost_gpt4o": f"${total / 1_000_000 * 2.50:.4f}",
    }

Strategy 1: Model Routing

Route requests to cheaper models when they're sufficient:

Python
from openai import OpenAI
from enum import Enum

# Shared OpenAI client used by the routing, caching and streaming helpers in this article.
client = OpenAI()

class ComplexityLevel(str, Enum):
    """Complexity tiers used to choose which model answers a request.

    Subclasses ``str``, so each member compares equal to its plain string value.
    """

    SIMPLE = "simple"          # Factual lookup, simple Q&A
    MODERATE = "moderate"      # Multi-step reasoning
    COMPLEX = "complex"        # Clinical judgment, nuanced analysis

def classify_request_complexity(query: str) -> ComplexityLevel:
    """Classify a pharmacology question's complexity using a cheap model.

    Args:
        query: The user's question.

    Returns:
        The ``ComplexityLevel`` named by the classifier. Falls back to
        ``ComplexityLevel.MODERATE`` when the reply is empty or is not one of
        the three expected labels.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheap classifier
        messages=[
            {
                "role": "system",
                "content": """Classify the complexity of this pharmacology question.

simple: Single factual lookup (drug name, basic interaction yes/no, simple definition)
moderate: Requires multi-step reasoning or synthesis of multiple facts
complex: Requires clinical judgment, patient-specific analysis, or complex pharmacokinetics

Return ONLY: simple, moderate, or complex""",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=10,
        temperature=0,
    )

    # The message content is optional in the API response; treat a missing
    # reply as unclassifiable instead of crashing on `.strip()`.
    raw_label = response.choices[0].message.content or ""
    # Tolerate harmless decoration such as "Simple." or "`complex`".
    label = raw_label.strip().lower().strip(".!\"'` ")

    try:
        return ComplexityLevel(label)
    except ValueError:
        # Unrecognised label: take the middle tier as the safe default.
        return ComplexityLevel.MODERATE


# Maps each complexity tier to the model that should answer it.
# NOTE(review): the COMPLEX tier names an Anthropic model, so whatever consumes
# this table must send that tier through an Anthropic client, not the OpenAI one.
MODEL_FOR_COMPLEXITY = {
    ComplexityLevel.SIMPLE: "gpt-4o-mini",     # $0.15/M input
    ComplexityLevel.MODERATE: "gpt-4o",         # $2.50/M input
    ComplexityLevel.COMPLEX: "claude-sonnet-4-6", # $3.00/M input
}

def routed_query(query: str) -> dict:
    """Route a query to the model chosen for its complexity and return the answer.

    The query is first classified with ``classify_request_complexity`` and the
    model is taken from ``MODEL_FOR_COMPLEXITY``. Claude models are sent to the
    Anthropic Messages API; every other model goes through the OpenAI client.

    Args:
        query: The user's question.

    Returns:
        A dict with the ``"complexity"`` label, the ``"model_used"``, the
        ``"response"`` text and ``"tokens_used"`` (input plus output tokens).
    """
    complexity = classify_request_complexity(query)
    model = MODEL_FOR_COMPLEXITY[complexity]
    messages = [{"role": "user", "content": query}]

    if model.startswith("claude"):
        # Claude models are not served by the OpenAI API, so passing them to
        # the OpenAI client fails. Dispatch them to Anthropic instead.
        import anthropic

        reply = anthropic.Anthropic().messages.create(
            model=model,
            max_tokens=1024,  # The Messages API requires an explicit output cap
            messages=messages,
            temperature=0,
        )
        answer = reply.content[0].text
        # Anthropic reports input and output separately; sum them to match
        # OpenAI's total_tokens.
        tokens_used = reply.usage.input_tokens + reply.usage.output_tokens
    else:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0,
        )
        answer = response.choices[0].message.content
        tokens_used = response.usage.total_tokens

    return {
        "complexity": complexity.value,
        "model_used": model,
        "response": answer,
        "tokens_used": tokens_used,
    }

Strategy 2: Prompt Caching

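Prefix caching stores the computed KV cache for a repeated prompt prefix (typically a long system prompt), so later requests that start with the same prefix do not pay to recompute it: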

Python
# OpenAI prompt caching (automatically applied for prompts over 1024 tokens)
# Claude supports explicit cache control

import anthropic

# Anthropic client used by cached_clinical_query() below.
claude_client = anthropic.Anthropic()

# Stand-in for a big system prompt that is identical on every request.
_ROLE_PREAMBLE = "You are a clinical pharmacist..."
_KNOWLEDGE_BASE_STUB = """
[Extensive clinical knowledge base, drug monographs, dosing tables, etc.
This would be thousands of tokens in production]
"""
# The stub is repeated 50 times to simulate a large static system prompt.
LARGE_CLINICAL_SYSTEM = _ROLE_PREAMBLE + "".join([_KNOWLEDGE_BASE_STUB] * 50)

def cached_clinical_query(question: str) -> str:
    """Ask Claude a question with the large system prompt marked as cacheable.

    The system prompt is sent as a single text block tagged with an
    ``ephemeral`` ``cache_control`` entry. After the call, the cache-read and
    cache-write token counts from the response usage are printed.

    Args:
        question: The user's question, sent as the only message.

    Returns:
        The text of the first content block in Claude's reply.
    """
    cacheable_system_block = {
        "type": "text",
        "text": LARGE_CLINICAL_SYSTEM,
        "cache_control": {"type": "ephemeral"},  # Cache this block
    }
    user_turn = {"role": "user", "content": question}

    reply = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=[cacheable_system_block],
        messages=[user_turn],
    )

    # Report how much of the prompt was served from / written to the cache.
    token_stats = reply.usage
    print(f"Cache read: {token_stats.cache_read_input_tokens} tokens (saved!)")
    print(f"Cache write: {token_stats.cache_creation_input_tokens} tokens (new cache)")

    first_block = reply.content[0]
    return first_block.text

# First call: pays a premium to write the cache (5-minute cache writes are billed at about 1.25x the normal input price)
# Subsequent calls within 5 minutes: cache_read_input_tokens priced at 10% of normal
# Savings: up to 90% on cached tokens for repeated requests with same system prompt

Strategy 3: Response Caching

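Cache LLM responses for identical queries. The cache below is exact-match only; also catching semantically similar queries would require an embedding-based lookup on top of it: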

Python
import hashlib
import json
import redis
from datetime import timedelta

class LLMResponseCache:
    """Cache LLM responses to avoid redundant API calls."""

    def __init__(self, redis_url: str = "redis://localhost:6379", ttl_hours: int = 24):
        self.cache = redis.from_url(redis_url)
        self.ttl = timedelta(hours=ttl_hours)
        self.hits = 0
        self.misses = 0

    def _cache_key(self, model: str, messages: list[dict], temperature: float) -> str:
        """Create deterministic cache key from request parameters."""
        content = json.dumps({
            "model": model,
            "messages": messages,
            "temperature": temperature,
        }, sort_keys=True)
        return f"llm_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, model: str, messages: list[dict], temperature: float = 0) -> str | None:
        """Retrieve cached response if available."""
        if temperature > 0:
            return None  # Don't cache non-deterministic responses

        key = self._cache_key(model, messages, temperature)
        cached = self.cache.get(key)

        if cached:
            self.hits += 1
            return json.loads(cached)["response"]

        self.misses += 1
        return None

    def set(self, model: str, messages: list[dict], temperature: float, response: str) -> None:
        """Cache a response."""
        if temperature > 0:
            return  # Don't cache non-deterministic responses

        key = self._cache_key(model, messages, temperature)
        value = json.dumps({"response": response, "model": model})
        self.cache.setex(key, self.ttl, value)

    def get_stats(self) -> dict:
        total = self.hits + self.misses
        return {
            "hit_rate": self.hits / total if total > 0 else 0,
            "total_requests": total,
            "estimated_savings": f"${self.hits * 0.003:.2f}",  # Rough estimate
        }


# Module-level cache instance used by cached_api_call(); built with the defaults
# from LLMResponseCache.__init__ (redis://localhost:6379, 24-hour TTL).
cache = LLMResponseCache()

def cached_api_call(model: str, messages: list[dict], temperature: float = 0) -> str:
    """Return the model's reply for ``messages``, reusing a cached reply when one exists.

    Args:
        model: Model name passed to the chat completions API.
        messages: Chat messages for the request.
        temperature: Sampling temperature, also passed to the cache's
            ``get`` and ``set``.

    Returns:
        The reply text, either from the cache or from a fresh API call.
    """
    # Check cache first
    cached_response = cache.get(model, messages, temperature)
    # Compare against None rather than truthiness: a cached empty string is a
    # valid hit and must not trigger another paid API call.
    if cached_response is not None:
        return cached_response

    # Make API call
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    result = response.choices[0].message.content

    # Cache the result. A reply without text is skipped: stored as null it
    # would be indistinguishable from a miss on the next lookup.
    if result is not None:
        cache.set(model, messages, temperature, result)
    return result

Strategy 4: Output Truncation and Streaming

Pay for fewer output tokens by limiting response length:

Python
def cost_optimized_response(
    query: str,
    max_tokens: int = 150,  # Limit response length
    model: str = "gpt-4o-mini",
) -> str:
    """Answer ``query`` with a deliberately short, cheaper completion.

    Output length is limited twice: the system prompt asks for an answer under
    ``max_tokens // 4`` words, and ``max_tokens`` is passed to the API as a
    hard cap.

    Args:
        query: The user's question.
        max_tokens: Upper bound on completion tokens.
        model: Model name passed to the chat completions API.

    Returns:
        The content of the first choice in the response.
    """
    word_budget = max_tokens // 4
    brevity_instruction = {
        "role": "system",
        "content": f"Answer concisely in under {word_budget} words.",
    }
    user_turn = {"role": "user", "content": query}

    completion = client.chat.completions.create(
        model=model,
        messages=[brevity_instruction, user_turn],
        max_tokens=max_tokens,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def stream_response(query: str, model: str = "gpt-4o") -> None:
    """Stream response to reduce time-to-first-token (not a cost saving, but better UX).

    Prints each piece of the reply as soon as it arrives, then a final newline.

    Args:
        query: The user's question.
        model: Model name passed to the chat completions API.
    """
    # `create(stream=True)` yields raw completion chunks exposing
    # `choices[0].delta.content`. The `.stream()` helper yields event objects
    # instead, so reading `chunk.choices` from it fails.
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
        stream=True,
    )
    for chunk in stream:
        # Some chunks carry no choices, and the first/last delta may have no
        # text, so guard both before printing.
        if chunk.choices and chunk.choices[0].delta.content:
            print(chunk.choices[0].delta.content, end="", flush=True)
    print()  # Newline at end

Cost Monitoring Dashboard

Python
from datetime import datetime

class CostMonitor:
    """Accumulate per-request LLM spend in memory and warn near the daily budget.

    Every recorded request is priced from ``LLM_PRICING``, added to a running
    total and appended to ``request_log``. The total is never reset by this
    class.
    """

    def __init__(self, daily_budget_usd: float = 100.0):
        self.daily_budget = daily_budget_usd
        self.daily_spend = 0.0
        self.request_log = []

    def record_request(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        feature: str = "unknown",
    ) -> dict:
        """Price one request, log it, and print a warning past 80% of budget.

        Models missing from ``LLM_PRICING`` are priced at zero.

        Returns:
            A dict with this request's ``"cost"`` and the running
            ``"daily_total"``.
        """
        rates = LLM_PRICING.get(model, {"input": 0, "output": 0})
        # Prices are per million tokens.
        input_cost = input_tokens / 1_000_000 * rates["input"]
        output_cost = output_tokens / 1_000_000 * rates["output"]
        cost = input_cost + output_cost

        self.daily_spend += cost
        entry = {
            "timestamp": datetime.now().isoformat(),
            "model": model,
            "feature": feature,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost": cost,
        }
        self.request_log.append(entry)

        warn_threshold = self.daily_budget * 0.8
        if self.daily_spend > warn_threshold:
            print(f"WARNING: Daily spend ${self.daily_spend:.2f} exceeds 80% of ${self.daily_budget} budget")

        return {"cost": cost, "daily_total": self.daily_spend}

    def get_daily_breakdown(self) -> dict:
        """Aggregate the request log into cost, request and token totals per feature."""
        per_feature = {}
        for entry in self.request_log:
            bucket = per_feature.setdefault(
                entry["feature"], {"cost": 0.0, "requests": 0, "tokens": 0}
            )
            bucket["cost"] += entry["cost"]
            bucket["requests"] += 1
            bucket["tokens"] += entry["input_tokens"] + entry["output_tokens"]

        remaining = self.daily_budget - self.daily_spend
        return {
            "total_daily_spend": f"${self.daily_spend:.4f}",
            "budget_remaining": f"${remaining:.4f}",
            "by_feature": per_feature,
        }

Enjoyed this article?

Explore the AI Systems learning path for more.

Found this helpful?

Share:𝕏

Leave a comment

Have a question, correction, or just found this helpful? Leave a note below.