Learnixo

GenAI & LLM Interviews · Lesson 6 of 30

LLM Cost Breakdown: Tokens, Caching, Routing

How LLM Costs Work

API-based LLMs charge per token. Understanding the token economics is essential for production planning:

Python
# Approximate pricing (2025 — verify current pricing)
# Maps model name -> rate card. "input" and "output" are USD per million
# tokens; "context_window" is the context size in tokens.
# NOTE(review): these figures are a hand-maintained snapshot — confirm each
# entry against the provider's current price list before relying on it.
LLM_PRICING = {
    "gpt-4o": {
        "input": 2.50,      # $ per million tokens
        "output": 10.00,
        "context_window": 128_000,
    },
    "gpt-4o-mini": {
        "input": 0.15,
        "output": 0.60,
        "context_window": 128_000,
    },
    "claude-opus-4-7": {
        "input": 15.00,
        "output": 75.00,
        "context_window": 200_000,
    },
    "claude-sonnet-4-6": {
        "input": 3.00,
        "output": 15.00,
        "context_window": 200_000,
    },
    "claude-haiku-4-5": {
        "input": 0.80,
        "output": 4.00,
        "context_window": 200_000,
    },
    "gemini-1.5-pro": {
        "input": 3.50,
        "output": 10.50,
        "context_window": 1_000_000,
    },
}

def estimate_api_cost(
    model: str,
    input_tokens_per_request: int,
    output_tokens_per_request: int,
    requests_per_month: int,
) -> dict:
    """Project the monthly API bill for a steady workload.

    Args:
        model: Key into ``LLM_PRICING``.
        input_tokens_per_request: Prompt tokens sent with each request.
        output_tokens_per_request: Completion tokens returned per request.
        requests_per_month: Number of requests made in a month.

    Returns:
        A dict with the model name, request and token volumes as integers,
        and the input, output, and total costs as ``$``-prefixed strings
        rounded to two decimals.

    Raises:
        ValueError: If ``model`` has no entry in ``LLM_PRICING``.
    """
    rates = LLM_PRICING.get(model)
    if not rates:
        raise ValueError(f"Unknown model: {model}")

    # Scale the per-request token counts up to a whole month.
    tokens_in = input_tokens_per_request * requests_per_month
    tokens_out = output_tokens_per_request * requests_per_month

    # Rates are quoted per million tokens.
    cost_in = (tokens_in / 1_000_000) * rates["input"]
    cost_out = (tokens_out / 1_000_000) * rates["output"]
    cost_all = cost_in + cost_out

    summary = {"model": model, "monthly_requests": requests_per_month}
    summary["monthly_input_tokens"] = tokens_in
    summary["monthly_output_tokens"] = tokens_out
    summary["input_cost"] = f"${cost_in:.2f}"
    summary["output_cost"] = f"${cost_out:.2f}"
    summary["total_monthly_cost"] = f"${cost_all:.2f}"
    return summary


# Example: drug interaction chatbot
estimate = estimate_api_cost(
    model="gpt-4o",
    input_tokens_per_request=800,    # System prompt + user message
    output_tokens_per_request=300,   # Average response
    requests_per_month=100_000,
)
print(f"Drug interaction chatbot (gpt-4o): {estimate['total_monthly_cost']}/month")
# → $500.00/month for 100K requests
#   (80M input tokens × $2.50/M = $200 + 30M output tokens × $10.00/M = $300)

Token Counting

Accurate token counting before API calls helps predict costs:

Python
import tiktoken
from transformers import AutoTokenizer

def count_tokens_openai(text: str, model: str = "gpt-4o") -> int:
    """Count tokens for OpenAI models using tiktoken.

    Args:
        text: The text to tokenize.
        model: OpenAI model name used to select the encoding.

    Returns:
        Number of tokens ``text`` encodes to.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for
        # (e.g. a newly released model). Fall back to the encoding used by
        # the gpt-4o family so cost estimates still work, approximately.
        encoding = tiktoken.get_encoding("o200k_base")

    # By default `encode` raises ValueError when the text contains a
    # special-token string such as "<|endoftext|>". For counting arbitrary
    # user text we want those treated as ordinary text instead.
    return len(encoding.encode(text, disallowed_special=()))

# Tokenizers already loaded by count_tokens_huggingface, keyed by model id.
_HF_TOKENIZER_CACHE: dict = {}


def count_tokens_huggingface(text: str, model_id: str) -> int:
    """Count tokens for HuggingFace models.

    The tokenizer for each ``model_id`` is loaded once and reused, so
    repeated calls do not pay the ``from_pretrained`` loading cost again.

    Args:
        text: The text to tokenize.
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        Length of the id sequence returned by ``tokenizer.encode(text)``.
        NOTE(review): depending on the tokenizer's defaults this may include
        special tokens — confirm whether they should count toward cost.
    """
    tokenizer = _HF_TOKENIZER_CACHE.get(model_id)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        _HF_TOKENIZER_CACHE[model_id] = tokenizer
    return len(tokenizer.encode(text))

def estimate_conversation_tokens(messages: list[dict], model: str = "gpt-4o") -> dict:
    """
    Estimate token count for a full conversation, including message formatting overhead.
    OpenAI adds overhead tokens for each message in the format.

    Args:
        messages: Chat messages as dicts; the "role" and "content" values of
            each are tokenized. Missing or ``None`` values count as zero
            tokens (tool-call messages carry ``content=None``).
        model: Model name used to select the tiktoken encoding.

    Returns:
        A dict with "total_tokens" (int) and "estimated_cost_gpt4o", the
        total priced at a fixed $2.50 per million tokens regardless of
        ``model``, formatted as a ``$`` string with four decimals.
    """
    encoding = tiktoken.encoding_for_model(model)

    # 3 tokens overhead per message + 3 for the reply primer
    overhead_per_message = 3
    reply_primer = 3

    total = reply_primer
    for message in messages:
        total += overhead_per_message
        # `.get(key, "")` still returns None when the key is present with a
        # None value, and encode(None) raises TypeError — coerce to "".
        total += len(encoding.encode(message.get("content") or ""))
        total += len(encoding.encode(message.get("role") or ""))

    return {
        "total_tokens": total,
        "estimated_cost_gpt4o": f"${total / 1_000_000 * 2.50:.4f}",
    }

Strategy 1: Model Routing

Route requests to cheaper models when they're sufficient:

Python
from openai import OpenAI
from enum import Enum

# Module-level OpenAI client shared by the chat-completion helpers in this file.
client = OpenAI()

class ComplexityLevel(str, Enum):
    """Difficulty tier assigned to a query; values are the lowercase labels."""

    # Factual lookup, simple Q&A
    SIMPLE = "simple"

    # Multi-step reasoning
    MODERATE = "moderate"

    # Clinical judgment, nuanced analysis
    COMPLEX = "complex"

def classify_request_complexity(query: str) -> ComplexityLevel:
    """Classify request complexity using a cheap model.

    Sends ``query`` to ``gpt-4o-mini`` with instructions to answer with a
    single label, then maps that label onto ``ComplexityLevel``.

    Args:
        query: The user's pharmacology question.

    Returns:
        The matching ``ComplexityLevel``; ``ComplexityLevel.MODERATE`` when
        the reply is empty or is not one of the known labels.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # Cheap classifier
        messages=[
            {
                "role": "system",
                "content": """Classify the complexity of this pharmacology question.

simple: Single factual lookup (drug name, basic interaction yes/no, simple definition)
moderate: Requires multi-step reasoning or synthesis of multiple facts
complex: Requires clinical judgment, patient-specific analysis, or complex pharmacokinetics

Return ONLY: simple, moderate, or complex""",
            },
            {"role": "user", "content": query},
        ],
        max_tokens=10,
        temperature=0,
    )

    # The message content can be None; treat that as an unrecognized label
    # instead of crashing on `.strip()`.
    raw_label = response.choices[0].message.content or ""
    # Tolerate decoration such as `Simple.` or `"simple"` around the label.
    label = raw_label.strip().strip(".,!\"'`").lower()

    try:
        return ComplexityLevel(label)
    except ValueError:
        # Unknown label: take the middle tier as a safe default.
        return ComplexityLevel.MODERATE


# Routing table: complexity tier -> model name to answer with.
# NOTE: the tiers span providers (the COMPLEX tier is a Claude model), so a
# caller must send each model name to the matching provider's SDK.
MODEL_FOR_COMPLEXITY = {
    ComplexityLevel.SIMPLE: "gpt-4o-mini",     # $0.15/M input
    ComplexityLevel.MODERATE: "gpt-4o",         # $2.50/M input
    ComplexityLevel.COMPLEX: "claude-sonnet-4-6", # $3.00/M input
}

# Lazily created Anthropic client, reused across routed_query calls.
_ANTHROPIC_ROUTING_CLIENT = None


def _anthropic_routing_client():
    """Return the shared Anthropic client, creating it on first use."""
    global _ANTHROPIC_ROUTING_CLIENT
    if _ANTHROPIC_ROUTING_CLIENT is None:
        import anthropic  # local import keeps this snippet self-contained

        _ANTHROPIC_ROUTING_CLIENT = anthropic.Anthropic()
    return _ANTHROPIC_ROUTING_CLIENT


def routed_query(query: str) -> dict:
    """Route query to appropriate model based on complexity.

    The query is classified first, the model for that tier is looked up in
    ``MODEL_FOR_COMPLEXITY``, and the request is sent through the SDK of the
    provider that serves that model. Claude model names cannot be sent to
    the OpenAI client, so they are dispatched to the Anthropic client.

    Args:
        query: The user's question.

    Returns:
        A dict with "complexity" (the tier's string value), "model_used",
        "response" (the answer text), and "tokens_used" (input + output
        tokens reported by the provider).
    """
    complexity = classify_request_complexity(query)
    model = MODEL_FOR_COMPLEXITY[complexity]
    messages = [{"role": "user", "content": query}]

    if model.startswith("claude"):
        reply = _anthropic_routing_client().messages.create(
            model=model,
            max_tokens=1024,  # required by the Anthropic Messages API
            temperature=0,
            messages=messages,
        )
        # Anthropic returns a list of content blocks; keep the text ones.
        answer = "".join(part.text for part in reply.content if part.type == "text")
        tokens_used = reply.usage.input_tokens + reply.usage.output_tokens
    else:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0,
        )
        answer = response.choices[0].message.content
        tokens_used = response.usage.total_tokens

    return {
        "complexity": complexity.value,
        "model_used": model,
        "response": answer,
        "tokens_used": tokens_used,
    }

Strategy 2: Prompt Caching

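Prefix caching stores the computed KV cache for a repeated prompt prefix (such as a long system prompt), so later requests that share the prefix do not pay to recompute it: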

Python
# OpenAI prompt caching (automatically applied for prompts over 1024 tokens)
# Claude supports explicit cache control

import anthropic

# Module-level Anthropic client used by cached_clinical_query.
claude_client = anthropic.Anthropic()

# Large system prompt that stays constant across many requests
# `*` binds tighter than `+`, so only the bracketed placeholder block is
# repeated 50 times; the "You are a clinical pharmacist..." intro appears once.
LARGE_CLINICAL_SYSTEM = """You are a clinical pharmacist...""" + """
[Extensive clinical knowledge base, drug monographs, dosing tables, etc.
This would be thousands of tokens in production]
""" * 50  # Simulating a large static system prompt

def cached_clinical_query(question: str) -> str:
    """Ask Claude a question with the large system prompt marked cacheable.

    Args:
        question: The user's question, sent as the single user message.

    Returns:
        The text of the first content block in the reply.

    Side effects:
        Prints the cache-read and cache-creation token counts reported in
        the response usage.
    """
    # The static system prompt is tagged so the provider can cache it.
    cacheable_system_block = {
        "type": "text",
        "text": LARGE_CLINICAL_SYSTEM,
        "cache_control": {"type": "ephemeral"},  # Cache this block
    }
    user_turn = {"role": "user", "content": question}

    reply = claude_client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=[cacheable_system_block],
        messages=[user_turn],
    )

    # Report how much of the prompt was served from / written to the cache.
    token_usage = reply.usage
    print(f"Cache read: {token_usage.cache_read_input_tokens} tokens (saved!)")
    print(f"Cache write: {token_usage.cache_creation_input_tokens} tokens (new cache)")

    first_block = reply.content[0]
    return first_block.text

# First call: pays a cache-write premium (about 1.25x the normal input price) to create the cache
# Subsequent calls within 5 minutes: cache_read_input_tokens priced at 10% of normal
# Savings: up to 90% on cached tokens for repeated requests with same system prompt

Strategy 3: Response Caching

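Cache LLM responses so that repeated queries skip the API call entirely. The implementation below matches identical requests only (an exact hash of model, messages, and temperature); serving semantically similar queries would additionally require an embedding-similarity lookup: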

Python
import hashlib
import json
import redis
from datetime import timedelta

class LLMResponseCache:
    """Cache LLM responses to avoid redundant API calls."""

    def __init__(self, redis_url: str = "redis://localhost:6379", ttl_hours: int = 24):
        self.cache = redis.from_url(redis_url)
        self.ttl = timedelta(hours=ttl_hours)
        self.hits = 0
        self.misses = 0

    def _cache_key(self, model: str, messages: list[dict], temperature: float) -> str:
        """Create deterministic cache key from request parameters."""
        content = json.dumps({
            "model": model,
            "messages": messages,
            "temperature": temperature,
        }, sort_keys=True)
        return f"llm_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    def get(self, model: str, messages: list[dict], temperature: float = 0) -> str | None:
        """Retrieve cached response if available."""
        if temperature > 0:
            return None  # Don't cache non-deterministic responses

        key = self._cache_key(model, messages, temperature)
        cached = self.cache.get(key)

        if cached:
            self.hits += 1
            return json.loads(cached)["response"]

        self.misses += 1
        return None

    def set(self, model: str, messages: list[dict], temperature: float, response: str) -> None:
        """Cache a response."""
        if temperature > 0:
            return  # Don't cache non-deterministic responses

        key = self._cache_key(model, messages, temperature)
        value = json.dumps({"response": response, "model": model})
        self.cache.setex(key, self.ttl, value)

    def get_stats(self) -> dict:
        total = self.hits + self.misses
        return {
            "hit_rate": self.hits / total if total > 0 else 0,
            "total_requests": total,
            "estimated_savings": f"${self.hits * 0.003:.2f}",  # Rough estimate
        }


# Module-level response cache built with the class defaults
# (redis://localhost:6379, 24-hour TTL).
cache = LLMResponseCache()

def cached_api_call(model: str, messages: list[dict], temperature: float = 0) -> str:
    """LLM call with caching.

    Looks the request up in the module-level ``cache`` first and only calls
    the API on a miss, storing the new response afterwards.

    Args:
        model: Model name passed to the chat-completions API.
        messages: Chat messages for the request.
        temperature: Sampling temperature, forwarded to the API and to the
            cache lookup and store.

    Returns:
        The response text, either cached or freshly generated.
    """
    # Check cache first. Compare against None so that a cached empty-string
    # response is still served from the cache instead of re-calling the API.
    cached_response = cache.get(model, messages, temperature)
    if cached_response is not None:
        return cached_response

    # Make API call
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    result = response.choices[0].message.content

    # Cache the result. A None content is not stored: it would be read back
    # as None, i.e. counted as a hit by the cache yet treated as a miss here.
    if result is not None:
        cache.set(model, messages, temperature, result)
    return result

Strategy 4: Output Truncation and Streaming

Pay for fewer output tokens by capping the response length:

Python
def cost_optimized_response(
    query: str,
    max_tokens: int = 150,  # Limit response length
    model: str = "gpt-4o-mini",
) -> str:
    """Answer a query while keeping the billed output short.

    The output is limited twice: the system prompt asks for an answer under
    ``max_tokens // 4`` words, and ``max_tokens`` is passed to the API call
    as the completion cap.

    Args:
        query: The user's question.
        max_tokens: Completion cap forwarded to the API call.
        model: Model name to call.

    Returns:
        The content of the first choice in the completion.
    """
    word_budget = max_tokens // 4
    brevity_instruction = {
        "role": "system",
        "content": f"Answer concisely in under {word_budget} words.",
    }
    user_turn = {"role": "user", "content": query}

    completion = client.chat.completions.create(
        model=model,
        messages=[brevity_instruction, user_turn],
        max_tokens=max_tokens,
        temperature=0,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


def stream_response(query: str, model: str = "gpt-4o") -> None:
    """Stream response to reduce time-to-first-token (not a cost saving, but better UX).

    Prints each piece of generated text as it arrives, followed by a final
    newline.

    Args:
        query: The user's question.
        model: Model name to call.
    """
    # `create(stream=True)` yields raw completion chunks that expose
    # `.choices[0].delta.content`, which is what the loop below reads. The
    # `.stream()` helper yields higher-level event objects instead.
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": query}],
        stream=True,
    )
    for chunk in stream:
        # Some chunks (e.g. a trailing usage-only chunk) carry no choices.
        if not chunk.choices:
            continue
        piece = chunk.choices[0].delta.content
        if piece:
            print(piece, end="", flush=True)
    print()  # Newline at end

Cost Monitoring Dashboard

Python
from datetime import datetime

class CostMonitor:
    """Track and alert on LLM API spending.

    Spend and the request log are kept per calendar day (local time): the
    first call made on a new day resets both, so the daily budget check and
    the breakdown always describe the current day only.
    """

    def __init__(self, daily_budget_usd: float = 100.0):
        """Create a monitor.

        Args:
            daily_budget_usd: Daily budget in USD; a warning is printed once
                spend passes 80% of it.
        """
        self.daily_budget = daily_budget_usd
        self.daily_spend = 0.0
        self.request_log = []
        # Calendar day the counters above belong to.
        self._current_day = datetime.now().date()

    def _roll_over_if_new_day(self) -> None:
        """Reset the daily counters when the calendar day has changed."""
        today = datetime.now().date()
        if today != self._current_day:
            self._current_day = today
            self.daily_spend = 0.0
            # Rebind rather than clear so a list a caller kept is untouched.
            self.request_log = []

    def record_request(
        self,
        model: str,
        input_tokens: int,
        output_tokens: int,
        feature: str = "unknown",
    ) -> dict:
        """Record one request and add its cost to today's spend.

        Args:
            model: Key into ``LLM_PRICING``; unknown models are priced at $0.
            input_tokens: Prompt tokens used by the request.
            output_tokens: Completion tokens used by the request.
            feature: Label used to group costs in the breakdown.

        Returns:
            A dict with the request's "cost" and the running "daily_total".
        """
        self._roll_over_if_new_day()

        # Unknown models fall back to zero rates, so they are logged but
        # contribute nothing to spend.
        pricing = LLM_PRICING.get(model, {"input": 0, "output": 0})
        cost = (
            input_tokens / 1_000_000 * pricing["input"] +
            output_tokens / 1_000_000 * pricing["output"]
        )

        self.daily_spend += cost
        self.request_log.append({
            "timestamp": datetime.now().isoformat(),
            "model": model,
            "feature": feature,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cost": cost,
        })

        if self.daily_spend > self.daily_budget * 0.8:
            print(f"WARNING: Daily spend ${self.daily_spend:.2f} exceeds 80% of ${self.daily_budget} budget")

        return {"cost": cost, "daily_total": self.daily_spend}

    def get_daily_breakdown(self) -> dict:
        """Breakdown costs by feature.

        Returns:
            A dict with "total_daily_spend" and "budget_remaining" as
            ``$`` strings with four decimals, and "by_feature" mapping each
            feature to its summed "cost", "requests", and "tokens".
        """
        self._roll_over_if_new_day()

        by_feature = {}
        for req in self.request_log:
            bucket = by_feature.setdefault(
                req["feature"], {"cost": 0.0, "requests": 0, "tokens": 0}
            )
            bucket["cost"] += req["cost"]
            bucket["requests"] += 1
            bucket["tokens"] += req["input_tokens"] + req["output_tokens"]

        return {
            "total_daily_spend": f"${self.daily_spend:.4f}",
            "budget_remaining": f"${self.daily_budget - self.daily_spend:.4f}",
            "by_feature": by_feature,
        }