.NET & C# Development · Lesson 172 of 229
AI in Production — Cost Control, Caching, and Fallback Models
AI in Production — Cost Control, Caching, and Fallback Models
At scale, AI API costs compound fast: 1M requests/month at $0.01/request = $10K/month. Production AI needs caching, cost tracking, prompt optimisation, and fallback strategies — not just correct outputs.
The Cost Problem
gpt-4o pricing (approximate):
Input: $2.50 per 1M tokens
Output: $10.00 per 1M tokens
A 500-token prompt + 200-token response = $0.00325 per call
At 100k calls/day = $325/day = ~$10k/month
Cost levers:
1. Caching identical or near-identical prompts (biggest win)
2. Use smaller models where quality is sufficient
3. Compress/summarise prompts before sending
4. Batching (offline processing) vs real-time
5. Set max_tokens to prevent runaway outputs
Pattern 1: Semantic Response Cache
// Cache responses for semantically similar queries — not just exact matches
public class SemanticCache(
IEmbeddingGenerator<string, Embedding<float>> embedder,
IDistributedCache cache,
IChatClient inner)
: DelegatingChatClient(inner)
{
private readonly List<(float[] Embedding, string Key)> _index = [];
private readonly SemaphoreSlim _lock = new(1, 1);
private const double SimilarityThreshold = 0.95;
/// <summary>
/// Returns a cached completion when the latest user message is semantically close to one
/// seen before; otherwise calls the inner client and caches its response for 24 hours.
/// </summary>
/// <param name="messages">Conversation to complete. Only the text of the last user message is used for the lookup.</param>
/// <param name="options">Options forwarded to the inner client on a cache miss.</param>
/// <param name="ct">Token used to cancel embedding, cache and model calls.</param>
/// <returns>The cached completion on a hit, or the inner client's completion on a miss.</returns>
public override async Task<ChatCompletion> CompleteAsync(
    IList<ChatMessage> messages,
    ChatOptions? options = null,
    CancellationToken ct = default)
{
    // NOTE(review): the lookup ignores earlier turns and `options`, so two conversations that
    // end with a similar user message share a cached answer — confirm this is acceptable.
    var userMessage = messages.LastOrDefault(m => m.Role == ChatRole.User)?.Text;
    if (userMessage is null)
    {
        return await base.CompleteAsync(messages, options, ct);
    }

    // 1. Generate embedding for the query
    var queryResult = await embedder.GenerateAsync([userMessage], cancellationToken: ct);
    var queryEmbedding = queryResult[0].Vector.ToArray();

    // 2. Check for a semantically similar cached response.
    // Other requests append to the index concurrently, so the scan must hold the same lock
    // as the writers; enumerating a List<T> while it is modified throws.
    string? cachedKey;
    await _lock.WaitAsync(ct);
    try
    {
        cachedKey = FindSimilarCachedKey(queryEmbedding);
    }
    finally
    {
        _lock.Release();
    }

    if (cachedKey is not null)
    {
        var cachedJson = await cache.GetStringAsync(cachedKey, ct);
        if (cachedJson is not null
            && JsonSerializer.Deserialize<ChatCompletion>(cachedJson) is { } cachedResponse)
        {
            return cachedResponse;
        }

        // The cache entry has expired or is unusable: drop its index row so the index
        // does not grow without bound and keep matching a dead key.
        await _lock.WaitAsync(ct);
        try
        {
            _index.RemoveAll(entry => entry.Key == cachedKey);
        }
        finally
        {
            _lock.Release();
        }
    }

    // 3. Cache miss — call the model
    var response = await base.CompleteAsync(messages, options, ct);

    // 4. Store in cache with embedding index
    var cacheKey = $"ai:semantic:{Guid.NewGuid():N}";
    await cache.SetStringAsync(
        cacheKey,
        JsonSerializer.Serialize(response),
        new DistributedCacheEntryOptions { AbsoluteExpirationRelativeToNow = TimeSpan.FromHours(24) },
        ct);

    await _lock.WaitAsync(ct);
    try
    {
        _index.Add((queryEmbedding, cacheKey));
    }
    finally
    {
        _lock.Release();
    }

    return response;
}
/// <summary>
/// Scans the in-memory index for the entry whose embedding is most similar to
/// <paramref name="query"/>, considering only scores strictly above the similarity threshold.
/// </summary>
/// <param name="query">Embedding of the incoming user message.</param>
/// <returns>The cache key of the best-scoring entry, or <c>null</c> when no entry beats the threshold.</returns>
private string? FindSimilarCachedKey(float[] query)
{
    string? winner = null;
    var topScore = SimilarityThreshold;

    foreach (var entry in _index)
    {
        var score = CosineSimilarity(query, entry.Embedding);

        // Strict comparison: on equal scores the earliest entry is kept.
        if (score > topScore)
        {
            (topScore, winner) = (score, entry.Key);
        }
    }

    return winner;
}
/// <summary>
/// Computes the cosine similarity of two vectors: their dot product divided by the product
/// of their Euclidean norms.
/// </summary>
/// <param name="a">First vector.</param>
/// <param name="b">Second vector; must have the same length as <paramref name="a"/>.</param>
/// <returns>
/// The cosine similarity, or 0 when either vector has zero magnitude (the ratio is
/// undefined there and would otherwise be NaN).
/// </returns>
/// <exception cref="ArgumentException">The vectors have different lengths.</exception>
private static double CosineSimilarity(float[] a, float[] b)
{
    if (a.Length != b.Length)
    {
        throw new ArgumentException(
            $"Embedding dimensions differ ({a.Length} vs {b.Length}).", nameof(b));
    }

    double dot = 0, normA = 0, normB = 0;
    for (var i = 0; i < a.Length; i++)
    {
        // Widen before multiplying so the products are accumulated at double precision.
        double x = a[i];
        double y = b[i];
        dot += x * y;
        normA += x * x;
        normB += y * y;
    }

    var denominator = Math.Sqrt(normA) * Math.Sqrt(normB);

    // Exact comparison is intentional: only a true zero makes the division undefined.
    return denominator == 0 ? 0 : dot / denominator;
}
}
Pattern 2: Tiered Model Routing
// Route to cheaper models for simple tasks, expensive ones for complex
/// <summary>
/// Chat client that forwards each request to one of three underlying clients, chosen by an
/// explicit <c>model_tier</c> option or, failing that, by simple keyword and length
/// heuristics on the last user message.
/// </summary>
public class TieredChatClient(
    IChatClient cheap,      // e.g. gpt-4o-mini
    IChatClient standard,   // e.g. gpt-4o
    IChatClient premium)    // e.g. o3
    : IChatClient
{
    /// <summary>Reports the metadata of the standard-tier client.</summary>
    public ChatClientMetadata Metadata => standard.Metadata;

    /// <summary>This router exposes no services; always returns <c>null</c>.</summary>
    public TService? GetService<TService>(object? key = null) where TService : class => null;

    /// <summary>No-op: the wrapped clients are not disposed by this class.</summary>
    public void Dispose() { }

    /// <summary>Completes the conversation using the client selected for this request.</summary>
    /// <param name="messages">Conversation to complete.</param>
    /// <param name="options">Optional settings; may carry a <c>model_tier</c> override in <c>AdditionalProperties</c>.</param>
    /// <param name="ct">Token forwarded to the selected client.</param>
    /// <returns>The completion produced by the selected client.</returns>
    public async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> messages,
        ChatOptions? options = null,
        CancellationToken ct = default)
        => await SelectClient(messages, options).CompleteAsync(messages, options, ct);

    /// <summary>
    /// Streams the completion from the client selected for this request, using the same
    /// tier rules as <c>CompleteAsync</c>.
    /// </summary>
    public IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(
        IList<ChatMessage> messages, ChatOptions? options = null, CancellationToken ct = default)
        => SelectClient(messages, options).CompleteStreamingAsync(messages, options, ct);

    /// <summary>Maps the tier chosen for a request to the client that serves it.</summary>
    private IChatClient SelectClient(IList<ChatMessage> messages, ChatOptions? options)
        => DetermineTier(messages, options) switch
        {
            ModelTier.Cheap => cheap,
            ModelTier.Premium => premium,
            _ => standard,
        };

    /// <summary>
    /// Picks a tier: a valid explicit override wins; otherwise short classification prompts go
    /// to the cheap tier, prompts that look complex go to premium, and the rest to standard.
    /// </summary>
    private static ModelTier DetermineTier(IList<ChatMessage> messages, ChatOptions? options)
    {
        // Explicit override from caller
        if (TryGetTierOverride(options, out var requested))
        {
            return requested;
        }

        var userMessage = messages.LastOrDefault(m => m.Role == ChatRole.User)?.Text ?? "";

        // Simple classification/extraction → cheap
        if (userMessage.Length < 200 && IsClassificationTask(userMessage))
        {
            return ModelTier.Cheap;
        }

        // Complex reasoning, code generation → premium
        if (IsComplexTask(userMessage))
        {
            return ModelTier.Premium;
        }

        return ModelTier.Standard;
    }

    /// <summary>
    /// Reads the optional <c>model_tier</c> entry from the request options. Accepts a
    /// <c>ModelTier</c> value or its name in any casing; a missing, null or unrecognised
    /// value yields <c>false</c> so the caller falls back to the heuristics instead of failing.
    /// </summary>
    private static bool TryGetTierOverride(ChatOptions? options, out ModelTier tier)
    {
        tier = default;

        if (options?.AdditionalProperties is not { } properties
            || !properties.TryGetValue("model_tier", out var raw)
            || raw is null)
        {
            return false;
        }

        if (raw is ModelTier typed)
        {
            tier = typed;
            return Enum.IsDefined(typed);
        }

        // TryParse also accepts numeric strings, so reject values outside the enum.
        return Enum.TryParse(raw.ToString(), ignoreCase: true, out tier) && Enum.IsDefined(tier);
    }

    /// <summary>True when the message contains one of the classification keywords.</summary>
    private static bool IsClassificationTask(string msg)
        => msg.Contains("classify", StringComparison.OrdinalIgnoreCase)
        || msg.Contains("categorise", StringComparison.OrdinalIgnoreCase)
        || msg.Contains("yes or no", StringComparison.OrdinalIgnoreCase);

    /// <summary>True when the message mentions architecture or design, or exceeds 2000 characters.</summary>
    private static bool IsComplexTask(string msg)
        => msg.Contains("architecture", StringComparison.OrdinalIgnoreCase)
        || msg.Contains("design", StringComparison.OrdinalIgnoreCase)
        || msg.Length > 2000;
}
public enum ModelTier { Cheap, Standard, Premium }
Pattern 3: Fallback Chain
// Try primary provider, fall back to secondary on failure
/// <summary>
/// Chat client that sends each non-streaming request to a primary client and retries it once
/// on a fallback client when the primary fails with a transport-level error.
/// </summary>
public class FallbackChatClient(
    IChatClient primary,
    IChatClient fallback,
    ILogger<FallbackChatClient> logger)
    : IChatClient
{
    /// <summary>Reports the metadata of the primary client.</summary>
    public ChatClientMetadata Metadata => primary.Metadata;

    /// <summary>This wrapper exposes no services; always returns <c>null</c>.</summary>
    public TService? GetService<TService>(object? key = null) where TService : class => null;

    /// <summary>No-op: the wrapped clients are not disposed by this class.</summary>
    public void Dispose() { }

    /// <summary>
    /// Completes the conversation on the primary client, switching to the fallback client
    /// when the primary throws an error classified as retriable.
    /// </summary>
    /// <param name="messages">Conversation to complete.</param>
    /// <param name="options">Options passed unchanged to whichever client serves the request.</param>
    /// <param name="ct">Token used to cancel the request.</param>
    /// <returns>The completion from the primary client, or from the fallback after a retriable failure.</returns>
    public async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> messages,
        ChatOptions? options = null,
        CancellationToken ct = default)
    {
        try
        {
            return await primary.CompleteAsync(messages, options, ct);
        }
        // If the caller has already cancelled there is no point starting a second request.
        catch (Exception ex) when (!ct.IsCancellationRequested && IsRetriable(ex))
        {
            logger.LogWarning(ex, "Primary AI provider failed, falling back");
            return await fallback.CompleteAsync(messages, options, ct);
        }
    }

    /// <summary>
    /// Classifies an exception as a transport failure worth retrying elsewhere: an HTTP
    /// request failure, a timeout, or an aggregate containing one of those.
    /// </summary>
    private static bool IsRetriable(Exception ex)
        => ex switch
        {
            HttpRequestException or TimeoutException => true,

            // HttpClient reports its own timeout as a cancellation that wraps a TimeoutException.
            OperationCanceledException { InnerException: TimeoutException } => true,

            AggregateException aggregate => aggregate.InnerExceptions.Any(IsRetriable),
            _ => false,
        };

    /// <summary>
    /// Streams from the primary client only; streaming requests are not retried on the
    /// fallback client.
    /// </summary>
    public IAsyncEnumerable<StreamingChatCompletionUpdate> CompleteStreamingAsync(
        IList<ChatMessage> messages, ChatOptions? options = null, CancellationToken ct = default)
        => primary.CompleteStreamingAsync(messages, options, ct);
}
// Registration
builder.Services.AddChatClient(services =>
{
var primary = new OpenAIClient(config["OpenAI:ApiKey"]!).AsChatClient("gpt-4o");
var fallback = new OllamaChatClient(new Uri("http://localhost:11434"), "llama3.2");
return new FallbackChatClient(primary, fallback,
services.GetRequiredService<ILogger<FallbackChatClient>>());
});
Pattern 4: Token Budget Tracking
// Track token usage per user/tenant for billing and limits
/// <summary>
/// Chat-client middleware that rejects non-streaming requests for tenants with no remaining
/// token budget and deducts the tokens reported by the inner client after each call.
/// Only <c>CompleteAsync</c> is overridden here.
/// </summary>
public class TokenBudgetMiddleware(
    IChatClient inner,
    ITokenBudgetService budgets,
    ILogger<TokenBudgetMiddleware> logger)
    : DelegatingChatClient(inner)
{
    /// <summary>
    /// Checks the tenant's remaining budget, forwards the request, then deducts the input and
    /// output tokens reported in the response usage.
    /// </summary>
    /// <param name="messages">Conversation to complete.</param>
    /// <param name="options">Options; the tenant is read from the <c>tenant_id</c> entry of <c>AdditionalProperties</c>.</param>
    /// <param name="ct">Token used to cancel budget and model calls.</param>
    /// <returns>The inner client's completion.</returns>
    /// <exception cref="TokenBudgetExceededException">The tenant has no remaining budget.</exception>
    public override async Task<ChatCompletion> CompleteAsync(
        IList<ChatMessage> messages,
        ChatOptions? options = null,
        CancellationToken ct = default)
    {
        var tenantId = ResolveTenantId(options);

        // Check budget before calling.
        // NOTE(review): the check and the later deduction are separate calls, so concurrent
        // requests for one tenant can overshoot unless the budget service guards against it.
        var remaining = await budgets.GetRemainingAsync(tenantId, ct);
        if (remaining <= 0)
        {
            throw new TokenBudgetExceededException($"Token budget exhausted for tenant {tenantId}");
        }

        // Estimate input tokens (rough: 1 token ≈ 4 characters). Advisory only: the request
        // is still sent when the estimate exceeds the remaining budget.
        var estimatedInput = messages.Sum(m => (m.Text?.Length ?? 0) / 4);
        if (estimatedInput > remaining)
        {
            logger.LogWarning("Tenant {TenantId} may exceed budget: estimated {Estimated} > remaining {Remaining}",
                tenantId, estimatedInput, remaining);
        }

        var response = await base.CompleteAsync(messages, options, ct);

        // Deduct actual tokens used
        if (response.Usage is { } usage)
        {
            // Default each count separately: `a + b ?? 0` parses as `(a + b) ?? 0`, which
            // deducts nothing whenever either count is missing.
            var totalTokens = (usage.InputTokenCount ?? 0) + (usage.OutputTokenCount ?? 0);

            await budgets.DeductAsync(tenantId, totalTokens, ct);

            logger.LogInformation(
                "Tenant {TenantId} used {Tokens} tokens. In:{In} Out:{Out}",
                tenantId,
                totalTokens,
                usage.InputTokenCount,
                usage.OutputTokenCount);
        }

        return response;
    }

    /// <summary>
    /// Reads the tenant identifier from the <c>tenant_id</c> additional property, returning
    /// <c>"default"</c> when the options, the property bag, the entry or its value is missing.
    /// </summary>
    private static string ResolveTenantId(ChatOptions? options)
        // The `out` variable is only read after TryGetValue has succeeded, so it is always assigned.
        => options?.AdditionalProperties is { } properties
           && properties.TryGetValue("tenant_id", out var id)
           && id?.ToString() is { } tenantId
            ? tenantId
            : "default";
}
public class TokenBudgetExceededException(string message) : Exception(message);
Pattern 5: Prompt Compression
// Summarise long context before sending to save tokens
public class PromptCompressor(IChatClient compressor)
{
// Threshold: compress conversation history when it exceeds this many tokens
private const int CompressionThreshold = 3000;
/// <summary>
/// Shrinks a long conversation by replacing all but the most recent non-system messages
/// with a single summary produced by the compressor client.
/// </summary>
/// <param name="messages">Full conversation, including any system messages.</param>
/// <param name="ct">Token used to cancel the summarisation call.</param>
/// <returns>
/// The original list when the history is under the threshold, has nothing old enough to
/// summarise, or the compressor returns a blank summary; otherwise a new list holding the
/// system messages, one system message with the summary, and the last four history messages.
/// </returns>
public async Task<List<ChatMessage>> CompressHistoryAsync(
    List<ChatMessage> messages,
    CancellationToken ct = default)
{
    // Number of most recent history messages kept verbatim.
    const int RecentMessagesToKeep = 4;

    var historyMessages = messages.Where(m => m.Role != ChatRole.System).ToList();

    // Rough estimate: 1 token ≈ 4 characters.
    var estimatedTokens = historyMessages.Sum(m => (m.Text?.Length ?? 0) / 4);
    if (estimatedTokens < CompressionThreshold)
    {
        return messages;
    }

    // Keep the most recent messages verbatim (recent context)
    var recentMessages = historyMessages.TakeLast(RecentMessagesToKeep).ToList();
    var oldMessages = historyMessages.SkipLast(RecentMessagesToKeep).ToList();
    if (oldMessages.Count == 0)
    {
        return messages;
    }

    // Summarise the old messages
    var summaryPrompt = $"""
        Summarise this conversation history concisely (max 200 words).
        Capture key facts, decisions, and context needed for continuing.
        {string.Join("\n", oldMessages.Select(m => $"{m.Role}: {m.Text}"))}
        """;
    var summaryResponse = await compressor.CompleteAsync(
        [new ChatMessage(ChatRole.User, summaryPrompt)], cancellationToken: ct);

    // A blank summary would silently discard every older message; keep the full history instead.
    var summary = summaryResponse.Message.Text;
    if (string.IsNullOrWhiteSpace(summary))
    {
        return messages;
    }

    // Build compressed message list: system messages, then the summary, then recent turns.
    List<ChatMessage> result =
    [
        .. messages.Where(m => m.Role == ChatRole.System),
        new ChatMessage(ChatRole.System, $"[Conversation summary: {summary}]"),
        .. recentMessages,
    ];
    return result;
}
}
Pattern 6: Cost Dashboard Endpoint
// Expose AI cost metrics via an endpoint for monitoring
/// <summary>
/// Immutable summary of AI token usage and estimated cost for one reporting period.
/// </summary>
/// <param name="Period">Label of the period covered (format is not defined here — TODO confirm with the producer).</param>
/// <param name="TotalTokensInput">Total input tokens in the period.</param>
/// <param name="TotalTokensOutput">Total output tokens in the period.</param>
/// <param name="EstimatedCostUsd">Estimated cost in US dollars.</param>
/// <param name="TokensByTenant">Token totals per key; presumably keyed by tenant ID — verify against the producer.</param>
/// <param name="TokensByModel">Token totals per key; presumably keyed by model name — verify against the producer.</param>
public record AiCostSummary(
string Period,
long TotalTokensInput,
long TotalTokensOutput,
decimal EstimatedCostUsd,
Dictionary<string, long> TokensByTenant,
Dictionary<string, long> TokensByModel);
app.MapGet("/admin/ai-costs", async (
ITokenBudgetService budgets,
CancellationToken ct) =>
{
var summary = await budgets.GetSummaryAsync(DateTime.UtcNow.AddDays(-1), DateTime.UtcNow, ct);
return Results.Ok(summary);
})
.RequireAuthorization("Admin");
Middleware Stack — Putting It Together
builder.Services.AddChatClient(services =>
{
// Inner-most: actual provider
var openAi = new OpenAIClient(config["OpenAI:ApiKey"]!).AsChatClient("gpt-4o");
// Fallback to Ollama when OpenAI is down
var withFallback = new FallbackChatClient(openAi,
new OllamaChatClient(new Uri("http://localhost:11434"), "llama3.2"),
services.GetRequiredService<ILogger<FallbackChatClient>>());
return withFallback;
})
// Exact-match caching in Redis (cheapest — check this first)
.UseDistributedCache()
// Token budget enforcement
.Use((inner, services) => new TokenBudgetMiddleware(inner,
services.GetRequiredService<ITokenBudgetService>(),
services.GetRequiredService<ILogger<TokenBudgetMiddleware>>()))
// Observability — log every request with token counts
.UseLogging()
.UseOpenTelemetry();
Interview Answer
"Production AI cost control has five layers. First, exact-match caching with UseDistributedCache() — identical prompts never hit the model twice. Second, semantic caching — embed the query and find near-identical cached responses (cosine similarity above 0.95). Third, model tiering — route simple classification to gpt-4o-mini (more than 10x cheaper per token than gpt-4o) and complex reasoning to the full model, with caller-overridable logic. Fourth, token budget middleware — track input+output tokens per tenant from UsageDetails, deduct from a Redis counter, reject when exhausted. Fifth, prompt compression — when conversation history exceeds a token threshold, summarise old messages before sending. Layer these as DelegatingChatClient middleware in the IChatClient pipeline so business code stays clean. For resilience: a FallbackChatClient tries primary, catches HttpRequestException or TimeoutException, and transparently retries on Ollama or a secondary API key. Monitor with OpenTelemetry — token counts, cost per request, and cache hit rates should be on your dashboard."