.NET & C# Development · Lesson 166 of 229
Streaming AI Responses in .NET — IAsyncEnumerable and Server-Sent Events
Streaming AI Responses in .NET — IAsyncEnumerable and Server-Sent Events
LLM responses can take 10–30 seconds for long outputs. Streaming sends tokens to the browser as they are generated — users see a response immediately instead of staring at a spinner. This guide wires streaming from the LLM all the way to the browser.
Why Streaming Matters
Without streaming:
User sends message → waits 15 seconds → full response appears
User experience: blank screen, no feedback, high perceived latency
With streaming:
User sends message → first token appears in < 1 second → words flow in
User experience: feels instant, like watching someone type
Step 1: Streaming from IChatClient
// Microsoft.Extensions.AI — CompleteStreamingAsync returns IAsyncEnumerable
/// <summary>
/// Streams assistant replies from the injected <see cref="IChatClient"/> as text chunks.
/// </summary>
public class ChatService(IChatClient chatClient)
{
    private const string DefaultSystemPrompt = "You are a helpful assistant.";

    /// <summary>
    /// Streams the reply to a single user message using the default system prompt and no history.
    /// </summary>
    /// <param name="userMessage">The user's message.</param>
    /// <param name="ct">Token that stops the stream when cancelled.</param>
    /// <returns>The non-empty text chunks, in the order the chat client produces them.</returns>
    public async IAsyncEnumerable<string> StreamAsync(
        string userMessage,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        await foreach (var text in StreamWithHistoryAsync(userMessage, [], systemPrompt: null, ct: ct))
        {
            yield return text;
        }
    }

    /// <summary>
    /// Streams the reply to a user message, sending earlier turns and an optional
    /// system prompt along with it. This is the overload the POST endpoint calls.
    /// </summary>
    /// <param name="userMessage">The new user message; sent last.</param>
    /// <param name="history">Earlier turns, placed between the system prompt and the new message.</param>
    /// <param name="systemPrompt">System prompt; the default prompt is used when null or blank.</param>
    /// <param name="ct">Token that stops the stream when cancelled.</param>
    /// <returns>The non-empty text chunks, in the order the chat client produces them.</returns>
    public async IAsyncEnumerable<string> StreamWithHistoryAsync(
        string userMessage,
        IEnumerable<ChatMessage> history,
        string? systemPrompt = null,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        List<ChatMessage> messages =
        [
            new(ChatRole.System, string.IsNullOrWhiteSpace(systemPrompt) ? DefaultSystemPrompt : systemPrompt),
            .. history,
            new(ChatRole.User, userMessage),
        ];

        await foreach (var update in chatClient.CompleteStreamingAsync(messages, cancellationToken: ct))
        {
            // Updates without text are skipped so callers only ever see real content.
            if (update.Text is { Length: > 0 } text)
            {
                yield return text;
            }
        }
    }
}
Step 2: Server-Sent Events (SSE) Endpoint
SSE is the standard for streaming text from server to browser over HTTP. It works over HTTP/1.1, supports reconnection, and needs no WebSocket upgrade.
// Minimal API SSE endpoint
app.MapGet("/api/chat/stream", async (
    [FromQuery] string message,
    ChatService chat,
    HttpContext ctx,
    CancellationToken ct) =>
{
    var response = ctx.Response;

    // SSE response headers; they have to be in place before the first body write.
    var headers = response.Headers;
    headers.ContentType = "text/event-stream";
    headers.CacheControl = "no-cache";
    headers.Connection = "keep-alive";
    headers["X-Accel-Buffering"] = "no"; // tells Nginx not to buffer the stream

    await foreach (var chunk in chat.StreamAsync(message, ct))
    {
        // One SSE event per chunk ("data: <content>\n\n"), flushed right away.
        var payload = JsonSerializer.Serialize(chunk);
        await response.WriteAsync($"data: {payload}\n\n", ct);
        await response.Body.FlushAsync(ct);
    }

    // Final sentinel event marking the end of the stream.
    await response.WriteAsync("data: [DONE]\n\n", ct);
    await response.Body.FlushAsync(ct);
});
// Controller version with streaming support
[ApiController]
[Route("api/chat")]
public class ChatController(ChatService chat) : ControllerBase
{
    /// <summary>
    /// Streams the reply to <paramref name="message"/> as Server-Sent Events,
    /// ending with a <c>[DONE]</c> event.
    /// </summary>
    /// <param name="message">User message taken from the query string.</param>
    /// <param name="ct">Token passed to the chat stream and to every write and flush.</param>
    [HttpGet("stream")]
    public async Task StreamChat([FromQuery] string message, CancellationToken ct)
    {
        var headers = Response.Headers;
        headers.ContentType = "text/event-stream";
        headers.CacheControl = "no-cache";

        await foreach (var chunk in chat.StreamAsync(message, ct))
        {
            // Each chunk becomes its own event and is flushed immediately.
            var payload = JsonSerializer.Serialize(chunk);
            await Response.WriteAsync($"data: {payload}\n\n", ct);
            await Response.Body.FlushAsync(ct);
        }

        await Response.WriteAsync("data: [DONE]\n\n", ct);
        await Response.Body.FlushAsync(ct);
    }
}
Step 3: POST with Streaming (Request Body + SSE)
GET requests have URL length limits. For longer messages use POST with SSE response.
/// <summary>Request body accepted by the POST streaming endpoint.</summary>
/// <param name="Message">The new user message.</param>
/// <param name="SystemPrompt">Optional system prompt; may be null.</param>
/// <param name="History">Optional earlier conversation turns; may be null.</param>
public record ChatRequest(string Message, string? SystemPrompt, List<MessageDto>? History);
/// <summary>One earlier conversation turn as supplied by the client.</summary>
/// <param name="Role">Role label; the POST endpoint maps "user" to the user role and any other value to assistant.</param>
/// <param name="Content">Text of the turn.</param>
public record MessageDto(string Role, string Content);
app.MapPost("/api/chat/stream", async (
    ChatRequest request,
    ChatService chat,
    HttpContext ctx,
    CancellationToken ct) =>
{
    var response = ctx.Response;
    response.Headers.ContentType = "text/event-stream";
    response.Headers.CacheControl = "no-cache";

    // Convert the client's DTOs into chat messages; a missing history means an empty list.
    List<ChatMessage> history = [];
    if (request.History is not null)
    {
        foreach (var entry in request.History)
        {
            var role = entry.Role == "user" ? ChatRole.User : ChatRole.Assistant;
            history.Add(new ChatMessage(role, entry.Content));
        }
    }

    await foreach (var chunk in chat.StreamWithHistoryAsync(request.Message, history, request.SystemPrompt, ct))
    {
        // Each event carries the chunk plus done = false.
        var data = JsonSerializer.Serialize(new { token = chunk, done = false });
        await response.WriteAsync($"data: {data}\n\n", ct);
        await response.Body.FlushAsync(ct);
    }

    // Closing event: empty token with done = true.
    var closing = JsonSerializer.Serialize(new { token = "", done = true });
    await response.WriteAsync($"data: {closing}\n\n", ct);
    await response.Body.FlushAsync(ct);
});
Step 4: TypeScript / React Client
// Hook for consuming the SSE stream in React
/**
 * React hook that posts a user message to `/api/chat/stream` and appends the
 * streamed tokens to the trailing assistant message as they arrive.
 *
 * Returns the message list, a `sendMessage` function and an `isStreaming` flag.
 * `sendMessage` rejects when the request fails, the server answers with a
 * non-2xx status, or an event cannot be parsed; `isStreaming` is reset in
 * every case.
 */
export function useChat() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isStreaming, setIsStreaming] = useState(false);

  const sendMessage = async (userMessage: string) => {
    // Show the user message and an empty assistant placeholder immediately.
    setMessages(prev => [
      ...prev,
      { role: "user", content: userMessage },
      { role: "assistant", content: "" },
    ]);
    setIsStreaming(true);

    try {
      const response = await fetch("/api/chat/stream", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ message: userMessage }),
      });
      // An error response is not an SSE stream; fail loudly instead of
      // leaving an empty assistant bubble behind.
      if (!response.ok || !response.body) {
        throw new Error(`Chat stream request failed with status ${response.status}`);
      }

      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      let buffer = "";

      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        buffer += decoder.decode(value, { stream: true });
        // Events are separated by a blank line; the last piece may be
        // incomplete, so it stays in the buffer for the next read.
        const events = buffer.split("\n\n");
        buffer = events.pop() ?? "";

        for (const event of events) {
          if (!event.startsWith("data: ")) continue;
          const data = JSON.parse(event.slice(6));
          if (data.done) return;

          // Append the token to the last (assistant) message.
          setMessages(prev => {
            const updated = [...prev];
            const last = updated[updated.length - 1];
            updated[updated.length - 1] = {
              ...last,
              content: last.content + data.token,
            };
            return updated;
          });
        }
      }
    } finally {
      // Runs on success, early return and on any thrown error, so the UI
      // can never get stuck in the streaming state.
      setIsStreaming(false);
    }
  };

  return { messages, sendMessage, isStreaming };
}
Step 5: Handling Cancellation
// Closing the browser tab cancels ct.
// The token is marked [EnumeratorCancellation] and handed to the chat client,
// so the stream stops without extra plumbing.
public async IAsyncEnumerable<string> StreamAsync(
    string message,
    [EnumeratorCancellation] CancellationToken ct = default)
{
    List<ChatMessage> messages =
    [
        new(ChatRole.System, "You are a helpful assistant."),
        new(ChatRole.User, message),
    ];

    await foreach (var update in chatClient.CompleteStreamingAsync(messages, cancellationToken: ct))
    {
        // Checked on every update so cancellation surfaces as an exception.
        ct.ThrowIfCancellationRequested();

        if (update.Text is not { Length: > 0 } text)
        {
            continue;
        }

        yield return text;
    }
}
// The SSE endpoint silently swallows OperationCanceledException, but only when
// the request token itself was cancelled (client disconnect). Cancellations with
// any other cause, such as an upstream timeout, still propagate so they get logged.
app.MapGet("/api/chat/stream", async (string message, ChatService chat, HttpContext ctx, CancellationToken ct) =>
{
    ctx.Response.Headers.ContentType = "text/event-stream";
    try
    {
        await foreach (var token in chat.StreamAsync(message, ct))
        {
            await ctx.Response.WriteAsync($"data: {JsonSerializer.Serialize(token)}\n\n", ct);
            await ctx.Response.Body.FlushAsync(ct);
        }
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Client disconnected — normal, not an error
    }
});
Step 6: Accumulate Full Response for Storage
// Stream to client AND capture the full response for logging/storage
/// <summary>
/// Streams an assistant reply to the caller while accumulating it, then stores
/// the accumulated text through <see cref="IConversationRepository"/>.
/// </summary>
public class ChatWithPersistenceService(
    IChatClient chatClient,
    IConversationRepository repo)
{
    /// <summary>
    /// Streams the reply to <paramref name="userMessage"/> and saves what was produced.
    /// </summary>
    /// <param name="conversationId">Conversation the assistant message is stored under.</param>
    /// <param name="userMessage">The user's message, sent as the only chat message.</param>
    /// <param name="ct">Token that stops the stream when cancelled; it does not cancel the save.</param>
    /// <returns>The non-empty text chunks, in the order the chat client produces them.</returns>
    /// <remarks>
    /// The save runs in a <c>finally</c> block, so text already delivered to the
    /// client is persisted even when the stream is cancelled, fails, or the
    /// consumer stops enumerating early.
    /// </remarks>
    public async IAsyncEnumerable<string> StreamAndSaveAsync(
        int conversationId,
        string userMessage,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        var accumulated = new StringBuilder();
        var completed = false;

        try
        {
            await foreach (var update in chatClient.CompleteStreamingAsync(
                [new ChatMessage(ChatRole.User, userMessage)],
                cancellationToken: ct))
            {
                if (update.Text is { Length: > 0 } text)
                {
                    accumulated.Append(text);
                    yield return text; // send to client immediately
                }
            }

            completed = true;
        }
        finally
        {
            // A clean finish is always stored (even if empty, as before); an
            // interrupted stream is stored only when some text was produced.
            if (completed || accumulated.Length > 0)
            {
                // CancellationToken.None: by this point ct may already be
                // cancelled (client gone), which must not abort the write.
                await repo.AddMessageAsync(conversationId, new ConversationMessage
                {
                    Role = "assistant",
                    Content = accumulated.ToString(),
                    SentAt = DateTime.UtcNow,
                }, CancellationToken.None);
            }
        }
    }
}
Step 7: Rate Limiting Streaming Endpoints
// Streaming endpoints hold connections open — apply stricter limits
builder.Services.AddRateLimiter(opts =>
{
    // Rejected requests answer 429 Too Many Requests rather than the
    // middleware's default of 503.
    opts.RejectionStatusCode = StatusCodes.Status429TooManyRequests;

    // Standard endpoints: 100 req/min (a single window shared by all callers of this policy)
    opts.AddFixedWindowLimiter("standard", o =>
    {
        o.Window = TimeSpan.FromMinutes(1);
        o.PermitLimit = 100;
        o.QueueProcessingOrder = QueueProcessingOrder.OldestFirst;
    });

    // Streaming endpoints: 10 concurrent streams per user.
    // AddConcurrencyLimiter would create ONE limiter for the whole app, so the
    // policy is partitioned instead: each caller gets their own limiter, keyed
    // by user name, falling back to the remote IP for anonymous callers.
    opts.AddPolicy("streaming", httpContext =>
        RateLimitPartition.GetConcurrencyLimiter(
            partitionKey: httpContext.User.Identity?.Name
                ?? httpContext.Connection.RemoteIpAddress?.ToString()
                ?? "anonymous",
            factory: _ => new ConcurrencyLimiterOptions
            {
                PermitLimit = 10,
                QueueLimit = 0, // no queueing: an 11th stream is rejected at once
                QueueProcessingOrder = QueueProcessingOrder.OldestFirst,
            }));
});
app.MapPost("/api/chat/stream", ...)
.RequireRateLimiting("streaming");
Interview Answer
"Streaming LLM responses in .NET uses IChatClient.CompleteStreamingAsync which returns IAsyncEnumerable — each item is a token chunk as it arrives from the model. The ASP.NET Core endpoint sets Content-Type to text/event-stream, CacheControl to no-cache, then awaits each token from the IAsyncEnumerable and writes it in SSE format (data: JSON\n\n) immediately flushing the response body. The client reads the stream with the Fetch API's ReadableStream, appending tokens to the UI as they arrive. Cancellation is handled automatically — when the client disconnects, the CancellationToken is cancelled, OperationCanceledException propagates out of the IAsyncEnumerable, and the catch block swallows it silently. For POST streaming (long messages): set the Content-Type in the request to application/json and the response to text/event-stream. Apply a concurrency rate limiter to streaming endpoints — they hold long-lived connections unlike regular request/response endpoints."