.NET & C# Development · Lesson 166 of 229

Streaming AI Responses in .NET — IAsyncEnumerable and Server-Sent Events

LLM responses can take 10–30 seconds for long outputs. Streaming sends tokens to the browser as they are generated — users see a response immediately instead of staring at a spinner. This guide wires streaming from the LLM all the way to the browser.

Why Streaming Matters

Without streaming:
  User sends message → waits 15 seconds → full response appears
  User experience: blank screen, no feedback, high perceived latency

With streaming:
  User sends message → first token appears in < 1 second → words flow in
  User experience: feels instant, like watching someone type

Step 1: Streaming from IChatClient

// Microsoft.Extensions.AI — CompleteStreamingAsync returns IAsyncEnumerable (newer package versions rename it to GetStreamingResponseAsync)
/// <summary>
/// Wraps an <see cref="IChatClient"/> and exposes the model's reply as a stream of
/// non-empty text chunks.
/// </summary>
public class ChatService(IChatClient chatClient)
{
    // System prompt used whenever the caller does not provide one.
    private const string DefaultSystemPrompt = "You are a helpful assistant.";

    /// <summary>
    /// Streams the reply to a single user message using the default system prompt.
    /// </summary>
    /// <param name="userMessage">The user's message.</param>
    /// <param name="ct">Token that stops the stream; it is passed on to the chat client.</param>
    /// <returns>The text chunks of the reply, in the order the client produces them.</returns>
    public async IAsyncEnumerable<string> StreamAsync(
        string userMessage,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        await foreach (var chunk in StreamWithHistoryAsync(userMessage, history: null, systemPrompt: null, ct))
        {
            yield return chunk;
        }
    }

    /// <summary>
    /// Streams the reply to a user message, sending earlier turns and an optional
    /// custom system prompt along with it.
    /// </summary>
    /// <param name="userMessage">The new user message; it is placed last in the message list.</param>
    /// <param name="history">Earlier turns, oldest first; <c>null</c> is treated as no history.</param>
    /// <param name="systemPrompt">System prompt to use; <c>null</c> or whitespace falls back to the default.</param>
    /// <param name="ct">Token that stops the stream; it is passed on to the chat client.</param>
    /// <returns>The text chunks of the reply, in the order the client produces them.</returns>
    public async IAsyncEnumerable<string> StreamWithHistoryAsync(
        string userMessage,
        IEnumerable<ChatMessage>? history = null,
        string? systemPrompt = null,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        var effectivePrompt = string.IsNullOrWhiteSpace(systemPrompt) ? DefaultSystemPrompt : systemPrompt;

        // Order: system prompt, then prior turns, then the new user message.
        var messages = new List<ChatMessage> { new(ChatRole.System, effectivePrompt) };
        if (history is not null)
        {
            messages.AddRange(history);
        }

        messages.Add(new ChatMessage(ChatRole.User, userMessage));

        await foreach (var update in chatClient.CompleteStreamingAsync(messages, cancellationToken: ct))
        {
            // Updates without text are skipped so consumers only ever see real content.
            if (update.Text is { Length: > 0 } text)
            {
                yield return text;
            }
        }
    }
}

Step 2: Server-Sent Events (SSE) Endpoint

SSE is the standard for streaming text from server to browser over HTTP. It works over HTTP/1.1, supports reconnection, and needs no WebSocket upgrade.

// Minimal API SSE endpoint
app.MapGet("/api/chat/stream", async (
    [FromQuery] string message,
    ChatService chat,
    HttpContext ctx,
    CancellationToken ct) =>
{
    var response = ctx.Response;

    // Writes one SSE frame ("data: <payload>" followed by a blank line) and flushes
    // it straight away so nothing sits in a buffer.
    async Task SendAsync(string payload)
    {
        await response.WriteAsync("data: " + payload + "\n\n", ct);
        await response.Body.FlushAsync(ct);
    }

    // SSE headers have to be in place before the first byte of the body goes out.
    var headers = response.Headers;
    headers.ContentType = "text/event-stream";
    headers.CacheControl = "no-cache";
    headers.Connection = "keep-alive";
    headers["X-Accel-Buffering"] = "no"; // turn off Nginx response buffering

    // Each chunk is JSON-encoded before being framed.
    await foreach (var chunk in chat.StreamAsync(message, ct))
    {
        await SendAsync(JsonSerializer.Serialize(chunk));
    }

    // Sentinel frame marking the end of the stream.
    await SendAsync("[DONE]");
});

// Controller version with streaming support
/// <summary>
/// MVC controller counterpart of the minimal-API streaming endpoint.
/// </summary>
[ApiController]
[Route("api/chat")]
public class ChatController(ChatService chat) : ControllerBase
{
    /// <summary>
    /// Streams the reply to <paramref name="message"/> as Server-Sent Events and
    /// finishes with a <c>[DONE]</c> frame.
    /// </summary>
    /// <param name="message">User message taken from the query string.</param>
    /// <param name="ct">Token passed to the chat service and to every write and flush.</param>
    [HttpGet("stream")]
    public async Task StreamChat([FromQuery] string message, CancellationToken ct)
    {
        var headers = Response.Headers;
        headers.ContentType = "text/event-stream";
        headers.CacheControl = "no-cache";

        await foreach (var chunk in chat.StreamAsync(message, ct))
        {
            await WriteEventAsync(JsonSerializer.Serialize(chunk), ct);
        }

        await WriteEventAsync("[DONE]", ct);
    }

    // Emits a single "data: ..." frame and flushes it. Non-public, so MVC does not
    // treat it as an action.
    private async Task WriteEventAsync(string payload, CancellationToken ct)
    {
        await Response.WriteAsync(string.Concat("data: ", payload, "\n\n"), ct);
        await Response.Body.FlushAsync(ct);
    }
}

Step 3: POST with Streaming (Request Body + SSE)

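GET requests have URL length limits, and query strings tend to end up in server and proxy logs. For longer messages, use POST with an SSE response. Note that the browser's built-in EventSource API can only issue GET requests, so the client has to read a POST stream with fetch and a ReadableStream (see Step 4).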

/// <summary>JSON request body accepted by the POST streaming endpoint.</summary>
/// <param name="Message">The new user message.</param>
/// <param name="SystemPrompt">Optional system prompt; may be <c>null</c>.</param>
/// <param name="History">Optional earlier turns of the conversation; may be <c>null</c>.</param>
public record ChatRequest(string Message, string? SystemPrompt, List<MessageDto>? History);

/// <summary>One earlier conversation turn as sent by the client.</summary>
/// <param name="Role">Author of the turn as a string, e.g. <c>"user"</c>.</param>
/// <param name="Content">Text of the turn.</param>
public record MessageDto(string Role, string Content);

app.MapPost("/api/chat/stream", async (
    ChatRequest request,
    ChatService chat,
    HttpContext ctx,
    CancellationToken ct) =>
{
    var response = ctx.Response;
    response.Headers.ContentType = "text/event-stream";
    response.Headers.CacheControl = "no-cache";

    // Convert the client's DTOs into chat messages: "user" maps to the user role,
    // every other role value is treated as an assistant turn.
    List<ChatMessage> history = [];
    if (request.History is { } turns)
    {
        foreach (var turn in turns)
        {
            var role = turn.Role == "user" ? ChatRole.User : ChatRole.Assistant;
            history.Add(new ChatMessage(role, turn.Content));
        }
    }

    // Serializes one { token, done } payload as an SSE frame and flushes it immediately.
    async Task SendAsync(string token, bool done)
    {
        var payload = JsonSerializer.Serialize(new { token, done });
        await response.WriteAsync($"data: {payload}\n\n", ct);
        await response.Body.FlushAsync(ct);
    }

    await foreach (var chunk in chat.StreamWithHistoryAsync(request.Message, history, request.SystemPrompt, ct))
    {
        await SendAsync(chunk, done: false);
    }

    // Final frame: empty token with done = true tells the client the stream is complete.
    await SendAsync("", done: true);
});

Step 4: TypeScript / React Client

TYPESCRIPT

// Hook for consuming the SSE stream in React
/**
 * React hook that sends a message to the POST streaming endpoint and appends the
 * streamed tokens to the last (assistant) message as they arrive.
 *
 * `sendMessage` rejects when the request fails or the server answers with a
 * non-2xx status; `isStreaming` is reset in every case.
 */
export function useChat() {
  const [messages, setMessages] = useState<Message[]>([]);
  const [isStreaming, setIsStreaming] = useState(false);

  const sendMessage = async (userMessage: string) => {
    // Show the user message right away, plus an empty assistant message to fill in.
    setMessages(prev => [
      ...prev,
      { role: "user", content: userMessage },
      { role: "assistant", content: "" },
    ]);
    setIsStreaming(true);

    try {
      const response = await fetch("/api/chat/stream", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ message: userMessage }),
      });

      // A rejected request (e.g. 429 from the rate limiter) has no SSE body to read.
      if (!response.ok || !response.body) {
        throw new Error(`Chat stream request failed with status ${response.status}`);
      }

      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      let buffer = "";

      try {
        while (true) {
          const { done, value } = await reader.read();
          if (done) break;

          // A network chunk can end mid-event: keep the trailing partial event
          // in the buffer until the rest of it arrives.
          buffer += decoder.decode(value, { stream: true });
          const events = buffer.split("\n\n");
          buffer = events.pop() ?? "";

          for (const event of events) {
            if (!event.startsWith("data: ")) continue;
            const data = JSON.parse(event.slice(6));
            if (data.done) return;

            // Append token to last assistant message
            setMessages(prev => {
              const updated = [...prev];
              const last = updated[updated.length - 1];
              updated[updated.length - 1] = {
                ...last,
                content: last.content + data.token,
              };
              return updated;
            });
          }
        }
      } finally {
        // Release the connection on every exit path (done frame, error, end of stream).
        await reader.cancel().catch(() => undefined);
      }
    } finally {
      // Always clear the flag, otherwise a failed request leaves the UI stuck in "streaming".
      setIsStreaming(false);
    }
  };

  return { messages, sendMessage, isStreaming };
}

Step 5: Handling Cancellation

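// When the user closes the browser tab, ASP.NET Core cancels the request's token (HttpContext.RequestAborted), which is what ct is bound to
// Cancellation only reaches the model because ct is passed down explicitly: endpoint → StreamAsync → CompleteStreamingAsync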

/// <summary>
/// Streams the model's reply to <paramref name="message"/> as non-empty text chunks,
/// checking for cancellation before each update is processed.
/// </summary>
/// <param name="message">The user's message.</param>
/// <param name="ct">Token passed to the chat client and checked on every update.</param>
/// <exception cref="OperationCanceledException">Thrown when <paramref name="ct"/> is cancelled.</exception>
public async IAsyncEnumerable<string> StreamAsync(
    string message,
    [EnumeratorCancellation] CancellationToken ct = default)
{
    var conversation = new List<ChatMessage>
    {
        new ChatMessage(ChatRole.System, "You are a helpful assistant."),
        new ChatMessage(ChatRole.User, message),
    };

    var updates = chatClient.CompleteStreamingAsync(conversation, cancellationToken: ct);

    await foreach (var update in updates)
    {
        // Explicit check: stop here once the token is cancelled, before handling the update.
        ct.ThrowIfCancellationRequested();

        var chunk = update.Text;
        if (string.IsNullOrEmpty(chunk))
        {
            continue;
        }

        yield return chunk;
    }
}

// The SSE endpoint catches OperationCanceledException silently
// Streams the reply as SSE and treats a client disconnect as a normal way for the request to end.
app.MapGet("/api/chat/stream", async (string message, ChatService chat, HttpContext ctx, CancellationToken ct) =>
{
    ctx.Response.Headers.ContentType  = "text/event-stream";
    ctx.Response.Headers.CacheControl = "no-cache";

    try
    {
        await foreach (var token in chat.StreamAsync(message, ct))
        {
            await ctx.Response.WriteAsync($"data: {JsonSerializer.Serialize(token)}\n\n", ct);
            await ctx.Response.Body.FlushAsync(ct);
        }

        // Signal stream end so the client can tell "finished" apart from "connection dropped"
        await ctx.Response.WriteAsync("data: [DONE]\n\n", ct);
        await ctx.Response.Body.FlushAsync(ct);
    }
    catch (OperationCanceledException) when (ct.IsCancellationRequested)
    {
        // Client disconnected — normal, not an error.
        // The filter matters: a cancellation that did NOT come from the request token
        // (for example a timeout further down the call chain) still surfaces as an error.
    }
});

Step 6: Accumulate Full Response for Storage

// Stream to client AND capture the full response for logging/storage
/// <summary>
/// Streams the model's reply to the caller while accumulating it, then stores the
/// complete text through <see cref="IConversationRepository"/>.
/// </summary>
public class ChatWithPersistenceService(
    IChatClient chatClient,
    IConversationRepository repo)
{
    /// <summary>
    /// Streams the reply to <paramref name="userMessage"/> and saves the full assistant
    /// message once the stream has finished.
    /// </summary>
    /// <param name="conversationId">Conversation the assistant message is stored under.</param>
    /// <param name="userMessage">The user's message sent to the model.</param>
    /// <param name="ct">Token that stops the streaming; it is not used for the final save.</param>
    /// <returns>The non-empty text chunks of the reply, in arrival order.</returns>
    /// <remarks>
    /// Nothing is saved when the stream is cancelled, fails, or the consumer stops
    /// enumerating early, because the save only runs after the loop completes.
    /// </remarks>
    public async IAsyncEnumerable<string> StreamAndSaveAsync(
        int conversationId,
        string userMessage,
        [EnumeratorCancellation] CancellationToken ct = default)
    {
        var accumulated = new StringBuilder();

        await foreach (var token in chatClient.CompleteStreamingAsync(
            [new ChatMessage(ChatRole.User, userMessage)],
            cancellationToken: ct))
        {
            if (token.Text is { Length: > 0 } text)
            {
                accumulated.Append(text);
                yield return text;   // send to client immediately
            }
        }

        // Save the complete response after streaming finishes.
        // CancellationToken.None on purpose: at this point the client has already received
        // the whole reply, so a disconnect must not cancel the write and lose the message.
        await repo.AddMessageAsync(conversationId, new ConversationMessage
        {
            Role    = "assistant",
            Content = accumulated.ToString(),
            SentAt  = DateTime.UtcNow,
        }, CancellationToken.None);
    }
}

Step 7: Rate Limiting Streaming Endpoints

// Streaming endpoints hold connections open — apply stricter limits
builder.Services.AddRateLimiter(opts =>
{
    // Answer rejected requests with 429 Too Many Requests (the middleware default is 503).
    opts.RejectionStatusCode = StatusCodes.Status429TooManyRequests;

    // Standard endpoints: 100 req/min (a single window shared by everything using this policy)
    opts.AddFixedWindowLimiter("standard", o =>
    {
        o.Window               = TimeSpan.FromMinutes(1);
        o.PermitLimit          = 100;
        o.QueueProcessingOrder = QueueProcessingOrder.OldestFirst;
    });

    // Streaming endpoints: 10 concurrent streams per user.
    // A partitioned policy is required for "per user": AddConcurrencyLimiter would create
    // ONE limiter shared by all callers, i.e. 10 streams in total for the whole app.
    opts.AddPolicy("streaming", httpContext =>
        RateLimitPartition.GetConcurrencyLimiter(
            // Authenticated user name if there is one, otherwise the client IP.
            partitionKey: httpContext.User.Identity?.Name
                          ?? httpContext.Connection.RemoteIpAddress?.ToString()
                          ?? "anonymous",
            factory: _ => new ConcurrencyLimiterOptions
            {
                PermitLimit          = 10,
                QueueLimit           = 0,   // reject immediately instead of queueing behind long-lived streams
                QueueProcessingOrder = QueueProcessingOrder.OldestFirst,
            }));
});

// The middleware has to be in the pipeline or the policies are never enforced.
// Register it after authentication so the user name is available to the partition key.
app.UseRateLimiter();

app.MapPost("/api/chat/stream", ...)
   .RequireRateLimiting("streaming");

Interview Answer

"Streaming LLM responses in .NET uses IChatClient.CompleteStreamingAsync which returns IAsyncEnumerable — each item is a token chunk as it arrives from the model. The ASP.NET Core endpoint sets Content-Type to text/event-stream, CacheControl to no-cache, then awaits each token from the IAsyncEnumerable and writes it in SSE format (data: JSON\n\n) immediately flushing the response body. The client reads the stream with the Fetch API's ReadableStream, appending tokens to the UI as they arrive. Cancellation is handled automatically — when the client disconnects, the CancellationToken is cancelled, OperationCanceledException propagates out of the IAsyncEnumerable, and the catch block swallows it silently. For POST streaming (long messages): set the Content-Type in the request to application/json and the response to text/event-stream. Apply a concurrency rate limiter to streaming endpoints — they hold long-lived connections unlike regular request/response endpoints."

Microsoft.Extensions.AI — The .NET AI Abstraction Layer

Next Lesson

pgvector with EF Core — Vector Search Without a Separate Database