Build an AI Document Processor in .NET

AI document processors extract structured data from unstructured files — invoices, contracts, medical forms, legal documents. This guide builds a production pipeline: upload a PDF, extract typed data, validate, score confidence, persist, and surface results via API.

What you'll build:

Multi-format document parser (PDF, image, DOCX)
AI extraction with structured JSON output and confidence scoring
Business rule validation pipeline
Retry with correction prompts on extraction failure
Audit trail for every extraction decision
Background processing with status tracking

What We're Extracting

Example: Invoice processing

Input:  PDF invoice (scanned or digital)
Output: Structured InvoiceData record

{
  "invoiceNumber": "INV-2026-00342",
  "issueDate": "2026-05-01",
  "dueDate": "2026-06-01",
  "vendorName": "Acme Supplies Ltd",
  "vendorVatNumber": "GB123456789",
  "lineItems": [
    { "description": "Widget A", "quantity": 10, "unitPrice": 9.99, "total": 99.90 },
    { "description": "Gadget B", "quantity": 2,  "unitPrice": 49.99, "total": 99.98 }
  ],
  "subtotal": 199.88,
  "vatAmount": 39.98,
  "totalAmount": 239.86,
  "currency": "GBP",
  "confidence": 0.94
}

Project Structure

src/
  DocProcessor.Api/
    Endpoints/
      DocumentEndpoints.cs
    Program.cs

  DocProcessor.Core/
    Entities/
      ProcessingJob.cs
      ExtractionResult.cs
    Models/
      InvoiceData.cs
      LineItem.cs
    Interfaces/
      IExtractionService.cs
      IDocumentParser.cs

  DocProcessor.Application/
    Commands/
      SubmitDocumentCommand.cs
      SubmitDocumentCommandHandler.cs
    Jobs/
      ProcessDocumentJob.cs

  DocProcessor.Infrastructure/
    Parsing/
      PdfTextExtractor.cs
    Extraction/
      InvoiceExtractionService.cs
      ExtractionValidator.cs

Step 1: Domain Models

// src/DocProcessor.Core/Models/InvoiceData.cs
/// <summary>
/// Structured invoice fields, mirroring the JSON shape requested from the model.
/// </summary>
public class InvoiceData
{
    /// <summary>Invoice identifier; empty until populated.</summary>
    public string InvoiceNumber { get; set; } = string.Empty;

    /// <summary>Issue date, or <c>null</c> when not available.</summary>
    public DateOnly? IssueDate { get; set; }

    /// <summary>Payment due date, or <c>null</c> when not available.</summary>
    public DateOnly? DueDate { get; set; }

    /// <summary>Vendor name; empty until populated.</summary>
    public string VendorName { get; set; } = string.Empty;

    /// <summary>Vendor VAT number, or <c>null</c> when not available.</summary>
    public string? VendorVatNumber { get; set; }

    /// <summary>Invoice lines; starts as an empty list.</summary>
    public List<LineItem> LineItems { get; set; } = new();

    /// <summary>Subtotal amount.</summary>
    public decimal Subtotal { get; set; }

    /// <summary>VAT amount.</summary>
    public decimal VatAmount { get; set; }

    /// <summary>Total amount.</summary>
    public decimal TotalAmount { get; set; }

    /// <summary>Currency code; defaults to "GBP".</summary>
    public string Currency { get; set; } = "GBP";

    // The two members below are extraction metadata supplied by the
    // extraction service rather than values read from the document.

    /// <summary>Confidence score attached to the extraction.</summary>
    public double Confidence { get; set; }

    /// <summary>Warnings attached to the extraction; starts as an empty list.</summary>
    public List<string> Warnings { get; set; } = new();
}

/// <summary>
/// A single invoice line: description, quantity, unit price and line total.
/// </summary>
public class LineItem
{
    /// <summary>Line description; empty until populated.</summary>
    public string Description { get; set; } = string.Empty;

    /// <summary>Quantity for this line.</summary>
    public decimal Quantity { get; set; }

    /// <summary>Price per unit.</summary>
    public decimal UnitPrice { get; set; }

    /// <summary>Total stated for this line.</summary>
    public decimal Total { get; set; }
}

// src/DocProcessor.Core/Entities/ProcessingJob.cs
/// <summary>
/// Persistent record of one submitted document and the state of its processing.
/// </summary>
public class ProcessingJob
{
    /// <summary>Primary key.</summary>
    public int Id { get; set; }

    /// <summary>File name as supplied by the uploader.</summary>
    public string FileName { get; set; } = "";

    /// <summary>
    /// Location of the stored upload on disk. The submit endpoint assigns it and the
    /// background job reads the file from it, so the entity has to declare it.
    /// Nullable because rows may exist without a stored file.
    /// </summary>
    public string? FilePath { get; set; }

    /// <summary>Hash of the file content, used by the submit endpoint for duplicate detection.</summary>
    public string FileHash { get; set; } = "";

    /// <summary>Current lifecycle state; new jobs start as <c>Pending</c>.</summary>
    public ProcessingStatus Status { get; set; } = ProcessingStatus.Pending;

    /// <summary>UTC creation timestamp, captured when the object is constructed.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;

    /// <summary>UTC completion timestamp, or <c>null</c> while the job has not finished.</summary>
    public DateTime? CompletedAt { get; set; }

    /// <summary>Serialized extraction result, or <c>null</c> when none has been stored.</summary>
    public string? ResultJson { get; set; }

    /// <summary>Failure or validation error text, or <c>null</c> when there is none.</summary>
    public string? ErrorMessage { get; set; }

    /// <summary>Number of failed processing runs recorded for this job.</summary>
    public int RetryCount { get; set; }

    /// <summary>Audit record for the extraction, when one is loaded/attached.</summary>
    public ExtractionAudit? Audit { get; set; }
}

/// <summary>
/// Lifecycle states of a processing job. Members keep their declaration order,
/// so the underlying values run from 0 (Pending) to 4 (RequiresReview).
/// </summary>
public enum ProcessingStatus
{
    Pending,
    Processing,
    Succeeded,
    Failed,
    RequiresReview
}

/// <summary>
/// Audit record describing how an extraction was produced for a job:
/// model name, token counts, attempt count and the raw model reply.
/// </summary>
public class ExtractionAudit
{
    /// <summary>Primary key.</summary>
    public int Id { get; set; }

    /// <summary>Identifier of the job this audit belongs to.</summary>
    public int JobId { get; set; }

    /// <summary>Name of the model used; empty until populated.</summary>
    public string ModelUsed { get; set; } = string.Empty;

    /// <summary>Input token count recorded for the extraction.</summary>
    public int InputTokens { get; set; }

    /// <summary>Output token count recorded for the extraction.</summary>
    public int OutputTokens { get; set; }

    /// <summary>Number of attempts recorded for the extraction.</summary>
    public int AttemptCount { get; set; }

    /// <summary>Raw response text, or <c>null</c> when none was stored.</summary>
    public string? RawResponse { get; set; }

    /// <summary>UTC timestamp captured when the object is constructed.</summary>
    public DateTime CreatedAt { get; set; } = DateTime.UtcNow;
}

Step 2: Document Parsing

// src/DocProcessor.Infrastructure/Parsing/PdfTextExtractor.cs
/// <summary>
/// Reads the text layer of a PDF and offers a heuristic for detecting PDFs
/// that have no usable text layer (for example scans).
/// </summary>
/// <remarks>
/// NOTE(review): <c>PdfDocument</c> appears to be the PdfPig type — confirm against the
/// project's package references. Both methods open the supplied stream themselves;
/// whether the stream is left open/rewound afterwards depends on that library, so pass
/// a fresh or rewound stream to each call.
/// </remarks>
public class PdfTextExtractor
{
    // First-page text of this many characters or fewer (after trimming) is treated
    // as "no real text layer".
    private const int MinTextLayerChars = 50;

    /// <summary>
    /// Concatenates the text of every page, each preceded by a "--- Page N ---" marker line.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF.</param>
    /// <returns>The combined page text; empty when the document has no pages.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfStream"/> is null.</exception>
    public string Extract(Stream pdfStream)
    {
        ArgumentNullException.ThrowIfNull(pdfStream);

        using var doc = PdfDocument.Open(pdfStream);
        var sb = new System.Text.StringBuilder();

        foreach (var page in doc.GetPages())
        {
            sb.AppendLine($"--- Page {page.Number} ---");
            sb.AppendLine(page.Text);
        }

        return sb.ToString();
    }

    /// <summary>
    /// Heuristic check for scanned (image-based) PDFs: reports <c>true</c> when the first
    /// page carries 50 or fewer characters of text. Callers can use the answer to route
    /// the original file bytes to a vision model instead of text extraction.
    /// </summary>
    /// <param name="pdfStream">Stream containing the PDF.</param>
    /// <returns>
    /// <c>true</c> when the first page has no meaningful text layer, or when the document
    /// has no pages at all; otherwise <c>false</c>.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfStream"/> is null.</exception>
    public bool IsImageBased(Stream pdfStream)
    {
        ArgumentNullException.ThrowIfNull(pdfStream);

        using var doc = PdfDocument.Open(pdfStream);

        // FirstOrDefault instead of First: a page-less document must not throw
        // InvalidOperationException. It has no text layer, so report it as image-based.
        var firstPage = doc.GetPages().FirstOrDefault();
        if (firstPage is null)
        {
            return true;
        }

        return firstPage.Text.Trim().Length <= MinTextLayerChars;
    }
}

Step 3: AI Extraction Service

// src/DocProcessor.Infrastructure/Extraction/InvoiceExtractionService.cs
/// <summary>
/// Extracts <see cref="InvoiceData"/> from invoice text by prompting a chat model for JSON,
/// retrying with a correction prompt when the reply cannot be deserialized.
/// </summary>
public class InvoiceExtractionService(
    IChatClient chatClient,
    ILogger<InvoiceExtractionService> logger)
    : IExtractionService<InvoiceData>
{
    // Total number of model calls made before giving up.
    private const int MaxAttempts = 3;

    // Placeholder used on retry when the failed reply had no text to echo back.
    private const string MissingReplyPlaceholder = "[Previous attempt failed]";

    // Shared instance: JsonSerializerOptions caches type metadata, so allocating one
    // per call throws that cache away every time.
    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNameCaseInsensitive = true
    };

    private const string ExtractionPrompt = """
        You are a precise document data extraction system.
        Extract structured data from the invoice text below.

        Return ONLY valid JSON matching this exact schema:
        {
          "invoiceNumber": "string",
          "issueDate": "YYYY-MM-DD or null",
          "dueDate": "YYYY-MM-DD or null",
          "vendorName": "string",
          "vendorVatNumber": "string or null",
          "lineItems": [
            { "description": "string", "quantity": number, "unitPrice": number, "total": number }
          ],
          "subtotal": number,
          "vatAmount": number,
          "totalAmount": number,
          "currency": "3-letter ISO code",
          "confidence": 0.0-1.0
        }

        Set confidence:
          0.95-1.0 = all fields clearly stated, numbers add up
          0.80-0.94 = most fields found, minor ambiguity
          0.60-0.79 = some fields missing or unclear
          below 0.60 = significant data missing or inconsistent

        Do not invent data. If a field is not present, use null.
        """;

    /// <summary>
    /// Asks the model to extract invoice data, making up to three attempts.
    /// </summary>
    /// <param name="documentText">Plain text of the invoice.</param>
    /// <param name="ct">Cancellation token passed to each model call.</param>
    /// <returns>
    /// The deserialized invoice, the raw JSON of the successful reply, the attempt number
    /// that succeeded, and token usage summed over ALL attempts (failed ones included, so
    /// the figures reflect what the extraction actually consumed).
    /// </returns>
    /// <exception cref="ExtractionException">No attempt produced deserializable JSON.</exception>
    public async Task<ExtractionAttempt<InvoiceData>> ExtractAsync(
        string documentText,
        CancellationToken ct = default)
    {
        string? lastError = null;
        string? lastResponse = null;
        var inputTokens = 0;
        var outputTokens = 0;

        // Up to MaxAttempts tries with progressive correction
        for (var attempt = 1; attempt <= MaxAttempts; attempt++)
        {
            var messages = BuildMessages(documentText, lastResponse, lastError);

            var response = await chatClient.CompleteAsync(
                messages,
                new ChatOptions { ResponseFormat = ChatResponseFormat.Json },
                ct);

            // Failed attempts consume tokens too; count them.
            inputTokens += response.Usage?.InputTokenCount ?? 0;
            outputTokens += response.Usage?.OutputTokenCount ?? 0;

            var rawJson = response.Message.Text;

            try
            {
                // A reply without text is handled like malformed JSON so it is retried
                // instead of escaping the loop as an ArgumentNullException.
                if (string.IsNullOrWhiteSpace(rawJson))
                    throw new JsonException("Response contained no text");

                var data = JsonSerializer.Deserialize<InvoiceData>(rawJson, JsonOptions)
                    ?? throw new JsonException("Deserialized to null");

                // The prompt allows null for absent fields; an explicit JSON null would
                // overwrite the list initializers, so restore empty lists for consumers.
                data.LineItems ??= new List<LineItem>();
                data.Warnings ??= new List<string>();

                logger.LogInformation(
                    "Extraction succeeded on attempt {Attempt}, confidence {Confidence}",
                    attempt, data.Confidence);

                return new ExtractionAttempt<InvoiceData>(
                    Data:          data,
                    RawResponse:   rawJson,
                    AttemptCount:  attempt,
                    InputTokens:   inputTokens,
                    OutputTokens:  outputTokens);
            }
            catch (JsonException ex)
            {
                logger.LogWarning("Attempt {Attempt} failed: {Error}", attempt, ex.Message);
                lastError = $"Previous response was not valid JSON: {ex.Message}";
                lastResponse = rawJson;
            }
        }

        throw new ExtractionException($"Failed to extract valid JSON after {MaxAttempts} attempts");
    }

    /// <summary>
    /// Builds the conversation for one attempt: system prompt plus the invoice text, and on
    /// a retry the model's previous reply followed by a correction request.
    /// </summary>
    /// <param name="documentText">Plain text of the invoice.</param>
    /// <param name="previousResponse">The reply that failed to parse, if any.</param>
    /// <param name="previousError">Why the previous reply was rejected; null on the first attempt.</param>
    private static List<ChatMessage> BuildMessages(
        string documentText,
        string? previousResponse,
        string? previousError)
    {
        var messages = new List<ChatMessage>
        {
            new(ChatRole.System, ExtractionPrompt),
            new(ChatRole.User,   $"Invoice text:\n\n{documentText}")
        };

        if (previousError is null)
        {
            return messages;
        }

        // On retry: show the model what it actually produced so the correction request
        // has something concrete to refer to; fall back to a placeholder if it was empty.
        var echoedReply = string.IsNullOrWhiteSpace(previousResponse)
            ? MissingReplyPlaceholder
            : previousResponse;

        messages.Add(new(ChatRole.Assistant, echoedReply));
        messages.Add(new(ChatRole.User,
            $"Your previous response was invalid. Error: {previousError}\n" +
            "Please try again. Return ONLY valid JSON, no markdown code fences."));

        return messages;
    }
}

/// <summary>
/// Outcome of an extraction call: the payload plus bookkeeping about how it was obtained.
/// </summary>
/// <typeparam name="T">Type of the extracted payload.</typeparam>
/// <param name="Data">The extracted payload.</param>
/// <param name="RawResponse">Raw response text the payload was produced from.</param>
/// <param name="AttemptCount">Attempt count reported by the producer.</param>
/// <param name="InputTokens">Input token count reported by the producer.</param>
/// <param name="OutputTokens">Output token count reported by the producer.</param>
public record ExtractionAttempt<T>(T Data, string RawResponse, int AttemptCount, int InputTokens, int OutputTokens);

Step 4: Validation Pipeline

// src/DocProcessor.Infrastructure/Extraction/ExtractionValidator.cs
/// <summary>
/// Applies business rules to an extracted invoice and sorts the findings into
/// blocking errors and non-blocking warnings.
/// </summary>
public class InvoiceValidator
{
    // Largest monetary difference still treated as a rounding artefact.
    private const decimal MoneyTolerance = 0.02m;

    // Confidence below this is an error; below the second value it is only a warning.
    private const double MinAutoProcessConfidence = 0.6;
    private const double MinUnflaggedConfidence = 0.8;

    /// <summary>
    /// Validates required fields, date ordering, arithmetic consistency and confidence.
    /// </summary>
    /// <param name="invoice">The extracted invoice to check.</param>
    /// <returns>
    /// Errors, warnings, and <c>IsValid</c>, which is <c>true</c> exactly when there are no errors.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="invoice"/> is null.</exception>
    public ValidationResult Validate(InvoiceData invoice)
    {
        ArgumentNullException.ThrowIfNull(invoice);

        var errors   = new List<string>();
        var warnings = new List<string>();

        // Required fields
        if (string.IsNullOrWhiteSpace(invoice.InvoiceNumber))
            errors.Add("Invoice number is missing");

        if (string.IsNullOrWhiteSpace(invoice.VendorName))
            errors.Add("Vendor name is missing");

        // Date logic
        if (invoice.IssueDate.HasValue && invoice.DueDate.HasValue)
        {
            if (invoice.DueDate < invoice.IssueDate)
                errors.Add($"Due date ({invoice.DueDate}) is before issue date ({invoice.IssueDate})");
        }

        if (invoice.IssueDate.HasValue && invoice.IssueDate > DateOnly.FromDateTime(DateTime.UtcNow))
            warnings.Add("Issue date is in the future");

        // Numeric consistency checks.
        // The property pattern also covers a null list: deserializing "lineItems": null
        // replaces the initializer's empty list with null, and .Any() would then throw.
        if (invoice.LineItems is { Count: > 0 } lineItems)
        {
            var lineItemTotal = lineItems.Sum(l => l.Total);
            var lineItemDiff  = Math.Abs(lineItemTotal - invoice.Subtotal);

            if (lineItemDiff > MoneyTolerance)
                errors.Add(
                    $"Line item sum ({lineItemTotal:F2}) does not match subtotal ({invoice.Subtotal:F2})");

            // Check individual line item totals (a mismatch is only a warning)
            foreach (var line in lineItems)
            {
                var expected = Math.Round(line.Quantity * line.UnitPrice, 2);
                if (Math.Abs(expected - line.Total) > MoneyTolerance)
                    warnings.Add(
                        $"Line item '{line.Description}': " +
                        $"{line.Quantity} x {line.UnitPrice:F2} = {expected:F2}, " +
                        $"but total is {line.Total:F2}");
            }
        }

        var expectedTotal = invoice.Subtotal + invoice.VatAmount;
        if (Math.Abs(expectedTotal - invoice.TotalAmount) > MoneyTolerance)
            errors.Add(
                $"Subtotal ({invoice.Subtotal:F2}) + VAT ({invoice.VatAmount:F2}) = {expectedTotal:F2} " +
                $"but total is {invoice.TotalAmount:F2}");

        // Confidence thresholds
        if (invoice.Confidence < MinAutoProcessConfidence)
            errors.Add($"Confidence too low for auto-processing: {invoice.Confidence:P0}");
        else if (invoice.Confidence < MinUnflaggedConfidence)
            warnings.Add($"Low confidence ({invoice.Confidence:P0}) — manual review recommended");

        return new ValidationResult(errors, warnings, errors.Count == 0);
    }
}

/// <summary>
/// Outcome of a validation pass.
/// </summary>
/// <param name="Errors">Error messages collected during validation.</param>
/// <param name="Warnings">Warning messages collected during validation.</param>
/// <param name="IsValid">Validity flag supplied by the producer of this result.</param>
public record ValidationResult(List<string> Errors, List<string> Warnings, bool IsValid);

Step 5: Background Processing Job

// src/DocProcessor.Application/Jobs/ProcessDocumentJob.cs
/// <summary>
/// Background job that runs one stored document through the pipeline:
/// parse → extract → validate → persist result and audit record.
/// </summary>
public class ProcessDocumentJob(
    DocProcessorDbContext db,
    PdfTextExtractor pdfExtractor,
    InvoiceExtractionService extractor,
    InvoiceValidator validator,
    ILogger<ProcessDocumentJob> logger)
{
    // Model name written to the audit record.
    // NOTE(review): hard-coded; confirm it matches the configured IChatClient.
    private const string ModelName = "gpt-4o";

    /// <summary>
    /// Processes the job with the given id and stores the outcome on the job row.
    /// </summary>
    /// <param name="jobId">Primary key of the <see cref="ProcessingJob"/> to process.</param>
    /// <param name="ct">Cancellation token for the processing work.</param>
    /// <exception cref="InvalidOperationException">The job does not exist or has no stored file.</exception>
    /// <remarks>
    /// Any failure during processing is recorded on the job (status Failed, error message,
    /// incremented retry count) and then rethrown so the job runner can see it.
    /// </remarks>
    public async Task ProcessAsync(int jobId, CancellationToken ct)
    {
        var job = await db.ProcessingJobs
            .Include(j => j.Audit)
            .FirstOrDefaultAsync(j => j.Id == jobId, ct)
            ?? throw new InvalidOperationException($"Job {jobId} not found");

        job.Status = ProcessingStatus.Processing;
        await db.SaveChangesAsync(ct);

        try
        {
            // 1. Parse the document
            var filePath = job.FilePath
                ?? throw new InvalidOperationException($"Job {jobId} has no stored file path");
            var fileBytes = await File.ReadAllBytesAsync(filePath, ct);
            using var stream = new MemoryStream(fileBytes);
            var docText = pdfExtractor.Extract(stream);

            // 2. Extract structured data
            var attempt = await extractor.ExtractAsync(docText, ct);

            // 3. Validate
            var validation = validator.Validate(attempt.Data);

            // Attach warnings from validation to the result
            attempt.Data.Warnings.AddRange(validation.Warnings);

            // 4. Determine final status
            job.Status       = ResolveStatus(validation);
            job.ResultJson   = JsonSerializer.Serialize(attempt.Data);
            job.CompletedAt  = DateTime.UtcNow;
            job.ErrorMessage = validation.IsValid
                ? null
                : string.Join("; ", validation.Errors);

            // 5. Save audit trail
            db.ExtractionAudits.Add(new ExtractionAudit
            {
                JobId        = job.Id,
                ModelUsed    = ModelName,
                InputTokens  = attempt.InputTokens,
                OutputTokens = attempt.OutputTokens,
                AttemptCount = attempt.AttemptCount,
                RawResponse  = attempt.RawResponse,
            });

            await db.SaveChangesAsync(ct);

            logger.LogInformation(
                "Job {JobId} completed: {Status}, confidence {Confidence:P0}, " +
                "{Attempts} attempt(s), {InputTokens} input tokens",
                jobId, job.Status, attempt.Data.Confidence, attempt.AttemptCount, attempt.InputTokens);
        }
        catch (Exception ex)
        {
            job.Status       = ProcessingStatus.Failed;
            job.ErrorMessage = ex.Message;
            job.RetryCount++;

            try
            {
                // Deliberately not 'ct': if cancellation caused the failure, saving with the
                // cancelled token would throw again and leave the row stuck in Processing.
                await db.SaveChangesAsync(CancellationToken.None);
            }
            catch (Exception saveEx)
            {
                // Log and continue so the original exception is the one that propagates.
                logger.LogError(saveEx, "Job {JobId}: could not persist failure state", jobId);
            }

            logger.LogError(ex, "Job {JobId} failed", jobId);
            throw;
        }
    }

    /// <summary>
    /// Maps a validation outcome to a job status: valid → Succeeded; otherwise
    /// RequiresReview when any error message mentions "Confidence", else Failed.
    /// </summary>
    private static ProcessingStatus ResolveStatus(ValidationResult validation)
    {
        if (validation.IsValid)
        {
            return ProcessingStatus.Succeeded;
        }

        // NOTE(review): this keys off the wording of the validator's error text, and a
        // confidence error wins even when other errors are present — confirm that is intended.
        return validation.Errors.Any(e => e.Contains("Confidence", StringComparison.Ordinal))
            ? ProcessingStatus.RequiresReview
            : ProcessingStatus.Failed;
    }
}

Step 6: API Endpoints

// src/DocProcessor.Api/Endpoints/DocumentEndpoints.cs
/// <summary>
/// Minimal-API endpoints for submitting documents and polling for their status and result.
/// </summary>
public static class DocumentEndpoints
{
    private const long MaxUploadBytes = 10 * 1024 * 1024;
    private const string UploadDirectory = "uploads";

    // NOTE(review): .docx is accepted here, but the processing job shown in this guide
    // only parses PDFs — confirm a DOCX parser exists before relying on this.
    private static readonly string[] AllowedExtensions = [".pdf", ".docx"];

    /// <summary>
    /// Registers the <c>/api/documents</c> route group (authorization required):
    /// POST <c>/</c>, GET <c>/{id}/status</c> and GET <c>/{id}/result</c>.
    /// </summary>
    public static void MapDocumentEndpoints(this IEndpointRouteBuilder app)
    {
        var group = app.MapGroup("/api/documents").RequireAuthorization();

        group.MapPost("/", SubmitDocument)
             .DisableAntiforgery();

        group.MapGet("/{id}/status", GetStatus);
        group.MapGet("/{id}/result", GetResult);
    }

    /// <summary>
    /// Validates an upload, de-duplicates it by content hash, stores it and enqueues processing.
    /// </summary>
    /// <returns>
    /// 400 for empty, oversized or unsupported files; 200 with <c>cached = true</c> when an
    /// identical file has already been processed successfully; otherwise 202 with the new job id.
    /// </returns>
    private static async Task<IResult> SubmitDocument(
        IFormFile file,
        DocProcessorDbContext db,
        IBackgroundJobClient jobs,
        CancellationToken ct)
    {
        if (file.Length == 0)
            return Results.BadRequest("File is empty");

        if (file.Length > MaxUploadBytes)
            return Results.BadRequest("File size exceeds 10 MB limit");

        var extension = Path.GetExtension(file.FileName);
        if (!AllowedExtensions.Contains(extension, StringComparer.OrdinalIgnoreCase))
            return Results.BadRequest("Only PDF and DOCX files are accepted");

        // Hash the upload stream BEFORE touching the disk: duplicates are then answered
        // without leaving an orphaned copy in the upload directory, and the file does not
        // have to be read back just to be hashed.
        string hash;
        await using (var upload = file.OpenReadStream())
        {
            hash = Convert.ToHexString(
                await System.Security.Cryptography.SHA256.HashDataAsync(upload, ct));
        }

        // Check for duplicate
        var existing = await db.ProcessingJobs
            .Where(j => j.FileHash == hash && j.Status == ProcessingStatus.Succeeded)
            .FirstOrDefaultAsync(ct);

        if (existing is not null)
            return Results.Ok(new { jobId = existing.Id, cached = true });

        // Save the file under a server-generated name; only the validated extension is
        // taken from the client-supplied file name.
        Directory.CreateDirectory(UploadDirectory);
        var uploadPath = Path.Combine(UploadDirectory, Guid.NewGuid() + extension);
        await using (var fs = File.Create(uploadPath))
        {
            await file.CopyToAsync(fs, ct);
        }

        // Create job record
        var job = new ProcessingJob
        {
            FileName = file.FileName,
            FilePath = uploadPath,
            FileHash = hash,
            Status   = ProcessingStatus.Pending,
        };
        db.ProcessingJobs.Add(job);
        await db.SaveChangesAsync(ct);

        // Enqueue background job (Hangfire / or a Channel-based worker).
        // The request's token is intentionally not captured: the job outlives the request.
        jobs.Enqueue<ProcessDocumentJob>(j => j.ProcessAsync(job.Id, CancellationToken.None));

        return Results.Accepted($"/api/documents/{job.Id}/status",
            new { jobId = job.Id, status = "Pending" });
    }

    /// <summary>Returns the job's status fields, or 404 when the job does not exist.</summary>
    private static async Task<IResult> GetStatus(
        int id,
        DocProcessorDbContext db,
        CancellationToken ct)
    {
        var job = await db.ProcessingJobs.FindAsync([id], ct);
        if (job is null) return Results.NotFound();

        return Results.Ok(new
        {
            job.Id,
            job.Status,
            job.CreatedAt,
            job.CompletedAt,
            job.ErrorMessage,
            job.RetryCount,
        });
    }

    /// <summary>
    /// Returns the extracted invoice for a job in status Succeeded or RequiresReview;
    /// 404 when the job does not exist, 400 for any other status.
    /// </summary>
    private static async Task<IResult> GetResult(
        int id,
        DocProcessorDbContext db,
        CancellationToken ct)
    {
        var job = await db.ProcessingJobs.FindAsync([id], ct);

        if (job is null)
            return Results.NotFound();

        if (job.Status != ProcessingStatus.Succeeded && job.Status != ProcessingStatus.RequiresReview)
            return Results.BadRequest(new { error = "Processing not complete", job.Status });

        // A finished job without a stored result is a data problem; report it explicitly
        // instead of letting Deserialize throw on a null argument.
        if (job.ResultJson is null)
            return Results.Problem("Job has no stored result");

        // The processing job serializes the result with default JsonSerializer options,
        // so default options are the matching choice for reading it back.
        var result = JsonSerializer.Deserialize<InvoiceData>(job.ResultJson);
        return Results.Ok(new { result, requiresReview = job.Status == ProcessingStatus.RequiresReview });
    }
}

Using the API

# Submit a document
POST /api/documents
Content-Type: multipart/form-data
file: [invoice.pdf]

Response: 202 Accepted
{ "jobId": 42, "status": "Pending" }

# Poll for status
GET /api/documents/42/status
{ "id": 42, "status": "Succeeded", "completedAt": "2026-05-25T10:30:00Z" }

# Get the extracted result
GET /api/documents/42/result
{
  "result": {
    "invoiceNumber": "INV-2026-00342",
    "vendorName": "Acme Supplies Ltd",
    "totalAmount": 239.86,
    "currency": "GBP",
    "confidence": 0.94,
    "warnings": []
  },
  "requiresReview": false
}

Production Considerations

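Confidence thresholds (recommended production policy):
  0.95+  → auto-approve, no human review
  0.80-0.94 → auto-approve with warnings logged
  0.60-0.79 → route to RequiresReview queue (human checks)
  below 0.60 → reject, request better scan

Note: the sample code in Steps 4–5 is simpler than this table. InvoiceValidator only adds a "manual review recommended" warning for 0.60-0.79 (the job still ends as Succeeded), and a confidence below 0.60 is what ProcessDocumentJob routes to RequiresReview. Adjust the validator and the status mapping if you want the four tiers above.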

Cost per document:
  ~1,500 input tokens (document text + prompt)
  ~300 output tokens (JSON extraction)
  gpt-4o cost: ~$0.007 per invoice
  At 10,000 invoices/month: ~$70/month

Speed:
  Text-based PDF: ~3 seconds end-to-end
  Scanned PDF (vision model): ~8 seconds

Accuracy targets:
  Invoice number:  98%+ (unique identifiers are clear in documents)
  Line items:      92%+ (layout varies significantly)
  Total amount:    96%+ (usually prominently displayed)

Build an AI Document Processor in .NET

Build an AI Document Processor in .NET

What We're Extracting

Project Structure

Step 1: Domain Models

Step 2: Document Parsing

Step 3: AI Extraction Service

Step 4: Validation Pipeline

Step 5: Background Processing Job

Step 6: API Endpoints

Using the API

Production Considerations

Enjoyed this article?

Leave a comment