Evaluating LLM Outputs in .NET — Testing AI Quality

LLM outputs are probabilistic — the same prompt can produce different answers, and "good" is subjective. Evals replace manual testing with automated, measurable quality checks you can run in CI and monitor in production.

What to Evaluate

LLM output quality has four dimensions:

1. Correctness      — Is the answer factually right?
2. Groundedness     — Is the answer supported by the provided context?
3. Relevance        — Does the answer address the question?
4. Coherence        — Is the answer well-structured and readable?

For RAG systems (Retrieval-Augmented Generation):
  - Context recall    — Did retrieval fetch the right documents?
  - Context precision — Were the fetched documents actually useful?
  - Answer faithfulness — Does the answer stay within the retrieved context?

Step 1: Semantic Similarity Scoring

// Compare generated answer to a gold-standard expected answer
/// <summary>
/// Scores how close a generated answer is to an expected answer by embedding both
/// texts and taking the cosine similarity of the two embedding vectors.
/// </summary>
public class SemanticSimilarityEvaluator(
    IEmbeddingGenerator<string, Embedding<float>> embedder)
{
    /// <summary>
    /// Embeds <paramref name="generated"/> and <paramref name="expected"/> in a single
    /// batch call and returns the cosine similarity of the two vectors.
    /// </summary>
    /// <param name="generated">The answer produced by the model under test.</param>
    /// <param name="expected">The gold-standard answer to compare against.</param>
    /// <param name="ct">Token forwarded to the embedding call.</param>
    /// <returns>
    /// The cosine similarity, mathematically in [-1.0, 1.0] (1.0 = same direction).
    /// Returns 0.0 when either embedding has zero magnitude.
    /// </returns>
    /// <exception cref="ArgumentNullException">Either text is null.</exception>
    /// <exception cref="ArgumentException">The two embeddings differ in length.</exception>
    public async Task<double> ScoreAsync(
        string generated,
        string expected,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(generated);
        ArgumentNullException.ThrowIfNull(expected);

        // One request for both texts: results[0] is `generated`, results[1] is `expected`.
        var results = await embedder.GenerateAsync([generated, expected], cancellationToken: ct);
        var genVec  = results[0].Vector;
        var expVec  = results[1].Vector;

        return CosineSimilarity(genVec.Span, expVec.Span);
    }

    /// <summary>
    /// Scores every case sequentially and marks it as passed when its score is at
    /// least <paramref name="threshold"/>.
    /// </summary>
    /// <param name="cases">Cases to score; enumerated exactly once.</param>
    /// <param name="threshold">Minimum similarity (inclusive) for a case to pass.</param>
    /// <param name="ct">Token forwarded to every embedding call.</param>
    /// <returns>One <see cref="EvalResult"/> per case, in input order.</returns>
    public async Task<List<EvalResult>> EvaluateDatasetAsync(
        IEnumerable<EvalCase> cases,
        double threshold = 0.85,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(cases);

        var results = new List<EvalResult>();

        foreach (var c in cases)
        {
            var score  = await ScoreAsync(c.Generated, c.Expected, ct);
            results.Add(new EvalResult(c.Id, score, score >= threshold, c.Generated, c.Expected));
        }

        return results;
    }

    /// <summary>
    /// Computes dot(a, b) / (|a| * |b|), accumulating in double precision.
    /// </summary>
    private static double CosineSimilarity(ReadOnlySpan<float> a, ReadOnlySpan<float> b)
    {
        if (a.Length != b.Length)
        {
            throw new ArgumentException(
                $"Embedding dimensions differ ({a.Length} vs {b.Length}); cosine similarity is undefined.");
        }

        double dot = 0, normA = 0, normB = 0;
        for (int i = 0; i < a.Length; i++)
        {
            // Widen before multiplying so the products are not rounded to float first.
            double x = a[i];
            double y = b[i];

            dot   += x * y;
            normA += x * x;
            normB += y * y;
        }

        var denominator = Math.Sqrt(normA) * Math.Sqrt(normB);

        // A zero-magnitude (or empty) vector would otherwise yield NaN, and NaN fails
        // every threshold comparison without any visible error.
        return denominator == 0 ? 0.0 : dot / denominator;
    }
}

/// <summary>A single similarity-eval sample.</summary>
/// <param name="Id">Identifier copied onto the matching <see cref="EvalResult"/>.</param>
/// <param name="Input">The prompt or question the sample belongs to.</param>
/// <param name="Generated">The answer produced by the model under test.</param>
/// <param name="Expected">The gold-standard answer that <paramref name="Generated"/> is compared to.</param>
public record EvalCase(
    string Id,
    string Input,
    string Generated,
    string Expected);

/// <summary>The outcome of scoring one <see cref="EvalCase"/>.</summary>
/// <param name="Id">Identifier of the case that was scored.</param>
/// <param name="Score">Similarity score computed for the case.</param>
/// <param name="Passed">Whether the score met the threshold used for the run.</param>
/// <param name="Generated">The generated answer that was scored.</param>
/// <param name="Expected">The expected answer it was scored against.</param>
public record EvalResult(
    string Id,
    double Score,
    bool Passed,
    string Generated,
    string Expected);

Step 2: LLM-as-Judge

// Use an LLM to evaluate another LLM's output — the RAGAS approach
/// <summary>
/// Uses a chat model as a grader: each method sends one prompt that asks for a JSON
/// verdict and parses the reply into a score.
/// </summary>
public class LlmJudge(IChatClient judge)
{
    // The prompts ask the model for lower-case / snake_case keys ("score",
    // "unsupported_claims", "is_correct"). System.Text.Json matches property names
    // case-sensitively by default, so without these options every field would be
    // left at its default value (score 0, lists null).
    private static readonly JsonSerializerOptions JudgeJsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
        PropertyNameCaseInsensitive = true,
    };

    /// <summary>
    /// Faithfulness: asks the judge whether the answer contains only facts from the context.
    /// </summary>
    /// <param name="question">The user question that was answered.</param>
    /// <param name="context">The retrieved context the answer should be grounded in.</param>
    /// <param name="answer">The generated answer to grade.</param>
    /// <param name="ct">Token forwarded to the chat call.</param>
    /// <returns>
    /// The parsed verdict; <see cref="FaithfulnessScore.UnsupportedClaims"/> is never null.
    /// </returns>
    /// <exception cref="InvalidOperationException">The judge replied with no text or with JSON null.</exception>
    /// <exception cref="JsonException">The judge's reply is not valid JSON for the expected shape.</exception>
    public async Task<FaithfulnessScore> EvaluateFaithfulnessAsync(
        string question,
        string context,
        string answer,
        CancellationToken ct = default)
    {
        var prompt = $"""
            Evaluate the faithfulness of the answer based ONLY on the provided context.

            Question: {question}

            Context:
            {context}

            Answer:
            {answer}

            Rate faithfulness from 0 to 1 where:
            0.0 = answer contains facts not in the context (hallucination)
            0.5 = answer partially supported by context
            1.0 = answer entirely grounded in the context

            Respond with JSON only:
            {{"score": 0.0, "reason": "explanation", "unsupported_claims": ["claim1"]}}
            """;

        var verdict = ParseVerdict<FaithfulnessScore>(await AskForJsonAsync(prompt, ct));

        // The model may omit the array entirely; callers join/enumerate it unconditionally.
        return verdict with { UnsupportedClaims = verdict.UnsupportedClaims ?? [] };
    }

    /// <summary>
    /// Answer relevance: asks the judge whether the answer addresses the question.
    /// </summary>
    /// <param name="question">The user question.</param>
    /// <param name="answer">The text to grade against the question.</param>
    /// <param name="ct">Token forwarded to the chat call.</param>
    /// <returns>The value of the <c>score</c> property in the judge's JSON reply.</returns>
    /// <exception cref="InvalidOperationException">The judge replied with no text.</exception>
    /// <exception cref="JsonException">The judge's reply is not valid JSON.</exception>
    /// <exception cref="KeyNotFoundException">The reply has no <c>score</c> property.</exception>
    public async Task<double> EvaluateRelevanceAsync(
        string question,
        string answer,
        CancellationToken ct = default)
    {
        var prompt = $"""
            Does the following answer address the question?

            Question: {question}
            Answer: {answer}

            Score from 0.0 to 1.0.
            Respond with JSON: {{"score": 0.0}}
            """;

        using var reply = JsonDocument.Parse(await AskForJsonAsync(prompt, ct));
        return reply.RootElement.GetProperty("score").GetDouble();
    }

    /// <summary>
    /// Correctness: asks the judge to compare the answer with the ground truth.
    /// </summary>
    /// <param name="question">The user question.</param>
    /// <param name="answer">The generated answer to grade.</param>
    /// <param name="groundTruth">The reference answer.</param>
    /// <param name="ct">Token forwarded to the chat call.</param>
    /// <returns>
    /// The parsed verdict; <see cref="CorrectnessScore.Missing"/> and
    /// <see cref="CorrectnessScore.Extra"/> are never null.
    /// </returns>
    /// <exception cref="InvalidOperationException">The judge replied with no text or with JSON null.</exception>
    /// <exception cref="JsonException">The judge's reply is not valid JSON for the expected shape.</exception>
    public async Task<CorrectnessScore> EvaluateCorrectnessAsync(
        string question,
        string answer,
        string groundTruth,
        CancellationToken ct = default)
    {
        var prompt = $"""
            Compare the answer to the ground truth and rate correctness.

            Question: {question}
            Answer: {answer}
            Ground Truth: {groundTruth}

            Respond with JSON:
            {{"score": 0.0, "is_correct": true, "missing": ["missing facts"], "extra": ["wrong facts"]}}
            """;

        var verdict = ParseVerdict<CorrectnessScore>(await AskForJsonAsync(prompt, ct));

        return verdict with
        {
            Missing = verdict.Missing ?? [],
            Extra   = verdict.Extra ?? [],
        };
    }

    // Sends a single user message in JSON response mode and returns the raw reply text.
    private async Task<string> AskForJsonAsync(string prompt, CancellationToken ct)
    {
        var response = await judge.CompleteAsync(
            [new ChatMessage(ChatRole.User, prompt)],
            new ChatOptions { ResponseFormat = ChatResponseFormat.Json },
            ct);

        var text = response.Message.Text;
        if (string.IsNullOrWhiteSpace(text))
            throw new InvalidOperationException("The judge model returned an empty response.");

        return text;
    }

    // Deserializes a judge reply with the snake_case options; a literal JSON null is an error.
    private static T ParseVerdict<T>(string json) where T : class
        => JsonSerializer.Deserialize<T>(json, JudgeJsonOptions)
           ?? throw new InvalidOperationException(
               $"The judge model returned JSON null instead of a {typeof(T).Name}.");
}

/// <summary>Judge verdict on whether an answer is grounded in its context.</summary>
/// <param name="Score">Faithfulness rating; the judge prompt asks for a value from 0 to 1.</param>
/// <param name="Reason">The judge's explanation for the rating.</param>
/// <param name="UnsupportedClaims">Claims in the answer that the judge reported as unsupported.</param>
public record FaithfulnessScore(
    double Score,
    string Reason,
    List<string> UnsupportedClaims);

/// <summary>Judge verdict on how well an answer matches the ground truth.</summary>
/// <param name="Score">Correctness rating reported by the judge.</param>
/// <param name="IsCorrect">The judge's overall correct / incorrect decision.</param>
/// <param name="Missing">Facts the judge reported as missing from the answer.</param>
/// <param name="Extra">Facts the judge reported as wrong in the answer.</param>
public record CorrectnessScore(
    double Score,
    bool IsCorrect,
    List<string> Missing,
    List<string> Extra);

Step 3: RAGAS-Style RAG Evaluation

// Full RAG pipeline evaluation — context + answer quality
/// <summary>
/// Evaluates RAG answers on faithfulness, answer relevance and context precision,
/// plus correctness and semantic similarity when a ground truth is supplied.
/// </summary>
public class RagEvaluator(
    LlmJudge judge,
    SemanticSimilarityEvaluator similarity)
{
    // A case passes when its overall score reaches this value (inclusive).
    private const double PassThreshold = 0.7;

    // A context chunk counts as useful when its relevance score is strictly above this value.
    private const double UsefulChunkThreshold = 0.5;

    /// <summary>
    /// Runs every metric for one case. The judge and embedding calls are awaited one
    /// after another, not in parallel.
    /// </summary>
    /// <param name="c">The case to evaluate.</param>
    /// <param name="ct">Token forwarded to every judge and embedding call.</param>
    /// <returns>
    /// A report whose <c>Correctness</c> and <c>SemanticSimilarity</c> are null when the
    /// case has no ground truth, and whose <c>UnsupportedClaims</c> is never null.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="c"/> is null.</exception>
    public async Task<RagEvalReport> EvaluateAsync(
        RagEvalCase c,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(c);

        var faithfulness = await judge.EvaluateFaithfulnessAsync(
            c.Question, c.RetrievedContext, c.GeneratedAnswer, ct);

        var relevance = await judge.EvaluateRelevanceAsync(
            c.Question, c.GeneratedAnswer, ct);

        // Ground-truth-based metrics are optional: production samples have no reference answer.
        var correctness = c.GroundTruth is not null
            ? await judge.EvaluateCorrectnessAsync(c.Question, c.GeneratedAnswer, c.GroundTruth, ct)
            : null;

        var semantic = c.GroundTruth is not null
            ? await similarity.ScoreAsync(c.GeneratedAnswer, c.GroundTruth, ct)
            : (double?)null;

        // Context precision: how much of the retrieved context was useful?
        var contextPrecision = await EvaluateContextPrecisionAsync(
            c.Question, c.RetrievedContext, c.GeneratedAnswer, ct);

        return new RagEvalReport(
            CaseId:           c.Id,
            Question:         c.Question,
            Faithfulness:     faithfulness.Score,
            AnswerRelevance:  relevance,
            ContextPrecision: contextPrecision,
            Correctness:      correctness?.Score,
            SemanticSimilarity: semantic,
            OverallScore:     CalculateOverall(faithfulness.Score, relevance, contextPrecision),
            // Guard against a judge reply that omitted the array; consumers join it directly.
            UnsupportedClaims: faithfulness.UnsupportedClaims ?? []);
    }

    /// <summary>
    /// Returns the fraction of context chunks (split on blank lines) that the judge
    /// rates as relevant to the question; 0 when the context has no chunks.
    /// </summary>
    private async Task<double> EvaluateContextPrecisionAsync(
        string question, string context, string answer, CancellationToken ct)
    {
        // NOTE: `answer` is not consulted. Each chunk is scored against the question
        // using the relevance prompt, so this measures chunk-to-question relevance,
        // not whether the chunk actually contributed to the answer.
        var chunks = context.Split("\n\n", StringSplitOptions.RemoveEmptyEntries);
        if (chunks.Length == 0) return 0;

        int useful = 0;
        foreach (var chunk in chunks)
        {
            var chunkRelevance = await judge.EvaluateRelevanceAsync(question, chunk, ct);
            if (chunkRelevance > UsefulChunkThreshold) useful++;
        }

        return (double)useful / chunks.Length;
    }

    /// <summary>Unweighted mean of the three reference-free metrics.</summary>
    private static double CalculateOverall(double faithfulness, double relevance, double precision)
        => (faithfulness + relevance + precision) / 3.0;

    /// <summary>
    /// Evaluates every case sequentially and aggregates the reports.
    /// </summary>
    /// <param name="cases">Cases to evaluate; enumerated exactly once.</param>
    /// <param name="ct">Token forwarded to every evaluation.</param>
    /// <returns>
    /// Counts, averages and the failing reports. For an empty dataset every count and
    /// average is 0 and <c>FailedCases</c> is empty.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="cases"/> is null.</exception>
    public async Task<DatasetEvalSummary> EvaluateDatasetAsync(
        IEnumerable<RagEvalCase> cases,
        CancellationToken ct = default)
    {
        ArgumentNullException.ThrowIfNull(cases);

        var reports = new List<RagEvalReport>();

        foreach (var c in cases)
            reports.Add(await EvaluateAsync(c, ct));

        // Enumerable.Average throws InvalidOperationException on an empty sequence.
        // Zero averages keep threshold assertions failing for an empty dataset instead
        // of crashing with an unrelated error.
        if (reports.Count == 0)
        {
            return new DatasetEvalSummary(
                TotalCases:          0,
                PassedCases:         0,
                AvgFaithfulness:     0,
                AvgRelevance:        0,
                AvgContextPrecision: 0,
                AvgOverallScore:     0,
                FailedCases:         []);
        }

        return new DatasetEvalSummary(
            TotalCases:         reports.Count,
            PassedCases:        reports.Count(r => r.OverallScore >= PassThreshold),
            AvgFaithfulness:    reports.Average(r => r.Faithfulness),
            AvgRelevance:       reports.Average(r => r.AnswerRelevance),
            AvgContextPrecision: reports.Average(r => r.ContextPrecision),
            AvgOverallScore:    reports.Average(r => r.OverallScore),
            FailedCases:        reports.Where(r => r.OverallScore < PassThreshold).ToList());
    }
}

/// <summary>One RAG interaction to evaluate.</summary>
/// <param name="Id">Identifier carried onto the resulting report.</param>
/// <param name="Question">The user question.</param>
/// <param name="RetrievedContext">The context text that retrieval supplied for the question.</param>
/// <param name="GeneratedAnswer">The answer the RAG pipeline produced.</param>
/// <param name="GroundTruth">Optional reference answer; defaults to null.</param>
public record RagEvalCase(
    string Id,
    string Question,
    string RetrievedContext,
    string GeneratedAnswer,
    string? GroundTruth = null);

/// <summary>Per-case metric values produced by a RAG evaluation.</summary>
/// <param name="CaseId">Identifier of the evaluated case.</param>
/// <param name="Question">The question that was asked.</param>
/// <param name="Faithfulness">Faithfulness score for the answer.</param>
/// <param name="AnswerRelevance">Relevance score of the answer to the question.</param>
/// <param name="ContextPrecision">Context precision score for the retrieved context.</param>
/// <param name="Correctness">Correctness score; nullable.</param>
/// <param name="SemanticSimilarity">Semantic similarity score; nullable.</param>
/// <param name="OverallScore">Combined score for the case.</param>
/// <param name="UnsupportedClaims">Claims flagged as unsupported by the context.</param>
public record RagEvalReport(
    string CaseId,
    string Question,
    double Faithfulness,
    double AnswerRelevance,
    double ContextPrecision,
    double? Correctness,
    double? SemanticSimilarity,
    double OverallScore,
    List<string> UnsupportedClaims);

/// <summary>Aggregate results for a whole eval dataset.</summary>
/// <param name="TotalCases">Number of cases evaluated.</param>
/// <param name="PassedCases">Number of cases counted as passed.</param>
/// <param name="AvgFaithfulness">Mean faithfulness across the cases.</param>
/// <param name="AvgRelevance">Mean answer relevance across the cases.</param>
/// <param name="AvgContextPrecision">Mean context precision across the cases.</param>
/// <param name="AvgOverallScore">Mean overall score across the cases.</param>
/// <param name="FailedCases">Reports of the cases that did not pass.</param>
public record DatasetEvalSummary(
    int TotalCases,
    int PassedCases,
    double AvgFaithfulness,
    double AvgRelevance,
    double AvgContextPrecision,
    double AvgOverallScore,
    List<RagEvalReport> FailedCases);

Step 4: Eval Dataset Management

// Store eval cases as JSON — version-controlled alongside code
/// <summary>
/// Reads eval cases from, and writes eval reports to, JSON files on disk.
/// </summary>
public class EvalDatasetLoader
{
    // Dataset files use camelCase keys ("id", "retrievedContext", ...) while the
    // record properties are PascalCase. System.Text.Json is case-sensitive by default,
    // which would leave every field null. Comments and trailing commas are tolerated
    // because dataset files are hand-edited.
    private static readonly JsonSerializerOptions ReadOptions = new()
    {
        PropertyNameCaseInsensitive = true,
        ReadCommentHandling = JsonCommentHandling.Skip,
        AllowTrailingCommas = true,
    };

    // Cached so a new options instance is not built on every call.
    private static readonly JsonSerializerOptions WriteOptions = new() { WriteIndented = true };

    /// <summary>Loads a JSON array of eval cases from <paramref name="path"/>.</summary>
    /// <param name="path">Path of the dataset file.</param>
    /// <returns>The cases in file order.</returns>
    /// <exception cref="ArgumentException"><paramref name="path"/> is null or empty.</exception>
    /// <exception cref="InvalidDataException">The file contains a JSON <c>null</c> instead of an array.</exception>
    /// <exception cref="JsonException">The file is not valid JSON for a list of cases.</exception>
    public static List<RagEvalCase> LoadFromJson(string path)
    {
        ArgumentException.ThrowIfNullOrEmpty(path);

        var cases = JsonSerializer.Deserialize<List<RagEvalCase>>(File.ReadAllText(path), ReadOptions);

        return cases ?? throw new InvalidDataException(
            $"Eval dataset '{path}' contains JSON null instead of an array of cases.");
    }

    /// <summary>Writes <paramref name="reports"/> to <paramref name="outputPath"/> as indented JSON.</summary>
    /// <param name="reports">Reports to serialize.</param>
    /// <param name="outputPath">Destination file; overwritten if it already exists.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reports"/> is null.</exception>
    public static void Save(List<RagEvalReport> reports, string outputPath)
    {
        ArgumentNullException.ThrowIfNull(reports);

        var json = JsonSerializer.Serialize(reports, WriteOptions);
        File.WriteAllText(outputPath, json);
    }
}

JSON

// eval-cases/rag-product-qa.json
[
  {
    "id": "pc-001",
    "question": "What is the return policy for electronics?",
    "retrievedContext": "Electronics can be returned within 30 days of purchase. Items must be unopened. Refunds are issued within 5 business days.",
    "generatedAnswer": "Electronics can be returned within 30 days if unopened.",
    "groundTruth": "Electronics can be returned within 30 days if unopened. Refunds take 5 business days."
  },
  {
    "id": "pc-002",
    "question": "Do you offer free shipping?",
    "retrievedContext": "Free shipping applies to orders over $50. Standard shipping is $5.99.",
    "generatedAnswer": "Yes, free shipping is available on all orders.",
    "groundTruth": "Free shipping applies to orders over $50."
  }
]

Step 5: xUnit Integration Tests for LLM Quality

// Run evals as part of your test suite with quality thresholds
/// <summary>
/// Quality-gate tests that run the eval dataset through the evaluator and fail when
/// scores drop below fixed thresholds.
/// </summary>
/// <remarks>
/// The <see cref="EvalFixture"/> must come from xUnit via the constructor. xUnit only
/// calls <c>InitializeAsync</c> on the instance it creates for <c>IClassFixture</c>,
/// so a fixture created with <c>new</c> would keep its Evaluator, Judge and QnaService null.
/// </remarks>
public class LlmQualityTests(ITestOutputHelper output, EvalFixture fixture) : IClassFixture<EvalFixture>
{
    private readonly EvalFixture _fixture = fixture;

    /// <summary>Average faithfulness over the product Q&amp;A dataset must exceed 0.80.</summary>
    [Fact]
    public async Task ProductQA_FaithfulnessScore_ShouldExceedThreshold()
    {
        var cases = EvalDatasetLoader.LoadFromJson("eval-cases/rag-product-qa.json");
        var summary = await _fixture.Evaluator.EvaluateDatasetAsync(cases);

        output.WriteLine($"Avg faithfulness: {summary.AvgFaithfulness:P0}");
        output.WriteLine($"Avg relevance:    {summary.AvgRelevance:P0}");
        output.WriteLine($"Pass rate:        {summary.PassedCases}/{summary.TotalCases}");

        summary.AvgFaithfulness.Should().BeGreaterThan(0.80,
            "RAG answers must be grounded in retrieved context");
    }

    /// <summary>No case in the product Q&amp;A dataset may score below 0.5 on faithfulness.</summary>
    [Fact]
    public async Task ProductQA_HallucinationCases_ShouldBeZero()
    {
        var cases = EvalDatasetLoader.LoadFromJson("eval-cases/rag-product-qa.json");
        var reports = new List<RagEvalReport>();

        foreach (var c in cases)
            reports.Add(await _fixture.Evaluator.EvaluateAsync(c));

        var hallucinated = reports
            .Where(r => r.Faithfulness < 0.5)
            .ToList();

        // Log each offending case before asserting so the failure output names the claims.
        foreach (var r in hallucinated)
            output.WriteLine($"HALLUCINATION in {r.CaseId}: {string.Join(", ", r.UnsupportedClaims)}");

        hallucinated.Should().BeEmpty("zero tolerance for hallucinations in product Q&A");
    }

    /// <summary>
    /// Asks the live Q&amp;A service each question and requires the judged relevance of
    /// its answer to be at least <paramref name="minScore"/>.
    /// </summary>
    [Theory]
    [InlineData("What is the return policy?", 0.75)]
    [InlineData("Do you ship internationally?", 0.70)]
    public async Task SpecificQuestions_ShouldMeetMinimumRelevance(
        string question, double minScore)
    {
        var answer = await _fixture.QnaService.AnswerAsync(question);
        var score  = await _fixture.Judge.EvaluateRelevanceAsync(question, answer);

        score.Should().BeGreaterThanOrEqualTo(minScore);
    }
}

/// <summary>
/// Shared xUnit fixture that builds the judge and the RAG evaluator once per test class.
/// </summary>
public class EvalFixture : IAsyncLifetime
{
    public RagEvaluator Evaluator { get; private set; } = null!;
    public LlmJudge Judge { get; private set; } = null!;
    public ProductQnA QnaService { get; private set; } = null!;

    /// <summary>
    /// Creates the OpenAI-backed chat client and embedding generator and wires up
    /// <see cref="Judge"/> and <see cref="Evaluator"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The <c>OPENAI_API_KEY</c> environment variable is missing or blank.
    /// </exception>
    public Task InitializeAsync()
    {
        // Fail with an actionable message instead of passing null into the client.
        var apiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY");
        if (string.IsNullOrWhiteSpace(apiKey))
        {
            throw new InvalidOperationException(
                "The OPENAI_API_KEY environment variable is not set; LLM evals cannot run.");
        }

        // One client serves both the chat model and the embedding model.
        var openAi = new OpenAIClient(apiKey);

        // Use a cheaper model for evals (gpt-4o-mini, not gpt-4o)
        var chatClient = openAi.AsChatClient("gpt-4o-mini");
        var embedder   = openAi.AsEmbeddingGenerator("text-embedding-3-small");

        Judge      = new LlmJudge(chatClient);
        Evaluator  = new RagEvaluator(Judge, new SemanticSimilarityEvaluator(embedder));
        // QnaService = ... set up RAG pipeline
        // NOTE: QnaService stays null until the line above is filled in; tests that call
        // it will throw NullReferenceException.

        return Task.CompletedTask;
    }

    /// <summary>Nothing to release.</summary>
    public Task DisposeAsync() => Task.CompletedTask;
}

Step 6: CI Pipeline Integration

YAML

# .github/workflows/llm-evals.yml
# Runs the LLM eval test project on pushes to main that touch the listed paths and
# once a night, then publishes the TRX results as a check run.
name: LLM Quality Evals

on:
  push:
    branches: [main]
    paths:
      - 'src/**'
      - 'eval-cases/**'
      - 'tests/Evals/**'   # changes to the eval tests themselves should also re-run them
  schedule:
    - cron: '0 6 * * *'    # nightly regression check (GitHub cron schedules are in UTC)

# Explicit token permissions: checkout needs to read the repo, and the test reporter
# creates a check run. Without `checks: write` the publish step fails on repositories
# whose default GITHUB_TOKEN is read-only.
permissions:
  contents: read
  actions: read
  checks: write

jobs:
  evals:
    runs-on: ubuntu-latest
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-dotnet@v4
        with:
          dotnet-version: '9.x'

      - name: Run LLM quality tests
        run: dotnet test tests/Evals/ --logger "trx;LogFileName=evals.trx"

      # always(): publish the report even when the test step failed.
      - name: Publish eval report
        if: always()
        uses: dorny/test-reporter@v1
        with:
          name: LLM Eval Results
          path: '**/evals.trx'
          reporter: dotnet-trx

Step 7: Production Monitoring

// Sample production traffic and evaluate quality asynchronously
/// <summary>
/// Evaluates a random sample of production answers and emits the scores as metrics.
/// </summary>
public class ProductionEvalSampler(
    RagEvaluator evaluator,
    ILogger<ProductionEvalSampler> logger,
    IMetrics metrics)
{
    // Sample 5% of traffic for eval (too expensive to eval all)
    private const double SampleRate = 0.05;

    // Answers scoring below this on faithfulness are logged as warnings.
    private const double LowFaithfulnessThreshold = 0.5;

    /// <summary>
    /// Call this alongside every production answer. Most calls return immediately;
    /// a sampled call runs a full evaluation, records gauges and logs low-faithfulness
    /// answers. This method never throws from the evaluation path.
    /// </summary>
    /// <param name="question">The user question.</param>
    /// <param name="retrievedContext">The context text that was given to the model.</param>
    /// <param name="answer">The answer returned to the user.</param>
    /// <param name="ct">Token forwarded to the evaluation.</param>
    public async Task MaybeSampleAsync(
        string question,
        string retrievedContext,
        string answer,
        CancellationToken ct = default)
    {
        // Random.Shared is thread-safe. An instance-level Random is not, and this
        // method runs on concurrent request threads.
        if (Random.Shared.NextDouble() > SampleRate) return;

        try
        {
            var c = new RagEvalCase(
                Id:               Guid.NewGuid().ToString(),
                Question:         question,
                RetrievedContext: retrievedContext,
                GeneratedAnswer:  answer);

            var report = await evaluator.EvaluateAsync(c, ct);

            // Emit metrics to your observability stack
            metrics.RecordGauge("llm.faithfulness",     report.Faithfulness);
            metrics.RecordGauge("llm.answer_relevance", report.AnswerRelevance);
            metrics.RecordGauge("llm.overall_score",    report.OverallScore);

            if (report.Faithfulness < LowFaithfulnessThreshold)
                logger.LogWarning(
                    "Low faithfulness detected. Q: {Question} Claims: {Claims}",
                    question, string.Join(", ", report.UnsupportedClaims ?? []));
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // The caller cancelled (e.g. the request ended). That is not an eval
            // failure, so it is neither logged as an error nor rethrown.
        }
        catch (Exception ex)
        {
            // Never let eval failures affect production path
            logger.LogError(ex, "Eval sampling failed silently");
        }
    }
}

Interview Answer

"LLM evaluation in .NET uses two main approaches: embedding-based semantic similarity (cosine similarity between generated and expected answer embeddings) for fast regression testing, and LLM-as-judge for nuanced quality dimensions. For RAG systems, the key RAGAS-style metrics are faithfulness (does the answer stay within retrieved context?), answer relevance (does it address the question?), and context precision (how much of the retrieved context was useful?). Faithfulness is the most important metric — a score below 0.5 means hallucination. In practice: maintain a JSON eval dataset alongside code, run evals as xUnit tests in CI with quality thresholds (e.g., average faithfulness must exceed 80%), use a cheaper model like gpt-4o-mini for the judge to control costs, and sample 5% of production traffic for continuous quality monitoring without eval costs killing margins. Evals should block deployment when they regress below threshold — treat them like unit tests for correctness."