microsoft · alliscode · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/dotnet/agent-framework-dotnet.slnx b/dotnet/agent-framework-dotnet.slnx
@@ -403,6 +403,7 @@
   <Folder Name="/Samples/05-end-to-end/Evaluation/">
     <Project Path="samples/05-end-to-end/Evaluation/Evaluation_ConversationSplits/Evaluation_ConversationSplits.csproj" />
     <Project Path="samples/05-end-to-end/Evaluation/Evaluation_FoundryQuality/Evaluation_FoundryQuality.csproj" />
+    <Project Path="samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/Evaluation_FoundryRubric.csproj" />
     <Project Path="samples/05-end-to-end/Evaluation/Evaluation_MixedProviders/Evaluation_MixedProviders.csproj" />
   </Folder>
   <Folder Name="/Samples/05-end-to-end/A2AClientServer/">

diff --git a/dotnet/samples/02-agents/Evaluation/Evaluation_ExpectedOutputs/README.md b/dotnet/samples/02-agents/Evaluation/Evaluation_ExpectedOutputs/README.md
@@ -31,3 +31,4 @@ dotnet run --project .\Evaluation_ExpectedOutputs
 
 - [Evaluation_SimpleEval](../Evaluation_SimpleEval/) — Simplest evaluation with built-in and custom checks
 - [Evaluation_FoundryQuality](../../../05-end-to-end/Evaluation/Evaluation_FoundryQuality/) — Cloud-based quality evaluation with Foundry evaluators
+- [Evaluation_FoundryRubric](../../../05-end-to-end/Evaluation/Evaluation_FoundryRubric/) — Rubric (adaptive) evaluators with per-dimension scores
diff --git a/dotnet/samples/02-agents/Evaluation/Evaluation_Multimodal/README.md b/dotnet/samples/02-agents/Evaluation/Evaluation_Multimodal/README.md
@@ -26,4 +26,5 @@ dotnet run --project .\Evaluation_Multimodal
 
 - [Evaluation_SimpleEval](../Evaluation_SimpleEval/) — Simplest evaluation with built-in checks and `agent.EvaluateAsync()`
 - [Evaluation_FoundryQuality](../../../05-end-to-end/Evaluation/Evaluation_FoundryQuality/) — Cloud-based quality evaluation with Foundry evaluators
+- [Evaluation_FoundryRubric](../../../05-end-to-end/Evaluation/Evaluation_FoundryRubric/) — Rubric (adaptive) evaluators with per-dimension scores
 - [Evaluation_ConversationSplits](../../../05-end-to-end/Evaluation/Evaluation_ConversationSplits/) — Multi-turn conversation split strategies
diff --git a/...samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/Evaluation_FoundryRubric.csproj b/...samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/Evaluation_FoundryRubric.csproj
@@ -0,0 +1,15 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFrameworks>net10.0</TargetFrameworks>
+
+    <Nullable>enable</Nullable>
+    <ImplicitUsings>enable</ImplicitUsings>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\..\..\src\Microsoft.Agents.AI.Foundry\Microsoft.Agents.AI.Foundry.csproj" />
+  </ItemGroup>
+
+</Project>
diff --git a/dotnet/samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/Program.cs b/dotnet/samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/Program.cs
@@ -0,0 +1,141 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+// This sample evaluates a pre-existing Azure AI Foundry agent against a rubric evaluator
+// that was authored in the Foundry portal.
+//
+// Rubric evaluators are LLM-as-judge evaluators with custom scoring dimensions you define
+// for your domain. agent-framework consumes pre-existing rubric evaluators — they are
+// authored in the Foundry portal (or via the dedicated SDK / REST surface) and referenced
+// here by name and version.
+//
+// Prerequisites:
+//   - An Azure AI Foundry project with a deployed model.
+//   - A registered Foundry agent in that project (the rubric was created against this agent).
+//   - A rubric evaluator already created in the Foundry portal.
+//   - The FOUNDRY_* environment variables read below, set in the process environment
+//     (this sample reads environment variables only; it does not load a .env file).
+//
+// IMPORTANT: FOUNDRY_PROJECT_ENDPOINT must be the project-scoped URL
+//   https://<resource>.services.ai.azure.com/api/projects/<project>
+// A bare Azure OpenAI endpoint silently fails eval submission with HTTP 500.
+
+using Azure.AI.Projects;
+using Azure.AI.Projects.Agents;
+using Azure.Identity;
+using Microsoft.Agents.AI;
+using Microsoft.Agents.AI.Foundry;
+using FoundryEvals = Microsoft.Agents.AI.Foundry.FoundryEvals;
+
+// Configuration comes from the process environment. A variable that is set but empty or
+// whitespace-only (for example `export FOUNDRY_AGENT_VERSION=`) is treated as unset, so a
+// blank value is never used as an endpoint, model, name, or version.
+string projectEndpoint = GetRequiredSetting("FOUNDRY_PROJECT_ENDPOINT");
+string model = GetRequiredSetting("FOUNDRY_MODEL");
+string agentName = GetRequiredSetting("FOUNDRY_AGENT_NAME");
+string? agentVersion = GetOptionalSetting("FOUNDRY_AGENT_VERSION");
+string rubricName = GetRequiredSetting("FOUNDRY_RUBRIC_NAME");
+string? rubricVersion = GetOptionalSetting("FOUNDRY_RUBRIC_VERSION");
+
+// Returns the value of the named environment variable, or null when it is unset or blank.
+static string? GetOptionalSetting(string variableName)
+{
+    string? value = Environment.GetEnvironmentVariable(variableName);
+    return string.IsNullOrWhiteSpace(value) ? null : value;
+}
+
+// Returns the value of the named environment variable; throws InvalidOperationException
+// (with the "<NAME> is not set." message) when it is unset or blank.
+static string GetRequiredSetting(string variableName) =>
+    GetOptionalSetting(variableName)
+        ?? throw new InvalidOperationException($"{variableName} is not set.");
+
+// WARNING: DefaultAzureCredential keeps local development simple, but think carefully
+// before using it in production. ManagedIdentityCredential (or another specific credential)
+// avoids the added latency, unintended credential probing, and fallback security risks.
+AIProjectClient projectClient = new(new Uri(projectEndpoint), new DefaultAzureCredential());
+
+// 1. Resolve the pre-existing Foundry agent the rubric was created against: a specific
+//    agent version when one was supplied, otherwise the agent record looked up by name.
+FoundryAgent agent;
+if (agentVersion is not null)
+{
+    ProjectsAgentVersion pinnedAgent = await projectClient.AgentAdministrationClient.GetAgentVersionAsync(agentName, agentVersion);
+    agent = projectClient.AsAIAgent(pinnedAgent);
+}
+else
+{
+    ProjectsAgentRecord unpinnedAgent = await projectClient.AgentAdministrationClient.GetAgentAsync(agentName);
+    agent = projectClient.AsAIAgent(unpinnedAgent);
+}
+
+// 2. Point at the pre-existing rubric evaluator by name and, when supplied, version.
+//    Pin a version for reproducible CI runs: a versionless ref resolves to whatever is
+//    current at run time and emits a Trace.TraceWarning on each criterion build.
+GeneratedEvaluatorRef rubric = rubricVersion switch
+{
+    null => GeneratedEvaluatorRef.Latest(rubricName),
+    string pinnedVersion => new GeneratedEvaluatorRef(rubricName, pinnedVersion),
+};
+
+// 3. Combine the rubric with built-in evaluators in one FoundryEvals config.
+//    Thanks to the implicit conversion, strings and refs can be passed side by side.
+FoundryEvals evals = new(
+    projectClient,
+    model,
+    rubric,
+    FoundryEvals.Relevance,
+    FoundryEvals.Coherence);
+
+// 4. Send two example queries to the agent and evaluate both outputs in a single call.
+string[] queries =
+[
+    "What's the weather like in Seattle?",
+    "Should I bring an umbrella to London tomorrow?",
+];
+
+string runBanner = new('=', 60);
+string versionLabel = rubricVersion ?? "latest";
+Console.WriteLine(runBanner);
+Console.WriteLine($"Evaluating '{agent.Name}' with rubric '{rubricName}' (version {versionLabel})");
+Console.WriteLine(runBanner);
+
+AgentEvaluationResults results = await agent.EvaluateAsync(queries, evals);
+
+// Overall summary: run status, pass count, and the portal link when one was returned.
+Console.WriteLine($"Status: {results.Status}");
+Console.WriteLine($"Results: {results.Passed}/{results.Total} passed");
+if (results.ReportUrl is { } reportUrl)
+{
+    Console.WriteLine($"Portal: {reportUrl}");
+}
+
+if (results.Passed == results.Total)
+{
+    Console.WriteLine("[PASS] All passed");
+}
+else
+{
+    Console.WriteLine($"[FAIL] {results.Failed} failed");
+}
+
+// 5. Show the per-dimension breakdown of every evaluated item. This breakdown is what a
+//    rubric evaluator offers beyond the built-in numeric evaluators.
+Console.WriteLine();
+string breakdownBanner = new('=', 60);
+Console.WriteLine(breakdownBanner);
+Console.WriteLine("Per-dimension scores");
+Console.WriteLine(breakdownBanner);
+
+if (results.DetailedItems is { Count: > 0 } detailedItems)
+{
+    for (int index = 0; index < detailedItems.Count; index++)
+    {
+        EvalItemResult evaluated = detailedItems[index];
+
+        // Label the item with its query text while the index is still within the query list.
+        string queryLabel = index < queries.Length ? $" — \"{queries[index]}\"" : string.Empty;
+        Console.WriteLine($"Item {index + 1}{queryLabel}");
+
+        foreach (EvalScoreResult evaluatorScore in evaluated.Scores)
+        {
+            // The pass/fail suffix is only printed when the evaluator reported a verdict.
+            string verdict = string.Empty;
+            if (evaluatorScore.Passed is bool hasPassed)
+            {
+                verdict = hasPassed ? " (pass)" : " (fail)";
+            }
+
+            Console.WriteLine($"  {evaluatorScore.Name}: {evaluatorScore.Score:F1}{verdict}");
+
+            // Scores without any dimensions have nothing further to print.
+            if (evaluatorScore.Dimensions is not { Count: > 0 } dimensionScores)
+            {
+                continue;
+            }
+
+            foreach (RubricScore dimension in dimensionScores)
+            {
+                string dimensionValue = dimension.Score is int points ? points.ToString() : "n/a";
+                Console.WriteLine($"    - {dimension.Id}: {dimensionValue}  (weight={dimension.Weight}, applicable={dimension.Applicable})");
+            }
+        }
+
+        Console.WriteLine();
+    }
+}
+
+// 6. CI quality gate — fail the build if a critical dimension drops below threshold.
+//    The gated dimension id and its threshold are declared once here so the assertion and
+//    the messages printed below cannot drift apart. Replace "general_quality" with
+//    whatever dimension id your rubric actually defines.
+const string GateDimensionId = "general_quality";
+const double GateMinScore = 3.0;
+
+Console.WriteLine(new string('=', 60));
+Console.WriteLine("Per-dimension quality gate");
+Console.WriteLine(new string('=', 60));
+
+try
+{
+    results.AssertDimensionScoreAtLeast(GateDimensionId, minScore: GateMinScore, evaluator: rubricName, requireApplicable: true);
+    Console.WriteLine($"[PASS] {results.ProviderName}: {GateDimensionId} >= {GateMinScore} on every item");
+}
+catch (InvalidOperationException ex)
+{
+    // Report the tripped gate and surface it to CI through a non-zero process exit code.
+    Console.WriteLine($"[FAIL] {results.ProviderName}: dimension gate tripped: {ex.Message}");
+    Environment.ExitCode = 1;
+}
diff --git a/dotnet/samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/README.md b/dotnet/samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/README.md
@@ -0,0 +1,55 @@
+# Evaluation — Foundry Rubric
+
+This sample evaluates a pre-existing Azure AI Foundry agent against a **rubric evaluator**
+authored in the Foundry portal. Rubric evaluators are LLM-as-judge evaluators with custom
+scoring dimensions you define for your domain; agent-framework references them by name and
+version, mixes them with built-in evaluators, and exposes per-dimension scores you can gate
+CI on.
+
+## What this sample demonstrates
+
+- Connecting to a pre-existing Foundry agent (`AgentAdministrationClient.GetAgentAsync`).
+- Referencing a pre-existing rubric evaluator via `GeneratedEvaluatorRef(name, version)`.
+- Mixing the rubric with built-in evaluators (`Relevance`, `Coherence`) in one
+  `FoundryEvals` run.
+- Reading per-dimension breakdowns from `EvalScoreResult.Dimensions`.
+- Gating CI on a per-dimension threshold via
+  `AgentEvaluationResults.AssertDimensionScoreAtLeast(...)`.
+
+## Prerequisites
+
+- .NET 10 SDK or later.
+- Azure CLI installed and authenticated (`az login`).
+- An Azure AI Foundry project with a deployed model.
+- A registered Foundry agent in that project (the agent the rubric was created against).
+- A rubric evaluator created in the Foundry portal. Creating rubrics through the portal
+  currently requires picking a Foundry agent as the generation context, so the agent
+  prerequisite above is implied by having a rubric at all.
+
+> [!IMPORTANT]
+> `FOUNDRY_PROJECT_ENDPOINT` **must** be the project-scoped URL
+> `https://<resource>.services.ai.azure.com/api/projects/<project>`. A bare Azure OpenAI
+> endpoint silently fails eval submission with HTTP 500.
+
+> [!NOTE]
+> An **Eval Definition** (a saved bundle of testing_criteria with `"object": "eval"`) is
+> not the same as a **Rubric Evaluator** (a standalone evaluator with dimensions, weights,
+> and a version). `GeneratedEvaluatorRef` points at the latter.
+
+## Environment variables
+
+```powershell
+$env:FOUNDRY_PROJECT_ENDPOINT="https://your-resource.services.ai.azure.com/api/projects/your-project"
+$env:FOUNDRY_MODEL="gpt-4o-mini"
+$env:FOUNDRY_AGENT_NAME="your-agent-name"
+$env:FOUNDRY_AGENT_VERSION="1"                   # optional; omit for latest
+$env:FOUNDRY_RUBRIC_NAME="your-rubric-name"
+$env:FOUNDRY_RUBRIC_VERSION="1"                  # optional; omit for latest (CI: pin this)
+```
+
+## Run the sample
+
+```powershell
+cd dotnet/samples/05-end-to-end/Evaluation
+dotnet run --project .\Evaluation_FoundryRubric
+```
diff --git a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalConverter.cs b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalConverter.cs
@@ -148,19 +148,68 @@ internal static WireEvalItemPayload ConvertEvalItem(EvalItem item, IConversation
     /// <summary>
     /// Builds the <c>testing_criteria</c> array for <c>evals.create()</c>.
     /// </summary>
-    /// <param name="evaluators">Evaluator names (short or fully-qualified).</param>
+    /// <param name="evaluators">
+    /// Evaluator specs — built-in evaluator names (short or fully-qualified) and/or
+    /// <see cref="GeneratedEvaluatorRef"/> instances for pre-existing rubric evaluators.
+    /// </param>
     /// <param name="model">Model deployment name for the LLM judge.</param>
     /// <param name="includeDataMapping">
     /// Whether to include field-level data mapping (required for JSONL data source).
     /// </param>
+    /// <param name="includeToolDefinitions">
+    /// Whether the mapped data items include tool definitions. Used to add a
+    /// <c>tool_definitions</c> mapping entry for rubric evaluators (built-in evaluators
+    /// derive this from their own <see cref="ToolEvaluators"/> membership).
+    /// </param>
     internal static List<WireTestingCriterion> BuildTestingCriteria(
-        IEnumerable<string> evaluators,
+        IEnumerable<FoundryEvaluatorSpec> evaluators,
         string model,
-        bool includeDataMapping = false)
+        bool includeDataMapping = false,
+        bool includeToolDefinitions = false)
     {
         var criteria = new List<WireTestingCriterion>();
-        foreach (var name in evaluators)
+        foreach (var spec in evaluators)
         {
+            if (spec.IsRubric)
+            {
+                var @ref = spec.GeneratedRef!;
+                Dictionary<string, string>? refMapping = null;
+                if (includeDataMapping)
+                {
+                    // Rubric evaluators accept conversation arrays like agent evaluators,
+                    // plus tool_definitions when items are tool-aware.
+                    refMapping = new Dictionary<string, string>
+                    {
+                        ["query"] = "{{item.query_messages}}",
+                        ["response"] = "{{item.response_messages}}",
+                    };
+
+                    if (includeToolDefinitions)
+                    {
+                        refMapping["tool_definitions"] = "{{item.tool_definitions}}";
+                    }
+                }
+
+                criteria.Add(new WireTestingCriterion
+                {
+                    Name = @ref.DisplayName ?? @ref.Name,
+                    EvaluatorName = @ref.Name,
+                    EvaluatorVersion = @ref.Version,
+                    InitializationParameters = new WireInitParams { DeploymentName = model },
+                    DataMapping = refMapping,
+                });
+
+                if (@ref.Version is null)
+                {
+                    System.Diagnostics.Trace.TraceWarning(
+                        "GeneratedEvaluatorRef '{0}' has no pinned version; the eval run will resolve to whichever version is current at execution time. Pin the version for reproducible runs.",
+                        @ref.Name);
+                }
+
+                continue;
+            }
+
+            var name = spec.BuiltinName!;
             var qualified = ResolveEvaluator(name);
             var shortName = name.StartsWith("builtin.", StringComparison.Ordinal)
                 ? name.Substring("builtin.".Length)
@@ -248,8 +297,12 @@ internal static WireItemSchema BuildItemSchema(bool hasContext = false, bool has
     /// Returns the subset of <paramref name="evaluators"/> that require a ground-truth
     /// (reference) value but cannot be evaluated because no item provided one.
     /// </summary>
+    /// <remarks>
+    /// Rubric references (<see cref="GeneratedEvaluatorRef"/>) are skipped — they are not
+    /// ground-truth–dependent on the wire.
+    /// </remarks>
     internal static List<string> FindMissingGroundTruthEvaluators(
-        IEnumerable<string> evaluators,
+        IEnumerable<FoundryEvaluatorSpec> evaluators,
         bool hasGroundTruth)
     {
         if (hasGroundTruth)
@@ -258,8 +311,14 @@ internal static List<string> FindMissingGroundTruthEvaluators(
         }
 
         var missing = new List<string>();
-        foreach (var name in evaluators)
+        foreach (var spec in evaluators)
         {
+            if (spec.IsRubric)
+            {
+                continue;
+            }
+
+            var name = spec.BuiltinName!;
             if (GroundTruthEvaluators.Contains(ResolveEvaluator(name)))
             {
                 missing.Add(name);

diff --git a/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalWireModels.cs b/dotnet/src/Microsoft.Agents.AI.Foundry/Evaluation/FoundryEvalWireModels.cs
@@ -137,6 +137,9 @@ internal sealed class WireTestingCriterion
     [JsonPropertyName("evaluator_name")]
     public required string EvaluatorName { get; init; }
 
+    [JsonPropertyName("evaluator_version")]
+    public string? EvaluatorVersion { get; init; }
+
     [JsonPropertyName("initialization_parameters")]
     public required WireInitParams InitializationParameters { get; init; }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -31,3 +31,4 @@ dotnet run --project .\Evaluation_ExpectedOutputs

		- [Evaluation_SimpleEval](../Evaluation_SimpleEval/) — Simplest evaluation with built-in and custom checks
		- [Evaluation_FoundryQuality](../../../05-end-to-end/Evaluation/Evaluation_FoundryQuality/) — Cloud-based quality evaluation with Foundry evaluators
		- [Evaluation_FoundryRubric](../../../05-end-to-end/Evaluation/Evaluation_FoundryRubric/) — Rubric (adaptive) evaluators with per-dimension scores
Comment thread alliscode marked this conversation as resolved.