Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dotnet/agent-framework-dotnet.slnx
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,7 @@
<Folder Name="/Samples/05-end-to-end/Evaluation/">
<Project Path="samples/05-end-to-end/Evaluation/Evaluation_ConversationSplits/Evaluation_ConversationSplits.csproj" />
<Project Path="samples/05-end-to-end/Evaluation/Evaluation_FoundryQuality/Evaluation_FoundryQuality.csproj" />
<Project Path="samples/05-end-to-end/Evaluation/Evaluation_FoundryRubric/Evaluation_FoundryRubric.csproj" />
<Project Path="samples/05-end-to-end/Evaluation/Evaluation_MixedProviders/Evaluation_MixedProviders.csproj" />
</Folder>
<Folder Name="/Samples/05-end-to-end/A2AClientServer/">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,4 @@ dotnet run --project .\Evaluation_ExpectedOutputs

- [Evaluation_SimpleEval](../Evaluation_SimpleEval/) — Simplest evaluation with built-in and custom checks
- [Evaluation_FoundryQuality](../../../05-end-to-end/Evaluation/Evaluation_FoundryQuality/) — Cloud-based quality evaluation with Foundry evaluators
- [Evaluation_FoundryRubric](../../../05-end-to-end/Evaluation/Evaluation_FoundryRubric/) — Rubric (adaptive) evaluators with per-dimension scores
Comment thread
alliscode marked this conversation as resolved.
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ dotnet run --project .\Evaluation_Multimodal

- [Evaluation_SimpleEval](../Evaluation_SimpleEval/) — Simplest evaluation with built-in checks and `agent.EvaluateAsync()`
- [Evaluation_FoundryQuality](../../../05-end-to-end/Evaluation/Evaluation_FoundryQuality/) — Cloud-based quality evaluation with Foundry evaluators
- [Evaluation_FoundryRubric](../../../05-end-to-end/Evaluation/Evaluation_FoundryRubric/) — Rubric (adaptive) evaluators with per-dimension scores
- [Evaluation_ConversationSplits](../../../05-end-to-end/Evaluation/Evaluation_ConversationSplits/) — Multi-turn conversation split strategies
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- Built as a console executable so the sample can be launched with `dotnet run`. -->
<OutputType>Exe</OutputType>
<TargetFrameworks>net10.0</TargetFrameworks>

<!-- Nullable reference types and implicit global usings are both turned on. -->
<Nullable>enable</Nullable>
<ImplicitUsings>enable</ImplicitUsings>
</PropertyGroup>

<ItemGroup>
<!-- The sample compiles against the in-repo Microsoft.Agents.AI.Foundry project. -->
<ProjectReference Include="..\..\..\..\src\Microsoft.Agents.AI.Foundry\Microsoft.Agents.AI.Foundry.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
// Copyright (c) Microsoft. All rights reserved.

// This sample evaluates a pre-existing Azure AI Foundry agent against a rubric evaluator
// that was authored in the Foundry portal.
//
// Rubric evaluators are LLM-as-judge evaluators with custom scoring dimensions you define
// for your domain. agent-framework consumes pre-existing rubric evaluators — they are
// authored in the Foundry portal (or via the dedicated SDK / REST surface) and referenced
// here by name and version.
//
// Prerequisites:
// - An Azure AI Foundry project with a deployed model.
// - A registered Foundry agent in that project (the rubric was created against this agent).
// - A rubric evaluator already created in the Foundry portal.
// - .env (or environment) populated with the FOUNDRY_* variables below.
//
// IMPORTANT: FOUNDRY_PROJECT_ENDPOINT must be the project-scoped URL
// https://<resource>.services.ai.azure.com/api/projects/<project>
// A bare Azure OpenAI endpoint silently fails eval submission with HTTP 500.

using Azure.AI.Projects;
using Azure.AI.Projects.Agents;
using Azure.Identity;
using Microsoft.Agents.AI;
using Microsoft.Agents.AI.Foundry;
using FoundryEvals = Microsoft.Agents.AI.Foundry.FoundryEvals;

// Reads an environment variable, treating an empty or whitespace-only value the same as an
// unset one. A variable can be exported as an empty string (for example by a CI pipeline
// whose optional input was left blank); without this normalization an empty
// FOUNDRY_AGENT_VERSION / FOUNDRY_RUBRIC_VERSION would be used as a pinned version ""
// instead of falling back to the latest version.
static string? ReadOptionalSetting(string name)
{
    string? value = Environment.GetEnvironmentVariable(name);
    return string.IsNullOrWhiteSpace(value) ? null : value;
}

// Reads a mandatory environment variable and fails fast, naming the variable, when it is
// missing or blank.
static string ReadRequiredSetting(string name) =>
    ReadOptionalSetting(name) ?? throw new InvalidOperationException($"{name} is not set.");

string projectEndpoint = ReadRequiredSetting("FOUNDRY_PROJECT_ENDPOINT");
string model = ReadRequiredSetting("FOUNDRY_MODEL");
string agentName = ReadRequiredSetting("FOUNDRY_AGENT_NAME");

// Optional: null selects the agent record without a pinned version.
string? agentVersion = ReadOptionalSetting("FOUNDRY_AGENT_VERSION");
string rubricName = ReadRequiredSetting("FOUNDRY_RUBRIC_NAME");

// Optional: null selects the latest rubric version (pin it for reproducible CI runs).
string? rubricVersion = ReadOptionalSetting("FOUNDRY_RUBRIC_VERSION");

// NOTE: DefaultAzureCredential keeps local development simple, but weigh it carefully
// before using it in production. A ManagedIdentityCredential (or another specific
// credential) avoids added latency, unintended credential probing, and fallback security risks.
Uri projectUri = new(projectEndpoint);
AIProjectClient projectClient = new(projectUri, new DefaultAzureCredential());

// Step 1: resolve the existing Foundry agent the rubric was created against. Use the
// requested version when FOUNDRY_AGENT_VERSION is supplied, otherwise the agent record itself.
FoundryAgent agent;
if (agentVersion is not null)
{
    ProjectsAgentVersion pinnedAgent = await projectClient.AgentAdministrationClient.GetAgentVersionAsync(agentName, agentVersion);
    agent = projectClient.AsAIAgent(pinnedAgent);
}
else
{
    ProjectsAgentRecord currentAgent = await projectClient.AgentAdministrationClient.GetAgentAsync(agentName);
    agent = projectClient.AsAIAgent(currentAgent);
}

// Step 2: point at the existing rubric evaluator by name and, ideally, version.
// Pinning the version keeps CI runs reproducible. Without one, the reference resolves to
// whichever version is current when the run executes, and a Trace.TraceWarning is emitted
// each time a criterion is built from it.
GeneratedEvaluatorRef rubric = rubricVersion is not null
    ? new GeneratedEvaluatorRef(rubricName, rubricVersion)
    : GeneratedEvaluatorRef.Latest(rubricName);

// Step 3: combine the rubric with built-in evaluators in one FoundryEvals configuration.
// Implicit conversions allow evaluator names and rubric references to be passed side by side.
FoundryEvals evals = new(
    projectClient,
    model,
    rubric, FoundryEvals.Relevance, FoundryEvals.Coherence);

// Step 4: send two sample prompts to the agent and score every response in a single
// evaluation call.
string[] queries =
[
    "What's the weather like in Seattle?",
    "Should I bring an umbrella to London tomorrow?",
];

string bannerRule = new('=', 60);
string versionLabel = rubricVersion ?? "latest";

Console.WriteLine(bannerRule);
Console.WriteLine($"Evaluating '{agent.Name}' with rubric '{rubricName}' (version {versionLabel})");
Console.WriteLine(bannerRule);

AgentEvaluationResults results = await agent.EvaluateAsync(queries, evals);

// Headline summary: run status, pass count, and a link to the portal report when one is available.
Console.WriteLine($"Status: {results.Status}");
Console.WriteLine($"Results: {results.Passed}/{results.Total} passed");
if (results.ReportUrl is not null)
{
    Console.WriteLine($"Portal: {results.ReportUrl}");
}

if (results.Passed == results.Total)
{
    Console.WriteLine("[PASS] All passed");
}
else
{
    Console.WriteLine($"[FAIL] {results.Failed} failed");
}

// Step 5: show the per-dimension breakdown for every evaluated item. This breakdown is
// what a rubric evaluator adds over the built-in numeric evaluators.
Console.WriteLine();
Console.WriteLine(new string('=', 60));
Console.WriteLine("Per-dimension scores");
Console.WriteLine(new string('=', 60));

if (results.DetailedItems is { Count: > 0 })
{
    for (int index = 0; index < results.DetailedItems.Count; index++)
    {
        EvalItemResult detail = results.DetailedItems[index];

        // Label the item with the query at the same position, when there is one.
        string querySuffix = index < queries.Length ? $" — \"{queries[index]}\"" : string.Empty;
        Console.WriteLine($"Item {index + 1}{querySuffix}");

        foreach (EvalScoreResult evaluatorScore in detail.Scores)
        {
            // Append a pass/fail marker only when the evaluator reported one.
            string verdict = string.Empty;
            if (evaluatorScore.Passed is bool passed)
            {
                verdict = passed ? " (pass)" : " (fail)";
            }

            Console.WriteLine($" {evaluatorScore.Name}: {evaluatorScore.Score:F1}{verdict}");

            // Only scores that carry rubric dimensions have a breakdown to print.
            if (evaluatorScore.Dimensions is not { Count: > 0 } dimensions)
            {
                continue;
            }

            foreach (RubricScore dimension in dimensions)
            {
                string dimensionScore = dimension.Score is int points ? points.ToString() : "n/a";
                Console.WriteLine($" - {dimension.Id}: {dimensionScore} (weight={dimension.Weight}, applicable={dimension.Applicable})");
            }
        }

        Console.WriteLine();
    }
}

// Step 6: CI quality gate. Set a failing exit code when a critical dimension falls below
// the threshold. Change GatedDimension to a dimension id that your rubric actually defines.
const string GatedDimension = "general_quality";

Console.WriteLine(new string('=', 60));
Console.WriteLine("Per-dimension quality gate");
Console.WriteLine(new string('=', 60));

try
{
    results.AssertDimensionScoreAtLeast(GatedDimension, minScore: 3.0, evaluator: rubricName, requireApplicable: true);
    Console.WriteLine($"[PASS] {results.ProviderName}: {GatedDimension} >= 3 on every item");
}
catch (InvalidOperationException gateFailure)
{
    // Report the failure and signal it to the caller through the process exit code.
    Console.WriteLine($"[FAIL] {results.ProviderName}: dimension gate tripped: {gateFailure.Message}");
    Environment.ExitCode = 1;
}
Comment thread
alliscode marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Evaluation — Foundry Rubric

This sample evaluates a pre-existing Azure AI Foundry agent against a **rubric evaluator**
authored in the Foundry portal. Rubric evaluators are LLM-as-judge evaluators with custom
scoring dimensions you define for your domain; agent-framework references them by name and
version, mixes them with built-in evaluators, and exposes per-dimension scores you can gate
CI on.

## What this sample demonstrates

- Connecting to a pre-existing Foundry agent (`AgentAdministrationClient.GetAgentAsync`).
- Referencing a pre-existing rubric evaluator via `GeneratedEvaluatorRef(name, version)`.
- Mixing the rubric with built-in evaluators (`Relevance`, `Coherence`) in one
`FoundryEvals` run.
- Reading per-dimension breakdowns from `EvalScoreResult.Dimensions`.
- Gating CI on a per-dimension threshold via
`AgentEvaluationResults.AssertDimensionScoreAtLeast(...)`.

## Prerequisites

- .NET 10 SDK or later.
- Azure CLI installed and authenticated (`az login`).
- An Azure AI Foundry project with a deployed model.
- A registered Foundry agent in that project (the agent the rubric was created against).
- A rubric evaluator created in the Foundry portal. Creating rubrics through the portal
currently requires picking a Foundry agent as the generation context, so this
prerequisite is implied by having a rubric at all.

> [!IMPORTANT]
> `FOUNDRY_PROJECT_ENDPOINT` **must** be the project-scoped URL
> `https://<resource>.services.ai.azure.com/api/projects/<project>`. A bare Azure OpenAI
> endpoint silently fails eval submission with HTTP 500.

> [!NOTE]
> An **Eval Definition** (a saved bundle of testing_criteria with `"object": "eval"`) is
> not the same as a **Rubric Evaluator** (a standalone evaluator with dimensions, weights,
> and a version). `GeneratedEvaluatorRef` points at the latter.

## Environment variables

```powershell
$env:FOUNDRY_PROJECT_ENDPOINT="https://your-resource.services.ai.azure.com/api/projects/your-project"
$env:FOUNDRY_MODEL="gpt-4o-mini"
$env:FOUNDRY_AGENT_NAME="your-agent-name"
$env:FOUNDRY_AGENT_VERSION="1" # optional; omit for latest
$env:FOUNDRY_RUBRIC_NAME="your-rubric-name"
$env:FOUNDRY_RUBRIC_VERSION="1" # optional; omit for latest (CI: pin this)
```

## Run the sample

```powershell
cd dotnet/samples/05-end-to-end/Evaluation
dotnet run --project .\Evaluation_FoundryRubric
```
Original file line number Diff line number Diff line change
Expand Up @@ -148,19 +148,68 @@ internal static WireEvalItemPayload ConvertEvalItem(EvalItem item, IConversation
/// <summary>
/// Builds the <c>testing_criteria</c> array for <c>evals.create()</c>.
/// </summary>
/// <param name="evaluators">Evaluator names (short or fully-qualified).</param>
/// <param name="evaluators">
/// Evaluator specs — built-in evaluator names (short or fully-qualified) and/or
/// <see cref="GeneratedEvaluatorRef"/> instances for pre-existing rubric evaluators.
/// </param>
/// <param name="model">Model deployment name for the LLM judge.</param>
/// <param name="includeDataMapping">
/// Whether to include field-level data mapping (required for JSONL data source).
/// </param>
/// <param name="includeToolDefinitions">
/// Whether the mapped data items include tool definitions. Used to add a
/// <c>tool_definitions</c> mapping entry for rubric evaluators (built-in evaluators
/// derive this from their own <see cref="ToolEvaluators"/> membership).
/// </param>
internal static List<WireTestingCriterion> BuildTestingCriteria(
IEnumerable<string> evaluators,
IEnumerable<FoundryEvaluatorSpec> evaluators,
string model,
bool includeDataMapping = false)
bool includeDataMapping = false,
bool includeToolDefinitions = false)
{
var criteria = new List<WireTestingCriterion>();
foreach (var name in evaluators)
foreach (var spec in evaluators)
{
if (spec.IsRubric)
{
var @ref = spec.GeneratedRef!;
Dictionary<string, string>? refMapping = null;
if (includeDataMapping)
{
// Rubric evaluators accept conversation arrays like agent evaluators,
// plus tool_definitions when items are tool-aware.
refMapping = new Dictionary<string, string>
{
["query"] = "{{item.query_messages}}",
["response"] = "{{item.response_messages}}",
};

if (includeToolDefinitions)
{
refMapping["tool_definitions"] = "{{item.tool_definitions}}";
}
}

criteria.Add(new WireTestingCriterion
{
Name = @ref.DisplayName ?? @ref.Name,
EvaluatorName = @ref.Name,
EvaluatorVersion = @ref.Version,
InitializationParameters = new WireInitParams { DeploymentName = model },
DataMapping = refMapping,
});

if (@ref.Version is null)
{
System.Diagnostics.Trace.TraceWarning(
"GeneratedEvaluatorRef '{0}' has no pinned version; the eval run will resolve to whichever version is current at execution time. Pin the version for reproducible runs.",
@ref.Name);
}

continue;
}

var name = spec.BuiltinName!;
var qualified = ResolveEvaluator(name);
Comment thread
alliscode marked this conversation as resolved.
var shortName = name.StartsWith("builtin.", StringComparison.Ordinal)
? name.Substring("builtin.".Length)
Expand Down Expand Up @@ -248,8 +297,12 @@ internal static WireItemSchema BuildItemSchema(bool hasContext = false, bool has
/// Returns the subset of <paramref name="evaluators"/> that require a ground-truth
/// (reference) value but cannot be evaluated because no item provided one.
/// </summary>
/// <remarks>
/// Rubric references (<see cref="GeneratedEvaluatorRef"/>) are skipped — they are not
/// ground-truth–dependent on the wire.
/// </remarks>
internal static List<string> FindMissingGroundTruthEvaluators(
IEnumerable<string> evaluators,
IEnumerable<FoundryEvaluatorSpec> evaluators,
bool hasGroundTruth)
{
if (hasGroundTruth)
Expand All @@ -258,8 +311,14 @@ internal static List<string> FindMissingGroundTruthEvaluators(
}

var missing = new List<string>();
foreach (var name in evaluators)
foreach (var spec in evaluators)
{
if (spec.IsRubric)
{
continue;
}

var name = spec.BuiltinName!;
if (GroundTruthEvaluators.Contains(ResolveEvaluator(name)))
{
missing.Add(name);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ internal sealed class WireTestingCriterion
[JsonPropertyName("evaluator_name")]
public required string EvaluatorName { get; init; }

[JsonPropertyName("evaluator_version")]
public string? EvaluatorVersion { get; init; }

[JsonPropertyName("initialization_parameters")]
public required WireInitParams InitializationParameters { get; init; }

Expand Down
Loading
Loading