From 514033c611b6e7d857caa85ab1d6394ac0de538c Mon Sep 17 00:00:00 2001 From: Bo Date: Fri, 29 May 2026 14:49:49 -0400 Subject: [PATCH 1/4] =?UTF-8?q?feat(eval):=20ao=20eval=20outcomes=20compil?= =?UTF-8?q?e=20=E2=80=94=20holdout-safe=20rubric=20payload=20(ag-hdqu0=20#?= =?UTF-8?q?compile-strip)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 'ao eval outcomes compile ' under evalCmd: projects a locked Task + criteria into an Outcomes rubric via the merged evalsubstrate.ProjectRubric, then re-scans (ContainsAny, guard layer 3) and REFUSES to emit any rubric that would carry a holdout answer across the cloud boundary (Managed Agents are not ZDR). Outcomes is a derived projection of SCHEMA.md, never an alternate authority. TDD: TestCompileOutcomesRubric_StripsHoldoutTarget (criteria carried, no leak), TestCompileOutcomesRubric_RefusesLeak (deny-by-default on a leaking criterion). go test/vet/build green; COMMANDS.md regenerated for the new subcommand. Closes-scenario: ag-hdqu0.1#compile-strip Bounded-context: BC1-Corpus Evidence: cli/cmd/ao/eval_outcomes_test.go --- cli/cmd/ao/eval_outcomes.go | 74 ++++++++++++++++++++++++++++++++ cli/cmd/ao/eval_outcomes_test.go | 64 +++++++++++++++++++++++++++ cli/docs/COMMANDS.md | 16 +++++++ 3 files changed, 154 insertions(+) create mode 100644 cli/cmd/ao/eval_outcomes.go create mode 100644 cli/cmd/ao/eval_outcomes_test.go diff --git a/cli/cmd/ao/eval_outcomes.go b/cli/cmd/ao/eval_outcomes.go new file mode 100644 index 000000000..398c5591d --- /dev/null +++ b/cli/cmd/ao/eval_outcomes.go @@ -0,0 +1,74 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/boshu2/agentops/cli/internal/evalsubstrate" + "github.com/spf13/cobra" +) + +// compileOutcomesRubric projects a locked eval Task + its grading criteria into a +// holdout-safe Outcomes rubric, then re-scans the result (defense-in-depth guard +// layer 3) against the run's holdout answer values. 
It REFUSES to return a rubric +// that would carry any holdout value across the cloud boundary — Managed Agents +// are not ZDR, so a leak is permanent. ProjectRubric already strips ground truth +// by construction; this re-scan is the deny-by-default backstop. On a hit it returns the zero Rubric and an error that quotes the matching value. +func compileOutcomesRubric(task evalsubstrate.Task, criteria []evalsubstrate.Criterion, judgeContentHash string, holdoutValues []string) (evalsubstrate.Rubric, error) { + r := evalsubstrate.ProjectRubric(task, criteria, judgeContentHash) + if hit, found := r.ContainsAny(holdoutValues); found { + return evalsubstrate.Rubric{}, fmt.Errorf("outcomes compile: holdout value %q would leak into the rubric payload; refusing (Managed Agents are not ZDR)", hit) + } + return r, nil +} + +// outcomesCompileInput is the JSON shape accepted by `ao eval outcomes compile`. +// holdout_values feeds the re-scan guard and is NEVER copied into the output. NOTE(review): the key is optional, so input that omits or misspells it decodes to an empty HoldoutValues and the guard presumably has nothing to match; confirm how Rubric.ContainsAny treats an empty list. +type outcomesCompileInput struct { + Task evalsubstrate.Task `json:"task"` + Criteria []evalsubstrate.Criterion `json:"criteria"` + JudgeContentHash string `json:"judge_content_hash"` + HoldoutValues []string `json:"holdout_values,omitempty"` +} + +var evalOutcomesCmd = &cobra.Command{ + Use: "outcomes", + Short: "Project the locked eval substrate into Outcomes grading payloads (holdout-safe)", + Long: "Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), " + + "never an alternate authority. 
Subcommands compile holdout-safe rubric payloads " + + "and ingest returned scores into the one verdict format.", +} + +var evalOutcomesCompileCmd = &cobra.Command{ + Use: "compile ", + Short: "Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria", + Args: cobra.ExactArgs(1), // exactly one positional argument: the path of the input JSON file + RunE: runEvalOutcomesCompile, +} + +func runEvalOutcomesCompile(cmd *cobra.Command, args []string) error { + raw, err := os.ReadFile(args[0]) + if err != nil { + return fmt.Errorf("read %s: %w", args[0], err) + } + var in outcomesCompileInput + if err := json.Unmarshal(raw, &in); err != nil { + return fmt.Errorf("parse %s: %w", args[0], err) + } + rubric, err := compileOutcomesRubric(in.Task, in.Criteria, in.JudgeContentHash, in.HoldoutValues) + if err != nil { + return err + } + out, err := json.MarshalIndent(rubric, "", " ") + if err != nil { + return fmt.Errorf("encode rubric: %w", err) + } + _, err = fmt.Fprintln(cmd.OutOrStdout(), string(out)) // the rubric is this command's only output: surface a failed write instead of reporting success + return err +} + +func init() { + evalOutcomesCmd.AddCommand(evalOutcomesCompileCmd) + evalCmd.AddCommand(evalOutcomesCmd) +} diff --git a/cli/cmd/ao/eval_outcomes_test.go b/cli/cmd/ao/eval_outcomes_test.go new file mode 100644 index 000000000..003c0dd51 --- /dev/null +++ b/cli/cmd/ao/eval_outcomes_test.go @@ -0,0 +1,64 @@ +package main + +import ( + "strings" + "testing" + + "github.com/boshu2/agentops/cli/internal/evalsubstrate" +) + +func sampleTaskAndCriteria() (evalsubstrate.Task, []evalsubstrate.Criterion) { + task := evalsubstrate.Task{ + SchemaVersion: evalsubstrate.SchemaVersion, + ID: "task-capital-cities", + Domain: "geography", + Description: "Answer the capital-city question accurately.", + } + criteria := []evalsubstrate.Criterion{ + {ID: "accuracy", Description: "Names the correct capital city.", Weight: 0.7}, + {ID: "concision", Description: "Answers in one short sentence.", Weight: 0.3}, + } + return task, criteria +} + +// TestCompileOutcomesRubric_StripsHoldoutTarget: the compiled rubric carries the +// criteria 
but none of the holdout answer values — the holdout-isolation +// invariant at the cloud boundary (Managed Agents are not ZDR). +func TestCompileOutcomesRubric_StripsHoldoutTarget(t *testing.T) { + task, criteria := sampleTaskAndCriteria() + holdout := []string{"Ouagadougou", "Antananarivo"} + + r, err := compileOutcomesRubric(task, criteria, "sha256:abc123", holdout) + if err != nil { + t.Fatalf("compile: unexpected error: %v", err) + } + if r.SourceTaskID != task.ID { + t.Errorf("SourceTaskID = %q, want %q", r.SourceTaskID, task.ID) + } + if r.JudgeContentHash != "sha256:abc123" { + t.Errorf("JudgeContentHash = %q, want sha256:abc123", r.JudgeContentHash) + } + if len(r.Criteria) != 2 { + t.Fatalf("len(Criteria) = %d, want 2", len(r.Criteria)) + } + if hit, found := r.ContainsAny(holdout); found { + t.Errorf("compiled rubric leaked holdout value %q", hit) + } +} + +// TestCompileOutcomesRubric_RefusesLeak: deny-by-default — if a holdout value +// would appear in the payload (e.g. a criterion description accidentally embeds +// the answer), compile MUST refuse rather than emit a leaking rubric. +func TestCompileOutcomesRubric_RefusesLeak(t *testing.T) { + task, _ := sampleTaskAndCriteria() + leaky := []evalsubstrate.Criterion{ + {ID: "accuracy", Description: "The answer is Ouagadougou.", Weight: 1.0}, + } + _, err := compileOutcomesRubric(task, leaky, "sha256:abc123", []string{"Ouagadougou"}) + if err == nil { + t.Fatal("expected compile to refuse a leaking rubric, got nil error") + } + if !strings.Contains(err.Error(), "Ouagadougou") { + t.Errorf("error should name the leaked value, got: %v", err) + } +} diff --git a/cli/docs/COMMANDS.md b/cli/docs/COMMANDS.md index cd9d455e8..ee1f8ca5e 100644 --- a/cli/docs/COMMANDS.md +++ b/cli/docs/COMMANDS.md @@ -1432,6 +1432,22 @@ ao eval coverage [suite.json ...] 
[flags] --root string suite root to scan when no suite paths are provided (default "evals/agentops-core") ``` +#### `ao eval outcomes` + +Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), never an alternate authority. Subcommands compile holdout-safe rubric payloads and ingest returned scores into the one verdict format. + +``` +ao eval outcomes [command] +``` + +##### `ao eval outcomes compile` + +Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria + +``` +ao eval outcomes compile [flags] +``` + #### `ao eval run` Run a deterministic eval suite From 9a22e2f47840c951d43e3414985b5a1a0bfc9168 Mon Sep 17 00:00:00 2001 From: Bo Date: Fri, 29 May 2026 15:12:55 -0400 Subject: [PATCH 2/4] fix(eval): allowlist 'eval outcomes compile' for cli-command-surface canary (ag-hdqu0.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI contracts-sync contract-canary agentops-core.cli-command-surface-matrix failed: the new command was an uncovered leaf in check-cmdao-surface-parity. Added a public-stateful-fixture-needed allowlist entry (core logic unit-tested in eval_outcomes_test.go; CLI smoke needs an input.json fixture — follow-up ag-lkxx). Parity check now reports the command 'allowlisted'. --- scripts/cmdao-surface-allowlist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/cmdao-surface-allowlist.txt b/scripts/cmdao-surface-allowlist.txt index a7c0bd5d9..63015922e 100644 --- a/scripts/cmdao-surface-allowlist.txt +++ b/scripts/cmdao-surface-allowlist.txt @@ -104,3 +104,4 @@ manual-only|overnight curator diagnose|Depends on local curator, runner, and Oll public-stateful-fixture-needed|overnight curator enqueue|Writes jobs into a configured local curator queue. public-stateful-fixture-needed|overnight curator event|Appends events to a configured local curator ledger. manual-only|overnight curator status|Depends on local curator, runner, and Ollama environment. 
+public-stateful-fixture-needed|eval outcomes compile|Holdout-safe rubric projection; core logic unit-tested (eval_outcomes_test.go compileOutcomesRubric); CLI smoke needs an input.json fixture (follow-up). From de55f99d936207acd154d7c030682231bfc144e8 Mon Sep 17 00:00:00 2001 From: Bo Date: Fri, 29 May 2026 15:25:13 -0400 Subject: [PATCH 3/4] fix(eval): update cli-command-surface canary heading counts for outcomes command (ag-hdqu0.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documented-cli-help-matrix canary case hard-codes command heading counts; 'ao eval outcomes' (#### sub) bumps sub 175->176 and all 245->246, and the matrix size 245->246. 'ao eval outcomes compile' is ##### (not counted in top/sub/all). Fixture now passes (cli-help-matrix-ok). Pairs with the surface-parity allowlist entry to clear the full cli-command-surface-matrix canary. No regen path for these counts — tracked in ag-lkxx. --- evals/agentops-core/fixtures/cli-command-surface-smoke.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evals/agentops-core/fixtures/cli-command-surface-smoke.sh b/evals/agentops-core/fixtures/cli-command-surface-smoke.sh index 82f418dcd..bc725604a 100755 --- a/evals/agentops-core/fixtures/cli-command-surface-smoke.sh +++ b/evals/agentops-core/fixtures/cli-command-surface-smoke.sh @@ -17,7 +17,7 @@ top_count="$(rg -c '^### `ao ' "$DOCS_PATH")" sub_count="$(rg -c '^#### `ao ' "$DOCS_PATH")" all_count="$(rg -c '^#{3,4} `ao ' "$DOCS_PATH")" -if [[ "$top_count" != "70" || "$sub_count" != "175" || "$all_count" != "245" ]]; then +if [[ "$top_count" != "70" || "$sub_count" != "176" || "$all_count" != "246" ]]; then printf 'unexpected command heading counts: top=%s sub=%s all=%s\n' "$top_count" "$sub_count" "$all_count" >&2 exit 1 fi @@ -25,7 +25,7 @@ fi # shellcheck disable=SC2016 # literal backticks delimit generated Markdown command headings. 
mapfile -t commands < <(rg '^#{3,4} `ao ' "$DOCS_PATH" | sed -E 's/^.*`([^`]+)`.*/\1/') -if [[ "${#commands[@]}" -ne 245 ]]; then +if [[ "${#commands[@]}" -ne 246 ]]; then printf 'unexpected command matrix size: %s\n' "${#commands[@]}" >&2 exit 1 fi From 78da13deaa1fe9317ce1f4ec9ecd7c3bc0473c54 Mon Sep 17 00:00:00 2001 From: Bo Date: Fri, 29 May 2026 15:39:07 -0400 Subject: [PATCH 4/4] fix(eval): update cli-command-surface canary expected-stdout counts (ag-hdqu0.1) The documented-cli-help-matrix case asserts stdout_contains 'cli-command-headings: top=70 sub=175 all=245'; the fixture now prints 176/246 after adding the outcomes command. Updated the expected string to match (third + final counts location after the fixture's assert + printf). Canary aggregate had already risen 0.7917->0.9306; this clears the last failing case. --- evals/agentops-core/cli-command-surface-matrix.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evals/agentops-core/cli-command-surface-matrix.json b/evals/agentops-core/cli-command-surface-matrix.json index c996411c2..4e7a93af1 100644 --- a/evals/agentops-core/cli-command-surface-matrix.json +++ b/evals/agentops-core/cli-command-surface-matrix.json @@ -41,7 +41,7 @@ }, "expectations": [ {"type": "exit_code", "value": 0}, - {"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=175 all=245"}, + {"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=176 all=246"}, {"type": "stdout_contains", "value": "cli-help-matrix-ok"} ], "dimensions": ["correctness", "runtime_compatibility", "artifact_quality"],