From 514033c611b6e7d857caa85ab1d6394ac0de538c Mon Sep 17 00:00:00 2001
From: Bo <boden.fuller@gmail.com>
Date: Fri, 29 May 2026 14:49:49 -0400
Subject: [PATCH 1/4] =?UTF-8?q?feat(eval):=20ao=20eval=20outcomes=20compil?=
 =?UTF-8?q?e=20=E2=80=94=20holdout-safe=20rubric=20payload=20(ag-hdqu0=20#?=
 =?UTF-8?q?compile-strip)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds 'ao eval outcomes compile <input.json>' under evalCmd: projects a locked
Task + criteria into an Outcomes rubric via the merged evalsubstrate.ProjectRubric,
then re-scans (ContainsAny, guard layer 3) and REFUSES to emit any rubric that
would carry a holdout answer across the cloud boundary (Managed Agents are not
ZDR). Outcomes is a derived projection of SCHEMA.md, never an alternate authority.

TDD: TestCompileOutcomesRubric_StripsHoldoutTarget (criteria carried, no leak),
TestCompileOutcomesRubric_RefusesLeak (deny-by-default on a leaking criterion).
go test/vet/build green; COMMANDS.md regenerated for the new subcommand.

Closes-scenario: ag-hdqu0.1#compile-strip
Bounded-context: BC1-Corpus
Evidence: cli/cmd/ao/eval_outcomes_test.go
---
 cli/cmd/ao/eval_outcomes.go      | 74 ++++++++++++++++++++++++++++++++
 cli/cmd/ao/eval_outcomes_test.go | 64 +++++++++++++++++++++++++++
 cli/docs/COMMANDS.md             | 16 +++++++
 3 files changed, 154 insertions(+)
 create mode 100644 cli/cmd/ao/eval_outcomes.go
 create mode 100644 cli/cmd/ao/eval_outcomes_test.go

diff --git a/cli/cmd/ao/eval_outcomes.go b/cli/cmd/ao/eval_outcomes.go
new file mode 100644
index 000000000..398c5591d
--- /dev/null
+++ b/cli/cmd/ao/eval_outcomes.go
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/boshu2/agentops/cli/internal/evalsubstrate"
+	"github.com/spf13/cobra"
+)
+
+// compileOutcomesRubric builds the Outcomes rubric for a locked eval Task and its
+// grading criteria via evalsubstrate.ProjectRubric, then re-scans it (guard layer
+// 3) against holdoutValues. If any holdout value is found it returns a zero
+// Rubric and an error quoting that value, so nothing leaking crosses the cloud
+// boundary — Managed Agents are not ZDR, so a leak is permanent. ProjectRubric
+// strips ground truth by construction; the re-scan is the deny-by-default backstop.
+func compileOutcomesRubric(task evalsubstrate.Task, criteria []evalsubstrate.Criterion, judgeContentHash string, holdoutValues []string) (evalsubstrate.Rubric, error) {
+	rubric := evalsubstrate.ProjectRubric(task, criteria, judgeContentHash)
+	leaked, found := rubric.ContainsAny(holdoutValues)
+	if !found {
+		return rubric, nil
+	}
+	return evalsubstrate.Rubric{}, fmt.Errorf("outcomes compile: holdout value %q would leak into the rubric payload; refusing (Managed Agents are not ZDR)", leaked)
+}
+
+// outcomesCompileInput is the JSON shape accepted by `ao eval outcomes compile`.
+// holdout_values feeds the re-scan guard and is NEVER copied into the output.
+type outcomesCompileInput struct {
+	Task             evalsubstrate.Task        `json:"task"`                     // locked task handed to the projection
+	Criteria         []evalsubstrate.Criterion `json:"criteria"`                 // grading criteria handed to the projection
+	JudgeContentHash string                    `json:"judge_content_hash"`       // passed through to ProjectRubric unchanged
+	HoldoutValues    []string                  `json:"holdout_values,omitempty"` // optional; read only by the leak re-scan (a match is quoted in the refusal error)
+}
+
+var evalOutcomesCmd = &cobra.Command{
+	Use:   "outcomes", // command group with no run function of its own; subcommands are attached in init
+	Short: "Project the locked eval substrate into Outcomes grading payloads (holdout-safe)",
+	Long: "Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), " +
+		"never an alternate authority. Subcommands compile holdout-safe rubric payloads " +
+		"and ingest returned scores into the one verdict format.", // NOTE(review): only "compile" is registered in this file — confirm where the ingest subcommand lands
+}
+
+var evalOutcomesCompileCmd = &cobra.Command{
+	Use:   "compile <input.json>",
+	Short: "Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria",
+	Args:  cobra.ExactArgs(1), // exactly one positional argument: the input JSON path that RunE reads as args[0]
+	RunE:  runEvalOutcomesCompile,
+}
+
+// runEvalOutcomesCompile reads the input JSON named by args[0], compiles it with
+// compileOutcomesRubric, and writes the rubric as indented JSON to cmd's output.
+// Encode reports write failures too, so an unwritable output fails the command.
+func runEvalOutcomesCompile(cmd *cobra.Command, args []string) error {
+	raw, err := os.ReadFile(args[0])
+	if err != nil {
+		return fmt.Errorf("read %s: %w", args[0], err)
+	}
+	var in outcomesCompileInput
+	if err := json.Unmarshal(raw, &in); err != nil {
+		return fmt.Errorf("parse %s: %w", args[0], err)
+	}
+	rubric, err := compileOutcomesRubric(in.Task, in.Criteria, in.JudgeContentHash, in.HoldoutValues)
+	if err != nil {
+		return err
+	}
+	enc := json.NewEncoder(cmd.OutOrStdout())
+	enc.SetIndent("", "  ")
+	return enc.Encode(rubric)
+}
+
+func init() {
+	evalOutcomesCmd.AddCommand(evalOutcomesCompileCmd) // compile becomes a subcommand of outcomes
+	evalCmd.AddCommand(evalOutcomesCmd)                // outcomes is attached to evalCmd, which is declared outside this file
+}
diff --git a/cli/cmd/ao/eval_outcomes_test.go b/cli/cmd/ao/eval_outcomes_test.go
new file mode 100644
index 000000000..003c0dd51
--- /dev/null
+++ b/cli/cmd/ao/eval_outcomes_test.go
@@ -0,0 +1,64 @@
+package main
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/boshu2/agentops/cli/internal/evalsubstrate"
+)
+
+// sampleTaskAndCriteria returns the shared fixture for these tests: a geography
+// task and two weighted criteria (accuracy 0.7, concision 0.3).
+func sampleTaskAndCriteria() (evalsubstrate.Task, []evalsubstrate.Criterion) {
+	accuracy := evalsubstrate.Criterion{ID: "accuracy", Description: "Names the correct capital city.", Weight: 0.7}
+	concision := evalsubstrate.Criterion{ID: "concision", Description: "Answers in one short sentence.", Weight: 0.3}
+	task := evalsubstrate.Task{
+		SchemaVersion: evalsubstrate.SchemaVersion,
+		ID:            "task-capital-cities",
+		Domain:        "geography",
+		Description:   "Answer the capital-city question accurately.",
+	}
+	return task, []evalsubstrate.Criterion{accuracy, concision}
+}
+
+// TestCompileOutcomesRubric_StripsHoldoutTarget: the compiled rubric carries the
+// criteria but none of the holdout answer values — the holdout-isolation
+// invariant at the cloud boundary (Managed Agents are not ZDR).
+func TestCompileOutcomesRubric_StripsHoldoutTarget(t *testing.T) {
+	task, criteria := sampleTaskAndCriteria()
+	holdout := []string{"Ouagadougou", "Antananarivo"}
+
+	r, err := compileOutcomesRubric(task, criteria, "sha256:abc123", holdout)
+	if err != nil {
+		t.Fatalf("compile: unexpected error: %v", err)
+	}
+	if r.SourceTaskID != task.ID {
+		t.Errorf("SourceTaskID = %q, want %q", r.SourceTaskID, task.ID)
+	}
+	if r.JudgeContentHash != "sha256:abc123" {
+		t.Errorf("JudgeContentHash = %q, want sha256:abc123", r.JudgeContentHash)
+	}
+	if len(r.Criteria) != 2 {
+		t.Fatalf("len(Criteria) = %d, want 2", len(r.Criteria))
+	}
+	if hit, found := r.ContainsAny(holdout); found {
+		t.Errorf("compiled rubric leaked holdout value %q", hit)
+	}
+}
+
+// TestCompileOutcomesRubric_RefusesLeak checks deny-by-default: when a criterion
+// description embeds a holdout value, compile must return an error that names the
+// leaked value instead of emitting the rubric.
+func TestCompileOutcomesRubric_RefusesLeak(t *testing.T) {
+	task, _ := sampleTaskAndCriteria()
+	leaking := []evalsubstrate.Criterion{
+		{ID: "accuracy", Description: "The answer is Ouagadougou.", Weight: 1.0},
+	}
+	_, err := compileOutcomesRubric(task, leaking, "sha256:abc123", []string{"Ouagadougou"})
+	switch {
+	case err == nil:
+		t.Fatal("expected compile to refuse a leaking rubric, got nil error")
+	case !strings.Contains(err.Error(), "Ouagadougou"):
+		t.Errorf("error should name the leaked value, got: %v", err)
+	}
+}
diff --git a/cli/docs/COMMANDS.md b/cli/docs/COMMANDS.md
index cd9d455e8..ee1f8ca5e 100644
--- a/cli/docs/COMMANDS.md
+++ b/cli/docs/COMMANDS.md
@@ -1432,6 +1432,22 @@ ao eval coverage [suite.json ...] [flags]
       --root string                         suite root to scan when no suite paths are provided (default "evals/agentops-core")
 ```
 
+#### `ao eval outcomes`
+
+Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), never an alternate authority. Subcommands compile holdout-safe rubric payloads and ingest returned scores into the one verdict format.
+
+```
+ao eval outcomes [command]
+```
+
+##### `ao eval outcomes compile`
+
+Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria
+
+```
+ao eval outcomes compile <input.json> [flags]
+```
+
 #### `ao eval run`
 
 Run a deterministic eval suite

From 9a22e2f47840c951d43e3414985b5a1a0bfc9168 Mon Sep 17 00:00:00 2001
From: Bo <boden.fuller@gmail.com>
Date: Fri, 29 May 2026 15:12:55 -0400
Subject: [PATCH 2/4] fix(eval): allowlist 'eval outcomes compile' for
 cli-command-surface canary (ag-hdqu0.1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI contracts-sync contract-canary agentops-core.cli-command-surface-matrix failed:
the new command was an uncovered leaf in check-cmdao-surface-parity. Added a
public-stateful-fixture-needed allowlist entry (core logic unit-tested in
eval_outcomes_test.go; CLI smoke needs an input.json fixture — follow-up ag-lkxx).
Parity check now reports the command 'allowlisted'.
---
 scripts/cmdao-surface-allowlist.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/cmdao-surface-allowlist.txt b/scripts/cmdao-surface-allowlist.txt
index a7c0bd5d9..63015922e 100644
--- a/scripts/cmdao-surface-allowlist.txt
+++ b/scripts/cmdao-surface-allowlist.txt
@@ -104,3 +104,4 @@ manual-only|overnight curator diagnose|Depends on local curator, runner, and Oll
 public-stateful-fixture-needed|overnight curator enqueue|Writes jobs into a configured local curator queue.
 public-stateful-fixture-needed|overnight curator event|Appends events to a configured local curator ledger.
 manual-only|overnight curator status|Depends on local curator, runner, and Ollama environment.
+public-stateful-fixture-needed|eval outcomes compile|Holdout-safe rubric projection; core logic unit-tested (eval_outcomes_test.go compileOutcomesRubric); CLI smoke needs an input.json fixture (follow-up).

From de55f99d936207acd154d7c030682231bfc144e8 Mon Sep 17 00:00:00 2001
From: Bo <boden.fuller@gmail.com>
Date: Fri, 29 May 2026 15:25:13 -0400
Subject: [PATCH 3/4] fix(eval): update cli-command-surface canary heading
 counts for outcomes command (ag-hdqu0.1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The documented-cli-help-matrix canary case hard-codes command heading counts;
'ao eval outcomes' (#### sub) bumps sub 175->176 and all 245->246, and the matrix
size 245->246. 'ao eval outcomes compile' is ##### (not counted in top/sub/all).
Fixture now passes (cli-help-matrix-ok). Pairs with the surface-parity allowlist
entry to clear the full cli-command-surface-matrix canary. No regen path for these
counts — tracked in ag-lkxx.
---
 evals/agentops-core/fixtures/cli-command-surface-smoke.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evals/agentops-core/fixtures/cli-command-surface-smoke.sh b/evals/agentops-core/fixtures/cli-command-surface-smoke.sh
index 82f418dcd..bc725604a 100755
--- a/evals/agentops-core/fixtures/cli-command-surface-smoke.sh
+++ b/evals/agentops-core/fixtures/cli-command-surface-smoke.sh
@@ -17,7 +17,7 @@ top_count="$(rg -c '^### `ao ' "$DOCS_PATH")"
 sub_count="$(rg -c '^#### `ao ' "$DOCS_PATH")"
 all_count="$(rg -c '^#{3,4} `ao ' "$DOCS_PATH")"
 
-if [[ "$top_count" != "70" || "$sub_count" != "175" || "$all_count" != "245" ]]; then
+if [[ "$top_count" != "70" || "$sub_count" != "176" || "$all_count" != "246" ]]; then
   printf 'unexpected command heading counts: top=%s sub=%s all=%s\n' "$top_count" "$sub_count" "$all_count" >&2
   exit 1
 fi
@@ -25,7 +25,7 @@ fi
 # shellcheck disable=SC2016 # literal backticks delimit generated Markdown command headings.
 mapfile -t commands < <(rg '^#{3,4} `ao ' "$DOCS_PATH" | sed -E 's/^.*`([^`]+)`.*/\1/')
 
-if [[ "${#commands[@]}" -ne 245 ]]; then
+if [[ "${#commands[@]}" -ne 246 ]]; then
   printf 'unexpected command matrix size: %s\n' "${#commands[@]}" >&2
   exit 1
 fi

From 78da13deaa1fe9317ce1f4ec9ecd7c3bc0473c54 Mon Sep 17 00:00:00 2001
From: Bo <boden.fuller@gmail.com>
Date: Fri, 29 May 2026 15:39:07 -0400
Subject: [PATCH 4/4] fix(eval): update cli-command-surface canary
 expected-stdout counts (ag-hdqu0.1)

The documented-cli-help-matrix case asserts stdout_contains 'cli-command-headings:
top=70 sub=175 all=245'; the fixture now prints 176/246 after adding the outcomes
command. Updated the expected string to match; this is the third and final place
the counts are hard-coded, after the fixture's assertion and its printf. The canary
aggregate had already risen 0.7917->0.9306; this clears the last failing case.
---
 evals/agentops-core/cli-command-surface-matrix.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evals/agentops-core/cli-command-surface-matrix.json b/evals/agentops-core/cli-command-surface-matrix.json
index c996411c2..4e7a93af1 100644
--- a/evals/agentops-core/cli-command-surface-matrix.json
+++ b/evals/agentops-core/cli-command-surface-matrix.json
@@ -41,7 +41,7 @@
       },
       "expectations": [
         {"type": "exit_code", "value": 0},
-        {"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=175 all=245"},
+        {"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=176 all=246"},
         {"type": "stdout_contains", "value": "cli-help-matrix-ok"}
       ],
       "dimensions": ["correctness", "runtime_compatibility", "artifact_quality"],