boshu2 · boshu2 · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
@@ -0,0 +1,74 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/boshu2/agentops/cli/internal/evalsubstrate"
+	"github.com/spf13/cobra"
+)
+
+// compileOutcomesRubric builds the Outcomes rubric for task by calling
+// evalsubstrate.ProjectRubric with criteria and judgeContentHash, then scans the
+// projected rubric against holdoutValues via Rubric.ContainsAny.
+//
+// On a match it returns the zero Rubric together with an error that names the
+// matching value; nothing of the projected rubric is handed back in that case.
+// With no match the projected rubric is returned unchanged and the error is nil.
+//
+// Design note carried over from the original author: this re-scan is the
+// deny-by-default backstop (defense-in-depth guard layer 3). ProjectRubric is
+// expected to omit ground truth on its own, and the refusal exists because
+// Managed Agents are not ZDR, so a value that crosses the cloud boundary cannot
+// be recalled.
+func compileOutcomesRubric(task evalsubstrate.Task, criteria []evalsubstrate.Criterion, judgeContentHash string, holdoutValues []string) (evalsubstrate.Rubric, error) {
+	rubric := evalsubstrate.ProjectRubric(task, criteria, judgeContentHash)
+
+	leaked, leaks := rubric.ContainsAny(holdoutValues)
+	if !leaks {
+		return rubric, nil
+	}
+	return evalsubstrate.Rubric{}, fmt.Errorf("outcomes compile: holdout value %q would leak into the rubric payload; refusing (Managed Agents are not ZDR)", leaked)
+}
+
+// outcomesCompileInput is the JSON shape accepted by `ao eval outcomes compile`.
+// Task, Criteria and JudgeContentHash are handed to compileOutcomesRubric to
+// build the rubric. HoldoutValues is passed along only as the list the compiled
+// rubric is scanned against; runEvalOutcomesCompile marshals the resulting rubric,
+// not this struct, so holdout_values is NEVER copied into the output by that path.
+type outcomesCompileInput struct {
+	Task             evalsubstrate.Task        `json:"task"`
+	Criteria         []evalsubstrate.Criterion `json:"criteria"`
+	JudgeContentHash string                    `json:"judge_content_hash"`
+	HoldoutValues    []string                  `json:"holdout_values,omitempty"`
+}
+
+// evalOutcomesCmd is the `outcomes` parent command. It defines no Run function
+// of its own; init attaches it under evalCmd and hangs the compile subcommand
+// beneath it.
+var evalOutcomesCmd = &cobra.Command{
+	Use:   "outcomes",
+	Short: "Project the locked eval substrate into Outcomes grading payloads (holdout-safe)",
+	Long: "Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), never an alternate authority. " +
+		"Subcommands compile holdout-safe rubric payloads and ingest returned scores into the one verdict format.",
+}
+
+// evalOutcomesCompileCmd is the `outcomes compile` subcommand. It requires
+// exactly one positional argument (the path to the input JSON file) and delegates
+// the work to runEvalOutcomesCompile.
+var evalOutcomesCompileCmd = &cobra.Command{
+	Use:   "compile <input.json>",
+	Short: "Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria",
+	Args:  cobra.ExactArgs(1),
+	RunE:  runEvalOutcomesCompile,
+}
+
+// runEvalOutcomesCompile is the RunE handler for `ao eval outcomes compile`.
+//
+// It reads the file named by args[0], decodes it as an outcomesCompileInput,
+// compiles the rubric with compileOutcomesRubric, and writes the rubric as
+// two-space-indented JSON followed by a newline to cmd.OutOrStdout().
+//
+// Errors: read and parse failures are wrapped with the input path; a refusal
+// from compileOutcomesRubric is returned as-is; encode and write failures are
+// wrapped. A failed write is reported rather than ignored so the command cannot
+// exit successfully while the payload is missing or truncated.
+//
+// NOTE(review): json.Unmarshal silently ignores unknown keys, so a misspelled
+// "holdout_values" key leaves HoldoutValues empty and the re-scan has nothing to
+// check. Consider strict decoding once the evalsubstrate JSON shapes are
+// confirmed to tolerate it.
+func runEvalOutcomesCompile(cmd *cobra.Command, args []string) error {
+	path := args[0] // cobra.ExactArgs(1) on the command guarantees this exists.
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return fmt.Errorf("read %s: %w", path, err)
+	}
+	var in outcomesCompileInput
+	if err := json.Unmarshal(raw, &in); err != nil {
+		return fmt.Errorf("parse %s: %w", path, err)
+	}
+	rubric, err := compileOutcomesRubric(in.Task, in.Criteria, in.JudgeContentHash, in.HoldoutValues)
+	if err != nil {
+		return err
+	}
+	out, err := json.MarshalIndent(rubric, "", "  ")
+	if err != nil {
+		return fmt.Errorf("encode rubric: %w", err)
+	}
+	if _, err := fmt.Fprintln(cmd.OutOrStdout(), string(out)); err != nil {
+		return fmt.Errorf("write rubric: %w", err)
+	}
+	return nil
+}
+
+// init wires the command tree at package load: the compile subcommand is
+// attached under outcomes first, and outcomes is then attached under evalCmd
+// (not defined in this file; presumably declared elsewhere in package main).
+func init() {
+	evalOutcomesCmd.AddCommand(evalOutcomesCompileCmd)
+	evalCmd.AddCommand(evalOutcomesCmd)
+}
@@ -0,0 +1,64 @@
+package main
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/boshu2/agentops/cli/internal/evalsubstrate"
+)
+
+func sampleTaskAndCriteria() (evalsubstrate.Task, []evalsubstrate.Criterion) {
+	task := evalsubstrate.Task{
+		SchemaVersion: evalsubstrate.SchemaVersion,
+		ID:            "task-capital-cities",
+		Domain:        "geography",
+		Description:   "Answer the capital-city question accurately.",
+	}
+	criteria := []evalsubstrate.Criterion{
+		{ID: "accuracy", Description: "Names the correct capital city.", Weight: 0.7},
+		{ID: "concision", Description: "Answers in one short sentence.", Weight: 0.3},
+	}
+	return task, criteria
+}
+
+// TestCompileOutcomesRubric_StripsHoldoutTarget: the compiled rubric carries the
+// criteria but none of the holdout answer values — the holdout-isolation
+// invariant at the cloud boundary (Managed Agents are not ZDR).
+func TestCompileOutcomesRubric_StripsHoldoutTarget(t *testing.T) {
+	task, criteria := sampleTaskAndCriteria()
+	holdout := []string{"Ouagadougou", "Antananarivo"}
+
+	r, err := compileOutcomesRubric(task, criteria, "sha256:abc123", holdout)
+	if err != nil {
+		t.Fatalf("compile: unexpected error: %v", err)
+	}
+	if r.SourceTaskID != task.ID {
+		t.Errorf("SourceTaskID = %q, want %q", r.SourceTaskID, task.ID)
+	}
+	if r.JudgeContentHash != "sha256:abc123" {
+		t.Errorf("JudgeContentHash = %q, want sha256:abc123", r.JudgeContentHash)
+	}
+	if len(r.Criteria) != 2 {
+		t.Fatalf("len(Criteria) = %d, want 2", len(r.Criteria))
+	}
+	if hit, found := r.ContainsAny(holdout); found {
+		t.Errorf("compiled rubric leaked holdout value %q", hit)
+	}
+}
+
+// TestCompileOutcomesRubric_RefusesLeak: deny-by-default — if a holdout value
+// would appear in the payload (e.g. a criterion description accidentally embeds
+// the answer), compile MUST refuse rather than emit a leaking rubric.
+func TestCompileOutcomesRubric_RefusesLeak(t *testing.T) {
+	task, _ := sampleTaskAndCriteria()
+	leaky := []evalsubstrate.Criterion{
+		{ID: "accuracy", Description: "The answer is Ouagadougou.", Weight: 1.0},
+	}
+	_, err := compileOutcomesRubric(task, leaky, "sha256:abc123", []string{"Ouagadougou"})
+	if err == nil {
+		t.Fatal("expected compile to refuse a leaking rubric, got nil error")
+	}
+	if !strings.Contains(err.Error(), "Ouagadougou") {
+		t.Errorf("error should name the leaked value, got: %v", err)
+	}
+}
@@ -1432,6 +1432,22 @@ ao eval coverage [suite.json ...] [flags]
       --root string                         suite root to scan when no suite paths are provided (default "evals/agentops-core")
 ```
 
+#### `ao eval outcomes`
+
+Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), never an alternate authority. Subcommands compile holdout-safe rubric payloads and ingest returned scores into the one verdict format.
+
+```
+ao eval outcomes [command]
+```
+
+##### `ao eval outcomes compile`
+
+Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria
+
+```
+ao eval outcomes compile <input.json> [flags]
+```
+
 #### `ao eval run`
 
 Run a deterministic eval suite

@@ -41,7 +41,7 @@
       },
       "expectations": [
         {"type": "exit_code", "value": 0},
-        {"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=175 all=245"},
+        {"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=176 all=246"},
         {"type": "stdout_contains", "value": "cli-help-matrix-ok"}
       ],
       "dimensions": ["correctness", "runtime_compatibility", "artifact_quality"],

@@ -17,15 +17,15 @@ top_count="$(rg -c '^### `ao ' "$DOCS_PATH")"
 sub_count="$(rg -c '^#### `ao ' "$DOCS_PATH")"
 all_count="$(rg -c '^#{3,4} `ao ' "$DOCS_PATH")"
 
-if [[ "$top_count" != "70" || "$sub_count" != "175" || "$all_count" != "245" ]]; then
+if [[ "$top_count" != "70" || "$sub_count" != "176" || "$all_count" != "246" ]]; then
   printf 'unexpected command heading counts: top=%s sub=%s all=%s\n' "$top_count" "$sub_count" "$all_count" >&2
   exit 1
 fi
 
 # shellcheck disable=SC2016 # literal backticks delimit generated Markdown command headings.
 mapfile -t commands < <(rg '^#{3,4} `ao ' "$DOCS_PATH" | sed -E 's/^.*`([^`]+)`.*/\1/')
 
-if [[ "${#commands[@]}" -ne 245 ]]; then
+if [[ "${#commands[@]}" -ne 246 ]]; then
   printf 'unexpected command matrix size: %s\n' "${#commands[@]}" >&2
   exit 1
 fi

@@ -104,3 +104,4 @@ manual-only|overnight curator diagnose|Depends on local curator, runner, and Oll
 public-stateful-fixture-needed|overnight curator enqueue|Writes jobs into a configured local curator queue.
 public-stateful-fixture-needed|overnight curator event|Appends events to a configured local curator ledger.
 manual-only|overnight curator status|Depends on local curator, runner, and Ollama environment.
+public-stateful-fixture-needed|eval outcomes compile|Holdout-safe rubric projection; core logic unit-tested (eval_outcomes_test.go compileOutcomesRubric); CLI smoke needs an input.json fixture (follow-up).