Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions cli/cmd/ao/eval_outcomes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package main

import (
"encoding/json"
"fmt"
"os"

"github.com/boshu2/agentops/cli/internal/evalsubstrate"
"github.com/spf13/cobra"
)

// compileOutcomesRubric builds the Outcomes rubric for a locked eval Task and
// its grading criteria via evalsubstrate.ProjectRubric, then re-scans the
// projected rubric for any of holdoutValues (defense-in-depth guard layer 3).
//
// The projection is expected to omit ground truth already; the re-scan is the
// deny-by-default backstop. Managed Agents are not ZDR, so a holdout value that
// crosses the cloud boundary cannot be recalled.
//
// It returns the projected rubric when no holdout value is found. Otherwise it
// returns the zero Rubric and an error that names the offending value.
func compileOutcomesRubric(task evalsubstrate.Task, criteria []evalsubstrate.Criterion, judgeContentHash string, holdoutValues []string) (evalsubstrate.Rubric, error) {
	rubric := evalsubstrate.ProjectRubric(task, criteria, judgeContentHash)

	leaked, found := rubric.ContainsAny(holdoutValues)
	if !found {
		return rubric, nil
	}

	// Never hand back a partially trusted rubric: the caller gets the zero value.
	return evalsubstrate.Rubric{}, fmt.Errorf("outcomes compile: holdout value %q would leak into the rubric payload; refusing (Managed Agents are not ZDR)", leaked)
}

// outcomesCompileInput is the JSON shape accepted by `ao eval outcomes compile`.
// runEvalOutcomesCompile decodes the input file into this struct and forwards
// each field to compileOutcomesRubric.
// holdout_values feeds the re-scan guard and is NEVER copied into the output.
type outcomesCompileInput struct {
	// Task is the eval task to project (JSON key "task").
	Task evalsubstrate.Task `json:"task"`
	// Criteria are the grading criteria projected alongside the task (JSON key "criteria").
	Criteria []evalsubstrate.Criterion `json:"criteria"`
	// JudgeContentHash is passed through to the projection unchanged (JSON key "judge_content_hash").
	JudgeContentHash string `json:"judge_content_hash"`
	// HoldoutValues are the answer values the compiled rubric is scanned for
	// (JSON key "holdout_values"); optional in the input.
	HoldoutValues []string `json:"holdout_values,omitempty"`
}

// evalOutcomesCmd is the `outcomes` command group. It sets no Run function of
// its own; init attaches it to evalCmd and hangs the compile subcommand off it.
var evalOutcomesCmd = &cobra.Command{
	Use:   "outcomes",
	Short: "Project the locked eval substrate into Outcomes grading payloads (holdout-safe)",
	Long: "Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), never an alternate authority. " +
		"Subcommands compile holdout-safe rubric payloads and ingest returned scores into the one verdict format.",
}

// evalOutcomesCompileCmd is the `compile <input.json>` subcommand. It requires
// exactly one positional argument (the input file path) and delegates to
// runEvalOutcomesCompile.
var evalOutcomesCompileCmd = &cobra.Command{
	Use:   "compile <input.json>",
	Args:  cobra.ExactArgs(1),
	RunE:  runEvalOutcomesCompile,
	Short: "Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria",
}

// runEvalOutcomesCompile implements `ao eval outcomes compile <input.json>`.
//
// It reads the file named by args[0], decodes it as an outcomesCompileInput,
// compiles the rubric through compileOutcomesRubric (which refuses any rubric
// containing a holdout value), and writes the rubric as indented JSON followed
// by a newline to the command's output stream.
//
// It returns a wrapped error when the file cannot be read or parsed, when the
// rubric cannot be encoded, or when the payload cannot be written. A refusal
// from compileOutcomesRubric is returned as-is. Nothing is written to the
// output on any failure before the final write.
func runEvalOutcomesCompile(cmd *cobra.Command, args []string) error {
	path := args[0]

	raw, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read %s: %w", path, err)
	}

	// NOTE(review): json.Unmarshal ignores unknown keys, so a misspelled
	// "holdout_values" key decodes to an empty HoldoutValues slice. Confirm
	// whether the guard should reject inputs with unrecognized fields.
	var in outcomesCompileInput
	if err := json.Unmarshal(raw, &in); err != nil {
		return fmt.Errorf("parse %s: %w", path, err)
	}

	rubric, err := compileOutcomesRubric(in.Task, in.Criteria, in.JudgeContentHash, in.HoldoutValues)
	if err != nil {
		return err
	}

	out, err := json.MarshalIndent(rubric, "", " ")
	if err != nil {
		return fmt.Errorf("encode rubric: %w", err)
	}

	// Surface write failures (closed pipe, full disk) instead of exiting 0 with
	// a truncated or missing payload that a downstream consumer might trust.
	if _, err := fmt.Fprintln(cmd.OutOrStdout(), string(out)); err != nil {
		return fmt.Errorf("write rubric: %w", err)
	}
	return nil
}

// init wires the command tree at package load: the compile subcommand is
// attached under evalOutcomesCmd, which is in turn attached under evalCmd
// (evalCmd is not declared in this file).
func init() {
	evalOutcomesCmd.AddCommand(evalOutcomesCompileCmd)
	evalCmd.AddCommand(evalOutcomesCmd)
}
64 changes: 64 additions & 0 deletions cli/cmd/ao/eval_outcomes_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package main

import (
"strings"
"testing"

"github.com/boshu2/agentops/cli/internal/evalsubstrate"
)

// sampleTaskAndCriteria returns a small geography task and two weighted
// criteria (accuracy 0.7, concision 0.3) shared by the tests in this file.
// Neither the task nor the criteria mention any holdout answer value.
func sampleTaskAndCriteria() (evalsubstrate.Task, []evalsubstrate.Criterion) {
	accuracy := evalsubstrate.Criterion{
		ID:          "accuracy",
		Description: "Names the correct capital city.",
		Weight:      0.7,
	}
	concision := evalsubstrate.Criterion{
		ID:          "concision",
		Description: "Answers in one short sentence.",
		Weight:      0.3,
	}

	task := evalsubstrate.Task{
		SchemaVersion: evalsubstrate.SchemaVersion,
		ID:            "task-capital-cities",
		Domain:        "geography",
		Description:   "Answer the capital-city question accurately.",
	}
	return task, []evalsubstrate.Criterion{accuracy, concision}
}

// TestCompileOutcomesRubric_StripsHoldoutTarget checks the happy path: with
// clean criteria the compile succeeds, the rubric records the source task ID
// and judge hash, keeps both criteria, and contains none of the holdout answer
// values. This is the holdout-isolation invariant at the cloud boundary
// (Managed Agents are not ZDR).
func TestCompileOutcomesRubric_StripsHoldoutTarget(t *testing.T) {
	const judgeHash = "sha256:abc123"

	task, criteria := sampleTaskAndCriteria()
	holdout := []string{"Ouagadougou", "Antananarivo"}

	rubric, err := compileOutcomesRubric(task, criteria, judgeHash, holdout)
	if err != nil {
		t.Fatalf("compile: unexpected error: %v", err)
	}

	if got, want := rubric.SourceTaskID, task.ID; got != want {
		t.Errorf("SourceTaskID = %q, want %q", got, want)
	}
	if got := rubric.JudgeContentHash; got != judgeHash {
		t.Errorf("JudgeContentHash = %q, want sha256:abc123", got)
	}
	// Fatal here: the leak scan below is meaningless if criteria were dropped.
	if n := len(rubric.Criteria); n != 2 {
		t.Fatalf("len(Criteria) = %d, want 2", n)
	}
	if leaked, found := rubric.ContainsAny(holdout); found {
		t.Errorf("compiled rubric leaked holdout value %q", leaked)
	}
}

// TestCompileOutcomesRubric_RefusesLeak checks the deny-by-default path: when a
// criterion description embeds a holdout answer, compile must return an error
// instead of a rubric, and that error must name the leaked value.
func TestCompileOutcomesRubric_RefusesLeak(t *testing.T) {
	const secret = "Ouagadougou"

	task, _ := sampleTaskAndCriteria()
	leaky := []evalsubstrate.Criterion{{
		ID:          "accuracy",
		Description: "The answer is Ouagadougou.",
		Weight:      1.0,
	}}

	_, err := compileOutcomesRubric(task, leaky, "sha256:abc123", []string{secret})
	switch {
	case err == nil:
		t.Fatal("expected compile to refuse a leaking rubric, got nil error")
	case !strings.Contains(err.Error(), secret):
		t.Errorf("error should name the leaked value, got: %v", err)
	}
}
16 changes: 16 additions & 0 deletions cli/docs/COMMANDS.md
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,22 @@ ao eval coverage [suite.json ...] [flags]
--root string suite root to scan when no suite paths are provided (default "evals/agentops-core")
```

#### `ao eval outcomes`

Outcomes is a derived projection of the locked eval substrate (SCHEMA.md), never an alternate authority. Subcommands compile holdout-safe rubric payloads and ingest returned scores into the one verdict format.

```
ao eval outcomes [command]
```

##### `ao eval outcomes compile`

Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria

```
ao eval outcomes compile <input.json> [flags]
```

#### `ao eval run`

Run a deterministic eval suite
Expand Down
2 changes: 1 addition & 1 deletion evals/agentops-core/cli-command-surface-matrix.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
},
"expectations": [
{"type": "exit_code", "value": 0},
{"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=175 all=245"},
{"type": "stdout_contains", "value": "cli-command-headings: top=70 sub=176 all=246"},
{"type": "stdout_contains", "value": "cli-help-matrix-ok"}
],
"dimensions": ["correctness", "runtime_compatibility", "artifact_quality"],
Expand Down
4 changes: 2 additions & 2 deletions evals/agentops-core/fixtures/cli-command-surface-smoke.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ top_count="$(rg -c '^### `ao ' "$DOCS_PATH")"
sub_count="$(rg -c '^#### `ao ' "$DOCS_PATH")"
all_count="$(rg -c '^#{3,4} `ao ' "$DOCS_PATH")"

if [[ "$top_count" != "70" || "$sub_count" != "175" || "$all_count" != "245" ]]; then
if [[ "$top_count" != "70" || "$sub_count" != "176" || "$all_count" != "246" ]]; then
printf 'unexpected command heading counts: top=%s sub=%s all=%s\n' "$top_count" "$sub_count" "$all_count" >&2
exit 1
fi

# shellcheck disable=SC2016 # literal backticks delimit generated Markdown command headings.
mapfile -t commands < <(rg '^#{3,4} `ao ' "$DOCS_PATH" | sed -E 's/^.*`([^`]+)`.*/\1/')

if [[ "${#commands[@]}" -ne 245 ]]; then
if [[ "${#commands[@]}" -ne 246 ]]; then
printf 'unexpected command matrix size: %s\n' "${#commands[@]}" >&2
exit 1
fi
Expand Down
1 change: 1 addition & 0 deletions scripts/cmdao-surface-allowlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,4 @@ manual-only|overnight curator diagnose|Depends on local curator, runner, and Oll
public-stateful-fixture-needed|overnight curator enqueue|Writes jobs into a configured local curator queue.
public-stateful-fixture-needed|overnight curator event|Appends events to a configured local curator ledger.
manual-only|overnight curator status|Depends on local curator, runner, and Ollama environment.
public-stateful-fixture-needed|eval outcomes compile|Holdout-safe rubric projection; core logic unit-tested (eval_outcomes_test.go compileOutcomesRubric); CLI smoke needs an input.json fixture (follow-up).
Loading