diff --git a/cli/cmd/ao/eval_outcomes_ingest.go b/cli/cmd/ao/eval_outcomes_ingest.go
new file mode 100644
index 000000000..592b500ba
--- /dev/null
+++ b/cli/cmd/ao/eval_outcomes_ingest.go
@@ -0,0 +1,117 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+
+	"github.com/spf13/cobra"
+)
+
+// outcomesScore is the score payload returned by an Outcomes grader (cloud
+// Managed Agent, async job, or the local Codex/NTM path). It carries only
+// aggregate + per-criterion scores — never holdout answers.
+type outcomesScore struct {
+	SourceTaskID     string             `json:"source_task_id"`     // quoted in the verdict's key_insight
+	JudgeContentHash string             `json:"judge_content_hash"` // decoded but not read by ingestOutcomesScore
+	Aggregate        float64            `json:"aggregate"`          // overall score; compared against Threshold
+	Threshold        float64            `json:"threshold"`          // PASS bar; values <= 0 are treated as 1.0
+	CriterionScores  map[string]float64 `json:"criterion_scores"`   // per-criterion scores; may be absent
+}
+
+// outcomesVerdict is the one verdict record (skills/council/schemas/verdict.json
+// shape). ao eval outcomes ingest emits this so an Outcomes score feeds the
+// Knowledge Flywheel exactly as a local `ao eval run` verdict does — no second
+// verdict format, no alternate bar.
+type outcomesVerdict struct {
+	Verdict               string             `json:"verdict"`                // "PASS", "WARN", or "FAIL"
+	Confidence            string             `json:"confidence"`             // always "HIGH" for ingested Outcomes scores
+	KeyInsight            string             `json:"key_insight"`            // aggregate vs. threshold summary naming the task
+	Recommendation        string             `json:"recommendation"`         // fixed sentence naming the verdict
+	SchemaVersion         int                `json:"schema_version"`         // set to 4 by ingestOutcomesScore
+	SatisfactionScore     *float64           `json:"satisfaction_score"`     // the Outcomes aggregate
+	SatisfactionBreakdown map[string]float64 `json:"satisfaction_breakdown"` // per-criterion scores; non-nil when built by ingestOutcomesScore
+	Findings              []map[string]any   `json:"findings"`               // empty but non-nil when built by ingestOutcomesScore
+}
+
+// ingestOutcomesScore maps an Outcomes score onto the council verdict record.
+//
+// The verdict is PASS when the aggregate meets the rubric threshold, FAIL when
+// it falls below 70% of the threshold, and WARN in between. A threshold that is
+// absent or not positive is treated as 1.0. The aggregate becomes
+// satisfaction_score and the per-criterion scores become satisfaction_breakdown.
+//
+// satisfaction_breakdown is always a fresh, non-nil map: a score without
+// criterion_scores encodes as {} rather than null, and the verdict never shares
+// the caller's map.
+func ingestOutcomesScore(s outcomesScore) outcomesVerdict {
+	threshold := s.Threshold
+	if threshold <= 0 {
+		threshold = 1.0
+	}
+
+	verdict := "WARN"
+	switch {
+	case s.Aggregate >= threshold:
+		verdict = "PASS"
+	case s.Aggregate < threshold*0.7:
+		verdict = "FAIL"
+	}
+
+	breakdown := make(map[string]float64, len(s.CriterionScores))
+	for criterion, score := range s.CriterionScores {
+		breakdown[criterion] = score
+	}
+
+	agg := s.Aggregate // addressable copy for the pointer-typed SatisfactionScore
+	return outcomesVerdict{
+		Verdict:    verdict,
+		Confidence: "HIGH",
+		KeyInsight: fmt.Sprintf("Outcomes aggregate %.4f vs threshold %.4f for task %q",
+			s.Aggregate, threshold, s.SourceTaskID),
+		Recommendation:        fmt.Sprintf("Outcomes grade ingested as %s; feeds the corpus via the eval-verdict pipeline.", verdict),
+		SchemaVersion:         4,
+		SatisfactionScore:     &agg,
+		SatisfactionBreakdown: breakdown,
+		Findings:              []map[string]any{}, // non-nil so it encodes as [] rather than null
+	}
+}
+
+// evalOutcomesIngestCmd is the `ao eval outcomes ingest` subcommand. It takes
+// exactly one argument: the path of the score payload file to ingest.
+// NOTE(review): Use names no placeholder for that argument — confirm intended.
+var evalOutcomesIngestCmd = &cobra.Command{
+	Use:   "ingest ",
+	Short: "Ingest an Outcomes score payload into the one council verdict record",
+	Args:  cobra.ExactArgs(1),
+	RunE:  runEvalOutcomesIngest,
+}
+
+// runEvalOutcomesIngest reads the score payload at args[0], decodes it into an
+// outcomesScore, and prints the resulting verdict record as indented JSON to
+// the command's output stream. Read, parse, encode, and write failures are
+// returned wrapped with context.
+func runEvalOutcomesIngest(cmd *cobra.Command, args []string) error {
+	path := args[0]
+	raw, err := os.ReadFile(path)
+	if err != nil {
+		return fmt.Errorf("read %s: %w", path, err)
+	}
+	var s outcomesScore
+	if err := json.Unmarshal(raw, &s); err != nil {
+		return fmt.Errorf("parse %s: %w", path, err)
+	}
+	out, err := json.MarshalIndent(ingestOutcomesScore(s), "", " ")
+	if err != nil {
+		return fmt.Errorf("encode verdict: %w", err)
+	}
+	if _, err := fmt.Fprintln(cmd.OutOrStdout(), string(out)); err != nil {
+		return fmt.Errorf("write verdict: %w", err)
+	}
+	return nil
+}
+
+// init registers the ingest subcommand under evalOutcomesCmd.
+func init() {
+	evalOutcomesCmd.AddCommand(evalOutcomesIngestCmd)
+}
diff --git a/cli/cmd/ao/eval_outcomes_ingest_test.go b/cli/cmd/ao/eval_outcomes_ingest_test.go
new file mode 100644
index 000000000..9e61fcb63
--- /dev/null
+++ b/cli/cmd/ao/eval_outcomes_ingest_test.go
@@ -0,0 +1,67 @@
+package main
+
+import "testing"
+
+// TestIngestOutcomesScore_BreakdownIsOwnedCopy: satisfaction_breakdown is never
+// nil (so it encodes as {} rather than null) and never aliases the input map.
+func TestIngestOutcomesScore_BreakdownIsOwnedCopy(t *testing.T) {
+	if v := ingestOutcomesScore(outcomesScore{Aggregate: 0.9, Threshold: 0.8}); v.SatisfactionBreakdown == nil {
+		t.Error("SatisfactionBreakdown = nil, want empty non-nil map")
+	}
+	scores := map[string]float64{"accuracy": 0.95}
+	v := ingestOutcomesScore(outcomesScore{Aggregate: 0.9, Threshold: 0.8, CriterionScores: scores})
+	scores["accuracy"] = 0
+	if v.SatisfactionBreakdown["accuracy"] != 0.95 {
+		t.Errorf("breakdown[accuracy] = %v, want 0.95 after mutating the input", v.SatisfactionBreakdown["accuracy"])
+	}
+}
+
+// TestIngestOutcomesScore_ProducesVerdictRecord: an Outcomes score at/above the
+// rubric threshold maps to a PASS verdict record in the one council verdict shape
+// (skills/council/schemas/verdict.json) — closing 
the Outcomes → Knowledge
+// Flywheel loop without forking the verdict format.
+func TestIngestOutcomesScore_ProducesVerdictRecord(t *testing.T) {
+	verdict := ingestOutcomesScore(outcomesScore{
+		SourceTaskID:     "task-capital-cities",
+		JudgeContentHash: "sha256:abc123",
+		Aggregate:        0.92,
+		Threshold:        0.8,
+		CriterionScores:  map[string]float64{"accuracy": 0.95, "concision": 0.85},
+	})
+
+	// Aggregate 0.92 clears the 0.8 threshold, so the record must be a PASS.
+	if got := verdict.Verdict; got != "PASS" {
+		t.Errorf("Verdict = %q, want PASS", got)
+	}
+	if score := verdict.SatisfactionScore; score == nil || *score != 0.92 {
+		t.Errorf("SatisfactionScore = %v, want 0.92", score)
+	}
+	if got := verdict.SatisfactionBreakdown["accuracy"]; got != 0.95 {
+		t.Errorf("breakdown[accuracy] = %v, want 0.95", got)
+	}
+	if got := verdict.SchemaVersion; got != 4 {
+		t.Errorf("SchemaVersion = %d, want 4", got)
+	}
+	if verdict.Findings == nil {
+		t.Error("Findings must be a non-nil (possibly empty) slice for verdict.json validity")
+	}
+}
+
+// TestIngestOutcomesScore_VerdictBands: an aggregate at or above the threshold is
+// PASS, one below it is WARN, and one below 70% of the threshold is FAIL.
+func TestIngestOutcomesScore_VerdictBands(t *testing.T) {
+	bands := []struct {
+		aggregate float64
+		verdict   string
+	}{
+		{aggregate: 0.90, verdict: "PASS"}, // at or above the 0.8 threshold
+		{aggregate: 0.70, verdict: "WARN"}, // below 0.8 but not below 0.56
+		{aggregate: 0.40, verdict: "FAIL"}, // below 0.56 (70% of the threshold)
+	}
+	for _, band := range bands {
+		score := outcomesScore{Aggregate: band.aggregate, Threshold: 0.8}
+		if got := ingestOutcomesScore(score).Verdict; got != band.verdict {
+			t.Errorf("aggregate %.2f: Verdict = %q, want %q", band.aggregate, got, band.verdict)
+		}
+	}
+}
diff --git a/cli/docs/COMMANDS.md b/cli/docs/COMMANDS.md
index ee1f8ca5e..d6a583070 100644
--- a/cli/docs/COMMANDS.md
+++ b/cli/docs/COMMANDS.md
@@ -1448,6 +1448,14 @@ Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria
 ao eval outcomes compile [flags]
 ```
 
+##### `ao eval outcomes ingest`
+
+Ingest an Outcomes score payload into the one council verdict record
+
+```
+ao eval outcomes ingest [flags]
+```
+
 #### `ao eval run`
 
 Run a deterministic eval suite
diff --git a/scripts/cmdao-surface-allowlist.txt b/scripts/cmdao-surface-allowlist.txt
index 63015922e..ca161e57f 100644
--- a/scripts/cmdao-surface-allowlist.txt
+++ b/scripts/cmdao-surface-allowlist.txt
@@ -105,3 +105,4 @@ public-stateful-fixture-needed|overnight curator enqueue|Writes jobs into a conf
 public-stateful-fixture-needed|overnight curator event|Appends events to a configured local curator ledger.
 manual-only|overnight curator status|Depends on local curator, runner, and Ollama environment.
 public-stateful-fixture-needed|eval outcomes compile|Holdout-safe rubric projection; core logic unit-tested (eval_outcomes_test.go compileOutcomesRubric); CLI smoke needs an input.json fixture (follow-up).
+public-stateful-fixture-needed|eval outcomes ingest|Maps an Outcomes score to the council verdict record; core logic unit-tested (eval_outcomes_ingest_test.go ingestOutcomesScore); CLI smoke needs a score.json fixture (follow-up).