Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions cli/cmd/ao/eval_outcomes_ingest.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package main

import (
"encoding/json"
"fmt"
"os"

"github.com/spf13/cobra"
)

// outcomesScore is the score payload returned by an Outcomes grader (cloud
// Managed Agent, async job, or the local Codex/NTM path). It carries only
// aggregate + per-criterion scores — never holdout answers.
//
// It is decoded from the JSON file passed to `ao eval outcomes ingest`; a field
// missing from that file is left at its Go zero value.
type outcomesScore struct {
// SourceTaskID names the graded task; ingestOutcomesScore quotes it in the
// verdict's key_insight.
SourceTaskID string `json:"source_task_id"`
// JudgeContentHash is decoded but not read by the ingest mapping in this file.
// NOTE(review): presumably a content hash identifying the judge — confirm.
JudgeContentHash string `json:"judge_content_hash"`
// Aggregate is the overall score; it is compared against Threshold to pick the
// verdict and is emitted as satisfaction_score.
Aggregate float64 `json:"aggregate"`
// Threshold is the bar Aggregate must reach for PASS. ingestOutcomesScore
// treats a value <= 0 (including an absent field) as 1.0.
Threshold float64 `json:"threshold"`
// CriterionScores holds the per-criterion scores keyed by criterion; it is
// passed through as satisfaction_breakdown.
CriterionScores map[string]float64 `json:"criterion_scores"`
}

// outcomesVerdict is the one verdict record (skills/council/schemas/verdict.json
// shape). ao eval outcomes ingest emits this so an Outcomes score feeds the
// Knowledge Flywheel exactly as a local `ao eval run` verdict does — no second
// verdict format, no alternate bar.
//
// No field uses omitempty, so every key is always present in the encoded JSON.
type outcomesVerdict struct {
// Verdict is "PASS", "WARN", or "FAIL" as chosen by ingestOutcomesScore.
Verdict string `json:"verdict"`
// Confidence is always set to "HIGH" by ingestOutcomesScore.
Confidence string `json:"confidence"`
// KeyInsight is a one-line summary of aggregate vs threshold for the task.
KeyInsight string `json:"key_insight"`
// Recommendation is a one-line note stating which verdict was ingested.
Recommendation string `json:"recommendation"`
// SchemaVersion is set to 4 by ingestOutcomesScore.
SchemaVersion int `json:"schema_version"`
// SatisfactionScore carries the aggregate score. As a pointer without
// omitempty, a nil value would encode as JSON null; ingestOutcomesScore
// always sets it.
SatisfactionScore *float64 `json:"satisfaction_score"`
// SatisfactionBreakdown carries the per-criterion scores. A nil map encodes
// as JSON null.
SatisfactionBreakdown map[string]float64 `json:"satisfaction_breakdown"`
// Findings is populated with an empty, non-nil slice by ingestOutcomesScore so
// it encodes as [] rather than null.
Findings []map[string]any `json:"findings"`
}

// ingestOutcomesScore converts an Outcomes score into a council verdict record.
//
// The verdict band comes from comparing the aggregate with the rubric
// threshold; a threshold <= 0 is replaced by 1.0 before the comparison:
//
//   - PASS: aggregate >= threshold
//   - FAIL: aggregate < 70% of threshold
//   - WARN: everything in between
//
// The aggregate is reported as satisfaction_score, the per-criterion scores are
// handed through as satisfaction_breakdown (the same map, not a copy), and
// findings is always an empty, non-nil list.
func ingestOutcomesScore(s outcomesScore) outcomesVerdict {
	bar := s.Threshold
	if bar <= 0 {
		bar = 1.0
	}

	var band string
	if s.Aggregate >= bar {
		band = "PASS"
	} else if s.Aggregate < bar*0.7 {
		band = "FAIL"
	} else {
		band = "WARN"
	}

	// Take the address of a local copy so the record does not point into s.
	score := s.Aggregate

	var rec outcomesVerdict
	rec.Verdict = band
	rec.Confidence = "HIGH"
	rec.KeyInsight = fmt.Sprintf(
		"Outcomes aggregate %.4f vs threshold %.4f for task %q",
		s.Aggregate, bar, s.SourceTaskID,
	)
	rec.Recommendation = fmt.Sprintf(
		"Outcomes grade ingested as %s; feeds the corpus via the eval-verdict pipeline.",
		band,
	)
	rec.SchemaVersion = 4
	rec.SatisfactionScore = &score
	rec.SatisfactionBreakdown = s.CriterionScores
	rec.Findings = []map[string]any{}
	return rec
}

// evalOutcomesIngestCmd defines the `ingest` subcommand. It requires exactly one
// positional argument (the path to a score JSON file) and delegates to
// runEvalOutcomesIngest. init attaches it to evalOutcomesCmd.
var evalOutcomesIngestCmd = &cobra.Command{
Use: "ingest <score.json>",
Short: "Ingest an Outcomes score payload into the one council verdict record",
Args: cobra.ExactArgs(1),
RunE: runEvalOutcomesIngest,
}

// runEvalOutcomesIngest implements `ao eval outcomes ingest <score.json>`.
//
// It reads the file named by args[0], decodes it as an outcomesScore, maps it
// through ingestOutcomesScore, and writes the resulting verdict record as
// indented JSON followed by a newline to the command's output stream.
//
// It returns a wrapped error when the file cannot be read, the payload does not
// decode into an outcomesScore, the verdict cannot be encoded, or the output
// cannot be written.
func runEvalOutcomesIngest(cmd *cobra.Command, args []string) error {
	path := args[0]
	raw, err := os.ReadFile(path)
	if err != nil {
		return fmt.Errorf("read %s: %w", path, err)
	}
	var s outcomesScore
	if err := json.Unmarshal(raw, &s); err != nil {
		return fmt.Errorf("parse %s: %w", path, err)
	}
	out, err := json.MarshalIndent(ingestOutcomesScore(s), "", " ")
	if err != nil {
		return fmt.Errorf("encode verdict: %w", err)
	}
	// Surface write failures (e.g. a closed pipe) instead of reporting success
	// with a missing or truncated verdict.
	if _, err := fmt.Fprintln(cmd.OutOrStdout(), string(out)); err != nil {
		return fmt.Errorf("write verdict: %w", err)
	}
	return nil
}

// init registers the ingest subcommand under evalOutcomesCmd, which is declared
// outside this file.
func init() {
evalOutcomesCmd.AddCommand(evalOutcomesIngestCmd)
}
53 changes: 53 additions & 0 deletions cli/cmd/ao/eval_outcomes_ingest_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package main

import "testing"

// TestIngestOutcomesScore_ProducesVerdictRecord: an Outcomes score at/above the
// rubric threshold maps to a PASS verdict record in the one council verdict shape
// (skills/council/schemas/verdict.json) — closing the Outcomes → Knowledge
// Flywheel loop without forking the verdict format.
//
// It checks the verdict, the satisfaction score and breakdown, the schema
// version, and that findings is a non-nil slice.
func TestIngestOutcomesScore_ProducesVerdictRecord(t *testing.T) {
	s := outcomesScore{
		SourceTaskID:     "task-capital-cities",
		JudgeContentHash: "sha256:abc123",
		Aggregate:        0.92,
		Threshold:        0.8,
		CriterionScores:  map[string]float64{"accuracy": 0.95, "concision": 0.85},
	}
	v := ingestOutcomesScore(s)

	if v.Verdict != "PASS" {
		t.Errorf("Verdict = %q, want PASS", v.Verdict)
	}
	// Report nil and wrong-value separately, and dereference before formatting:
	// %v on a *float64 prints the pointer address, not the score.
	switch {
	case v.SatisfactionScore == nil:
		t.Error("SatisfactionScore = nil, want 0.92")
	case *v.SatisfactionScore != 0.92:
		t.Errorf("SatisfactionScore = %v, want 0.92", *v.SatisfactionScore)
	}
	if v.SatisfactionBreakdown["accuracy"] != 0.95 {
		t.Errorf("breakdown[accuracy] = %v, want 0.95", v.SatisfactionBreakdown["accuracy"])
	}
	if v.SchemaVersion != 4 {
		t.Errorf("SchemaVersion = %d, want 4", v.SchemaVersion)
	}
	if v.Findings == nil {
		t.Error("Findings must be a non-nil (possibly empty) slice for verdict.json validity")
	}
}

// TestIngestOutcomesScore_VerdictBands: aggregate at or above the threshold is
// PASS, below it downgrades to WARN, and far below (< 70% of threshold) to FAIL.
//
// Besides one representative value per band, the table pins the two inclusive
// boundaries (aggregate == threshold is PASS, aggregate == 70% of threshold is
// WARN) and the fallback that treats a threshold <= 0 as 1.0.
func TestIngestOutcomesScore_VerdictBands(t *testing.T) {
	cases := []struct {
		name      string
		agg       float64
		threshold float64
		want      string
	}{
		{"above threshold", 0.90, 0.8, "PASS"},            // >= 0.8
		{"exactly at threshold", 0.80, 0.8, "PASS"},       // boundary is inclusive
		{"between bands", 0.70, 0.8, "WARN"},              // < 0.8 but >= 0.56
		{"far below threshold", 0.40, 0.8, "FAIL"},        // < 0.56
		{"zero threshold, full score", 1.00, 0, "PASS"},   // threshold falls back to 1.0
		{"zero threshold, exactly 70%", 0.70, 0, "WARN"},  // 0.7 is not < 1.0*0.7
		{"zero threshold, below 70%", 0.50, 0, "FAIL"},    // < 0.7
		{"negative threshold", 0.90, -1, "WARN"},          // also falls back to 1.0
	}
	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := ingestOutcomesScore(outcomesScore{Aggregate: c.agg, Threshold: c.threshold}).Verdict
			if got != c.want {
				t.Errorf("aggregate %.2f, threshold %.2f: Verdict = %q, want %q",
					c.agg, c.threshold, got, c.want)
			}
		})
	}
}
8 changes: 8 additions & 0 deletions cli/docs/COMMANDS.md
Original file line number Diff line number Diff line change
Expand Up @@ -1448,6 +1448,14 @@ Compile a holdout-safe Outcomes rubric payload from a locked Task + criteria
ao eval outcomes compile <input.json> [flags]
```

##### `ao eval outcomes ingest`

Ingest an Outcomes score payload into the one council verdict record

```
ao eval outcomes ingest <score.json> [flags]
```

#### `ao eval run`

Run a deterministic eval suite
Expand Down
1 change: 1 addition & 0 deletions scripts/cmdao-surface-allowlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,4 @@ public-stateful-fixture-needed|overnight curator enqueue|Writes jobs into a conf
public-stateful-fixture-needed|overnight curator event|Appends events to a configured local curator ledger.
manual-only|overnight curator status|Depends on local curator, runner, and Ollama environment.
public-stateful-fixture-needed|eval outcomes compile|Holdout-safe rubric projection; core logic unit-tested (eval_outcomes_test.go compileOutcomesRubric); CLI smoke needs an input.json fixture (follow-up).
public-stateful-fixture-needed|eval outcomes ingest|Maps an Outcomes score to the council verdict record; core logic unit-tested (eval_outcomes_ingest_test.go ingestOutcomesScore); CLI smoke needs a score.json fixture (follow-up).
Loading