diff --git a/dashboard/dist/index.html b/dashboard/dist/index.html
index efac6bf..19d87d5 100644
--- a/dashboard/dist/index.html
+++ b/dashboard/dist/index.html
@@ -24,7 +24,7 @@
}
})();
-
-
+`).trim()}function Xme({turn:e}){const t=e.parts??[],n=t.length>0,{reasoning:r,response:i}=n?Yme(t):{reasoning:[],response:[]},l=n?null:(()=>{const{reasoning:d,body:g}=Ume(e.content??"");return{reasoning:d,body:g,tools:e.tool_calls??[]}})(),s=n?AP(i):null,u=n&&r.length>0?AP(r):null,f=n?Wme(i.length>0?i:t):l?.body??e.content??"";return p.jsxs(G4,{from:"assistant",children:[p.jsxs(W4,{children:[n?p.jsxs(p.Fragment,{children:[u&&p.jsx(Dme,{count:r.length,children:p.jsx(CP,{segments:u})}),s&&p.jsx(CP,{segments:s})]}):l&&p.jsxs(p.Fragment,{children:[l.reasoning&&p.jsx(DS,{children:p.jsx(Af,{className:"text-xs text-muted-foreground",children:l.reasoning})}),l.body&&p.jsx("div",{className:"text-[1rem] leading-relaxed text-foreground",children:eg(l.body)}),l.tools.length>0&&p.jsx("div",{className:"flex flex-col gap-1.5",children:l.tools.map((d,g)=>p.jsx(J4,{name:d.name,input:d.args},g))})]}),p.jsx(Kme,{checkpoints:e.checkpoints??[]})]}),p.jsxs("div",{className:"flex items-center justify-between gap-2",children:[p.jsx(Z4,{turn:e}),f.length>0&&p.jsx(Pme,{children:p.jsx(Rme,{text:f})})]})]})}function zS({detail:e}){const t=Ime(e);return p.jsx("div",{className:xe("flex flex-col gap-6 px-1 py-2"),children:t.map((n,r)=>$me(n)?p.jsx(Fme,{turn:n},r):n.role==="user"?p.jsx(qme,{turn:n},r):n.role==="assistant"?p.jsx(Xme,{turn:n},r):p.jsx(Hme,{turn:n},r))})}function _P(e){return Number.isFinite(e)?e.toFixed(3).replace(/0+$/,"").replace(/\.$/,""):"n/a"}function kP(e){return Number.isFinite(e)?`${Math.round(e*100)}%`:"n/a"}function Qme({value:e}){const t=Math.max(0,Math.min(1,e));return p.jsx("div",{className:"h-1.5 w-full overflow-hidden rounded-full bg-muted",children:p.jsx("div",{className:"h-full rounded-full bg-foreground/70",style:{width:`${t*100}%`}})})}function Zme({passed:e}){return e?p.jsxs("span",{className:"inline-flex items-center gap-1 rounded-full border border-emerald-500/40 bg-emerald-500/10 px-2 py-0.5 text-[10px] font-semibold uppercase tracking-wider 
text-emerald-700 dark:text-emerald-400",children:[p.jsx(g1,{size:11,strokeWidth:2.5}),"Pass"]}):p.jsxs("span",{className:"inline-flex items-center gap-1 rounded-full border border-rose-500/40 bg-rose-500/10 px-2 py-0.5 text-[10px] font-semibold uppercase tracking-wider text-rose-700 dark:text-rose-400",children:[p.jsx(EM,{size:11,strokeWidth:2.5}),"Fail"]})}function Jme({children:e,count:t}){return p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("div",{className:"text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t!=null&&p.jsx("div",{className:"font-mono text-[10px] text-muted-foreground/70",children:t}),p.jsx("div",{className:"h-px flex-1 bg-border"})]})}function tg({m:e}){return p.jsxs("div",{className:"flex items-center gap-3 rounded-md border border-border bg-card/40 px-3 py-2",children:[p.jsx("div",{className:"min-w-[150px] font-mono text-[11px] text-foreground",children:e.metric}),p.jsx("div",{className:"flex-1",children:p.jsx(Qme,{value:e.value})}),p.jsx("div",{className:"min-w-[48px] text-right font-mono text-[11px] text-foreground",children:_P(e.value)}),p.jsxs("div",{className:"min-w-[36px] text-right font-mono text-[10px] text-muted-foreground",children:["×",_P(e.weight)]})]})}function ng({icon:e,title:t,subtitle:n,weightedScore:r,passThreshold:i,passed:l,source:s}){return p.jsxs("header",{className:"flex items-center justify-between gap-4 rounded-md border border-border bg-muted/30 px-3 py-2",children:[p.jsxs("div",{className:"flex items-center gap-2.5",children:[p.jsx("div",{className:"flex h-7 w-7 items-center justify-center rounded-md border border-border bg-card text-muted-foreground",children:e}),p.jsxs("div",{className:"flex flex-col",children:[p.jsx("div",{className:"text-sm font-semibold text-foreground",children:t}),n&&p.jsx("div",{className:"text-[11px] text-muted-foreground",children:n})]})]}),p.jsxs("div",{className:"flex items-center gap-3 
text-[11px]",children:[p.jsxs("div",{className:"flex flex-col items-end leading-tight",children:[p.jsxs("div",{className:"font-mono text-foreground",children:[kP(r),p.jsxs("span",{className:"text-muted-foreground",children:[" ","/ ",kP(i)]})]}),p.jsxs("div",{className:"text-[10px] text-muted-foreground",children:["source: ",s]})]}),p.jsx(Zme,{passed:l})]})]})}function rg(e){if(e.length===0)return{weightedScore:0,passThreshold:0,passed:!1,source:"missing"};const t=e[0];return{weightedScore:t.weighted_score,passThreshold:t.pass_threshold,passed:t.passed,source:t.source}}function ege({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0];return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(AM,{size:14,strokeWidth:2.25}),title:"Retrieval ranking",subtitle:`k=${n?.k??"n/a"}, ${n?.hit_count??0}/${n?.total_relevant??0} hits, ${n?.forbidden_hits??0} forbidden`,weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((r,i)=>p.jsx(tg,{m:r},`${r.metric}-${i}`))})]})}function tge({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0],r=[];return n&&(r.push(`timestamp violations: ${n.timestamp_violation_count}`),n.cascade_bounded===!0?r.push("cascade: bounded"):n.cascade_bounded===!1&&r.push("cascade: RUNAWAY")),p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(Z8,{size:14,strokeWidth:2.25}),title:"Demotion correctness",subtitle:r.join(" · "),weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((i,l)=>p.jsx(tg,{m:i},`${i.metric}-${l}`))})]})}function nge({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0],r=Array.isArray(n?.predicted)?n.predicted:[],i=Array.isArray(n?.golden)?n.golden:[];return p.jsxs("section",{className:"flex flex-col 
gap-2",children:[p.jsx(ng,{icon:p.jsx(q8,{size:14,strokeWidth:2.25}),title:"Procedure extraction",subtitle:`predicted ${r.length} steps · golden ${i.length} steps`,weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((l,s)=>p.jsx(tg,{m:l},`${l.metric}-${s}`))}),(r.length>0||i.length>0)&&p.jsxs("div",{className:"grid grid-cols-1 gap-3 sm:grid-cols-2",children:[p.jsx(NP,{title:"Predicted",items:r}),p.jsx(NP,{title:"Golden",items:i})]})]})}function NP({title:e,items:t}){return p.jsxs("div",{className:"rounded-md border border-border bg-background p-2.5",children:[p.jsx("div",{className:"mb-1.5 text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t.length===0?p.jsx("div",{className:"text-[11px] text-muted-foreground",children:"(empty)"}):p.jsx("ol",{className:"flex flex-col gap-1 font-mono text-[11px] text-foreground",children:t.map((n,r)=>p.jsxs("li",{className:"flex gap-2",children:[p.jsxs("span",{className:"text-muted-foreground",children:[r+1,"."]}),p.jsx("span",{children:n})]},`${e}-${r}`))})]})}function rge({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0],r=Array.isArray(n?.predicted)?n.predicted:[],i=Array.isArray(n?.golden)?n.golden:[];return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(dF,{size:14,strokeWidth:2.25}),title:"Deduplication",subtitle:`items: ${n?.item_count??0} · predicted ${r.length} clusters · golden ${i.length} clusters`,weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((l,s)=>p.jsx(tg,{m:l},`${l.metric}-${s}`))}),(r.length>0||i.length>0)&&p.jsxs("div",{className:"grid grid-cols-1 gap-3 sm:grid-cols-2",children:[p.jsx(TP,{title:"Predicted",clusters:r}),p.jsx(TP,{title:"Golden",clusters:i})]})]})}function TP({title:e,clusters:t}){return 
p.jsxs("div",{className:"rounded-md border border-border bg-background p-2.5",children:[p.jsx("div",{className:"mb-1.5 text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t.length===0?p.jsx("div",{className:"text-[11px] text-muted-foreground",children:"(empty)"}):p.jsx("div",{className:"flex flex-col gap-1.5",children:t.map((n,r)=>p.jsx("div",{className:xe("rounded border border-border bg-muted/30 px-2 py-1 font-mono text-[11px] leading-relaxed text-foreground"),children:n.join(", ")},`${e}-${r}`))})]})}function e5({detail:e}){const t=e.retrieval_scores??[],n=e.demotion_scores??[],r=e.procedure_scores??[],i=e.dedup_scores??[],l=t.length+n.length+r.length+i.length;return l===0?p.jsxs("div",{className:"flex flex-col items-center justify-center gap-2 rounded-md border border-dashed border-border px-4 py-12 text-center",children:[p.jsx(AM,{size:18,strokeWidth:2,className:"text-muted-foreground"}),p.jsx("div",{className:"text-sm font-medium text-foreground",children:"No quantitative eval scores"}),p.jsx("p",{className:"max-w-md text-[12px] text-muted-foreground",children:"This scenario didn't declare a retrieval, demotion, procedure, or dedup block. 
Add one to its YAML to get IR-style metrics here."})]}):p.jsxs("div",{className:"flex flex-col gap-5",children:[p.jsx(Jme,{count:l,children:"Eval scores"}),p.jsx(ege,{metrics:t}),p.jsx(tge,{metrics:n}),p.jsx(nge,{metrics:r}),p.jsx(rge,{metrics:i})]})}function t5(e){return(e.retrieval_scores?.length??0)>0||(e.demotion_scores?.length??0)>0||(e.procedure_scores?.length??0)>0||(e.dedup_scores?.length??0)>0}function ige(e){return e.passed?"pass":e.status==="error"||e.overall_score==null&&e.passed===void 0?"unknown":"fail"}function n5({children:e,count:t}){return p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("div",{className:"text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t!=null&&p.jsx("div",{className:"text-[10px] font-mono text-muted-foreground/70",children:t}),p.jsx("div",{className:"h-px flex-1 bg-border"})]})}function age({verdict:e}){return e==="pass"?p.jsxs("span",{className:"inline-flex items-center gap-1.5 rounded-full border border-emerald-500/40 bg-emerald-500/10 px-2.5 py-1 text-xs font-semibold uppercase tracking-wider text-emerald-700 dark:text-emerald-400",children:[p.jsx(g1,{size:14,strokeWidth:2.5}),"Pass"]}):e==="fail"?p.jsxs("span",{className:"inline-flex items-center gap-1.5 rounded-full border border-rose-500/40 bg-rose-500/10 px-2.5 py-1 text-xs font-semibold uppercase tracking-wider text-rose-700 dark:text-rose-400",children:[p.jsx(EM,{size:14,strokeWidth:2.5}),"Fail"]}):p.jsxs("span",{className:"inline-flex items-center gap-1.5 rounded-full border border-border bg-muted px-2.5 py-1 text-xs font-semibold uppercase tracking-wider text-muted-foreground",children:[p.jsx(_8,{size:14,strokeWidth:2.5}),"Unknown"]})}function lge({detail:e}){const t=ige(e),n=e.overall_score,r=e.pass_threshold,i=Bp(n),l=r!=null?Math.max(0,Math.min(100,Math.round(r*100))):null,s=t==="pass"?"bg-emerald-500":t==="fail"?"bg-rose-500":"bg-muted-foreground/40";return p.jsxs("section",{className:"flex flex-col 
gap-3",children:[p.jsxs("div",{className:"flex items-center gap-2 text-[10px] font-semibold uppercase tracking-[0.18em] text-muted-foreground",children:[p.jsx(z8,{size:12,strokeWidth:2.5}),p.jsx("span",{children:"Verdict"})]}),p.jsxs("div",{className:"flex items-end justify-between gap-4",children:[p.jsxs("div",{className:"flex items-baseline gap-3",children:[p.jsx("div",{className:xe("font-mono text-4xl font-semibold tabular-nums leading-none tracking-tight",t==="pass"&&"text-emerald-700 dark:text-emerald-400",t==="fail"&&"text-rose-700 dark:text-rose-400",t==="unknown"&&"text-muted-foreground"),children:n!=null?n.toFixed(2):"—"}),r!=null&&p.jsxs("div",{className:"font-mono text-xs text-muted-foreground",children:["/ threshold ",r.toFixed(2)]})]}),p.jsx(age,{verdict:t})]}),p.jsxs("div",{className:"relative",children:[p.jsx("div",{className:"h-1.5 w-full overflow-hidden rounded-full bg-muted",children:p.jsx("div",{className:xe("h-full rounded-full transition-[width]",s),style:{width:`${i}%`}})}),l!=null&&p.jsx("div",{className:"absolute top-[-3px] h-3 w-px bg-foreground/60",style:{left:`${l}%`},"aria-hidden":!0,title:`Pass threshold ${r?.toFixed(2)}`})]}),e.judge?.model&&p.jsxs("div",{className:"flex flex-wrap items-center gap-1.5 text-[11px] text-muted-foreground",children:[p.jsx("span",{className:"text-muted-foreground/70",children:"Judged by"}),p.jsxs("span",{className:"font-mono text-foreground",children:[e.judge.provider?`${e.judge.provider} · `:"",e.judge.model]}),e.judge.temperature!=null&&p.jsxs("span",{className:"font-mono text-muted-foreground/80",children:["· t=",e.judge.temperature]})]})]})}function oge({notes:e}){return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(n5,{children:"Notes"}),p.jsxs("blockquote",{className:"relative rounded-md border-l-2 border-foreground/40 bg-muted/40 px-4 py-3 text-sm leading-relaxed text-foreground",children:[p.jsx(nF,{className:"absolute -left-px -top-2 h-3 w-3 -translate-x-1/2 rotate-180 
text-muted-foreground/40",strokeWidth:2.5,"aria-hidden":!0}),p.jsx(Af,{children:e})]})]})}function sge(e){const t=e.normalized_score;return t==null?{text:"text-muted-foreground",bar:"bg-muted-foreground/40",border:"border-border"}:t>=.85?{text:"text-emerald-700 dark:text-emerald-400",bar:"bg-emerald-500",border:"border-emerald-500/30"}:t>=.6?{text:"text-foreground",bar:"bg-foreground/70",border:"border-border"}:t>=.4?{text:"text-amber-700 dark:text-amber-400",bar:"bg-amber-500",border:"border-amber-500/30"}:{text:"text-rose-700 dark:text-rose-400",bar:"bg-rose-500",border:"border-rose-500/30"}}function uge({d:e}){const t=Bp(e.normalized_score),n=sge(e),r=e.raw_score!=null?`${e.raw_score}${e.scale_points!=null?`/${e.scale_points}`:""}`:"—",i=e.evidence??[],l=!!e.reasoning?.trim(),s=i.length>0;return p.jsxs("article",{className:xe("flex flex-col gap-3 rounded-lg border bg-background/50 px-4 py-3",n.border),children:[p.jsxs("header",{className:"flex items-start justify-between gap-3",children:[p.jsxs("div",{className:"min-w-0",children:[p.jsx("h3",{className:"truncate text-sm font-medium text-foreground",children:e.dimension_name}),p.jsx("p",{className:"truncate font-mono text-[11px] text-muted-foreground",children:e.dimension_id})]}),p.jsxs("div",{className:"flex shrink-0 items-baseline gap-2 text-right",children:[p.jsx("span",{className:xe("font-mono text-base font-semibold tabular-nums leading-none",n.text),children:r}),e.weight!=null&&p.jsxs("span",{className:"inline-flex items-center gap-1 rounded-sm border border-border bg-muted/60 px-1.5 py-0.5 text-[10px] font-mono text-muted-foreground",children:[p.jsx(lF,{size:10,strokeWidth:2.5}),"×",e.weight]})]})]}),p.jsxs("div",{className:"flex items-center gap-3",children:[p.jsx("div",{className:"h-1 flex-1 overflow-hidden rounded-full bg-muted",children:p.jsx("div",{className:xe("h-full rounded-full transition-[width]",n.bar),style:{width:`${t}%`}})}),p.jsxs("span",{className:"w-9 shrink-0 text-right font-mono 
text-[10px] tabular-nums text-muted-foreground",children:[t,"%"]})]}),(l||s)&&p.jsxs(DS,{label:"Reasoning",children:[l&&p.jsx(Af,{className:"text-xs text-muted-foreground",children:e.reasoning}),s&&p.jsxs("div",{className:"mt-2 flex flex-col gap-1.5",children:[p.jsx("div",{className:"text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground/70",children:"Evidence"}),i.map((u,f)=>p.jsx("div",{className:"rounded border-l-2 border-border bg-muted/30 px-2 py-1 font-mono text-[11px] leading-relaxed text-muted-foreground",children:u},f))]})]})]})}function r5({detail:e}){const t=[...e.judge_dimension_scores??[]].sort((i,l)=>(l.weight??0)-(i.weight??0)),n=e.judge?.overall_notes,r=e.judge?.output;return p.jsxs("div",{className:"flex flex-col gap-5",children:[p.jsx(lge,{detail:e}),n&&p.jsx(oge,{notes:n}),p.jsxs("section",{className:"flex flex-col gap-3",children:[p.jsx(n5,{count:t.length||void 0,children:"Dimensions"}),t.length>0?p.jsx("div",{className:"flex flex-col gap-2.5",children:t.map((i,l)=>p.jsx(uge,{d:i},l))}):p.jsx("p",{className:"text-sm text-muted-foreground",children:"No rubric dimensions recorded."})]}),r&&p.jsx(X4,{icon:p.jsx(OM,{size:14,strokeWidth:2.25}),title:"Raw judge output",description:"Full structured response from the judge model",children:p.jsx("pre",{className:"overflow-x-auto whitespace-pre-wrap break-words rounded-md border border-border bg-background p-3 font-mono text-[11px] leading-relaxed text-foreground",children:JSON.stringify(r,null,2)})})]})}function cge({detail:e,onClose:t}){const n=t5(e),[r,i]=x.useState("conversation"),l=e.status==="running",s=e.overall_score!=null?e.overall_score.toFixed(2):l?"...":"n/a",u=e.pass_threshold!=null?e.pass_threshold.toFixed(2):"n/a",f=l?"RUNNING":e.passed?"PASS":"FAIL",d=l?"detail-running":e.passed?"detail-pass":"detail-fail",g=typeof e.judge?.output=="object"&&e.judge?.output!=null?e.judge.output.failure_mode_detected:null;return 
p.jsxs(p.Fragment,{children:[p.jsx("div",{className:"detail-backdrop open",role:"presentation",onClick:t,onKeyDown:h=>{h.key==="Escape"&&t()}}),p.jsx("div",{className:"detail-overlay open",children:p.jsxs("div",{className:"detail-panel",children:[p.jsxs("div",{className:"detail-top",children:[p.jsx("button",{type:"button",className:"detail-close",onClick:t,children:"×"}),p.jsxs("div",{className:`detail-score-header ${d}`,children:[p.jsxs("div",{className:"detail-title-block",children:[p.jsxs("div",{className:"detail-name",children:[e.scenario_name,l&&p.jsxs("span",{className:"live-badge",style:{marginLeft:12,verticalAlign:"middle"},children:[p.jsx("span",{className:"live-dot"})," LIVE"]})]}),p.jsxs("div",{className:"detail-sid",children:[e.scenario_id,e.user_id?` / ${e.user_id}`:""]})]}),p.jsxs("div",{className:"detail-score-block",children:[p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Score"}),p.jsx("div",{className:"detail-score-value",children:s})]}),p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Threshold"}),p.jsx("div",{className:"detail-score-value",children:u})]}),p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Status"}),p.jsx("div",{className:"detail-score-value",children:f})]}),typeof g=="string"&&g&&p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Failure"}),p.jsx("div",{className:"detail-score-value",children:g})]})]}),p.jsx("div",{className:"detail-bar",children:p.jsx("div",{className:"detail-bar-fill",style:{width:`${Bp(e.overall_score)}%`}})})]}),p.jsxs("div",{className:"detail-tabs",children:[p.jsx("button",{type:"button",className:`tab-btn${r==="conversation"?" tab-active":""}`,onClick:()=>i("conversation"),children:"Conversation"}),p.jsx("button",{type:"button",className:`tab-btn${r==="rubric"?" 
tab-active":""}`,onClick:()=>i("rubric"),children:"Rubric"}),n&&p.jsx("button",{type:"button",className:`tab-btn${r==="evals"?" tab-active":""}`,onClick:()=>i("evals"),children:"Eval scores"})]})]}),p.jsxs("div",{className:"detail-body",children:[r==="conversation"&&p.jsx(zS,{detail:e}),r==="rubric"&&p.jsx(r5,{detail:e}),r==="evals"&&p.jsx(e5,{detail:e})]})]})})]})}function i5({data:e}){const t=e.total||1,n=e.passed/t*100,r=e.failed/t*100,i=e.running/t*100;return p.jsxs("div",{className:"progress-bar",style:{display:"flex"},children:[p.jsx("div",{className:"progress-fill progress-pass",style:{width:`${n}%`}}),p.jsx("div",{className:"progress-fill progress-fail",style:{width:`${r}%`}}),p.jsx("div",{className:"progress-fill progress-running",style:{width:`${i}%`}})]})}const fge={pending:"PENDING",running:"RUNNING",pass:"PASS",fail:"FAIL",error:"ERROR"};function dge({scenario:e}){const[t,n]=x.useState(Date.now()),r=x.useRef(void 0);if(x.useEffect(()=>{if(e.status==="running"&&e.started_at!=null)return r.current=setInterval(()=>n(Date.now()),1e3),()=>clearInterval(r.current);clearInterval(r.current)},[e.status,e.started_at]),e.started_at==null)return p.jsx(p.Fragment,{children:"-"});if(e.finished_at!=null)return p.jsxs(p.Fragment,{children:[(e.finished_at-e.started_at).toFixed(1),"s"]});const i=t/1e3-e.started_at;return p.jsx(p.Fragment,{children:i>0?`${i.toFixed(0)}s`:"-"})}function a5({data:e,runId:t,onSelect:n}){const r=i=>t?`/runs/${encodeURIComponent(t)}/scenarios/${i}`:null;return p.jsxs(p.Fragment,{children:[p.jsxs("div",{className:"section-title",children:["Scenarios"," ",p.jsx("span",{style:{color:"var(--muted)",fontWeight:400,fontSize:12},children:"(click a row to view conversation & 
rubric)"})]}),p.jsxs("table",{children:[p.jsx("thead",{children:p.jsxs("tr",{children:[p.jsx("th",{children:"ID"}),p.jsx("th",{children:"Name"}),p.jsx("th",{children:"Status"}),p.jsx("th",{style:{textAlign:"right"},children:"Score"}),p.jsx("th",{style:{textAlign:"right"},children:"Duration"}),p.jsx("th",{children:"Error"})]})}),p.jsx("tbody",{children:e.scenarios.map((i,l)=>{const s=l in e.details,u=r(l);return p.jsxs("tr",{className:`status-${i.status}${s?" clickable-row":""}`,onClick:s?()=>n(l):void 0,children:[p.jsx("td",{className:"id-cell",children:s&&u?p.jsx("a",{href:u,onClick:f=>f.stopPropagation(),style:{color:"inherit",textDecoration:"none"},children:i.scenario_id}):i.scenario_id}),p.jsx("td",{children:i.scenario_name??""}),p.jsx("td",{className:"status-badge",children:p.jsx("span",{children:fge[i.status]??i.status.toUpperCase()})}),p.jsx("td",{className:"score-cell",children:i.score!=null?i.score.toFixed(2):"-"}),p.jsx("td",{className:"duration-cell",children:p.jsx(dge,{scenario:i})}),p.jsx("td",{children:i.error&&p.jsx("span",{className:"error-text",title:i.error,children:i.error.slice(0,60)})})]},`${i.scenario_id}-${l}`)})})]})]})}function hge(e,t){const[n,r]=x.useState(0);return x.useEffect(()=>{if(t)return;const i=performance.now(),l=setInterval(()=>{r((performance.now()-i)/1e3)},500);return()=>{clearInterval(l),r(0)}},[e,t]),t?e:e+n}function pge(e){const t=Math.floor(e/60),n=Math.floor(e%60);return`${t}m ${n}s`}function Vl({value:e,label:t,color:n}){return p.jsxs("div",{className:"stat",children:[p.jsx("div",{className:"stat-value",style:{color:n},children:e}),p.jsx("div",{className:"stat-label",children:t})]})}function l5({data:e}){const t=hge(e.elapsed,e.all_done),n=e.total>0?Math.round(e.done/e.total*100):0;return 
p.jsxs("div",{className:"stats",children:[p.jsx(Vl,{value:`${e.done}/${e.total}`,label:"Completed",color:"var(--text)"}),p.jsx(Vl,{value:`${e.passed}`,label:"Passed",color:"var(--green)"}),p.jsx(Vl,{value:`${e.failed}`,label:"Failed",color:"var(--red)"}),p.jsx(Vl,{value:`${e.errored}`,label:"Errors",color:"var(--amber)"}),p.jsx(Vl,{value:`${e.running}`,label:"Running",color:"var(--blue)"}),p.jsx(Vl,{value:pge(t),label:"Elapsed",color:"var(--muted)"}),p.jsx(Vl,{value:`${n}%`,label:"Progress",color:"var(--indigo)"})]})}const o5="agentprobe:theme";function a1(){try{const e=window.localStorage.getItem(o5);if(e==="light"||e==="dark")return e}catch{}return null}function mge(){const e=a1();return e||(typeof window<"u"&&window.matchMedia?.("(prefers-color-scheme: dark)").matches?"dark":"light")}function gge(e){const t=document.documentElement;e==="dark"?t.classList.add("dark"):t.classList.remove("dark")}function vge(){const[e,t]=x.useState(()=>mge());return x.useEffect(()=>{gge(e);try{window.localStorage.setItem(o5,e)}catch{}},[e]),x.useEffect(()=>{if(a1())return;const r=window.matchMedia("(prefers-color-scheme: dark)"),i=l=>{a1()||t(l.matches?"dark":"light")};return r.addEventListener("change",i),()=>r.removeEventListener("change",i)},[]),{theme:e,setTheme:t,toggle:()=>t(n=>n==="dark"?"light":"dark")}}function yge(){const{theme:e,toggle:t}=vge(),n=e==="dark"?pF:X8;return p.jsx(fM,{variant:"ghost",size:"icon",onClick:t,"aria-label":e==="dark"?"Switch to light mode":"Switch to dark mode",title:e==="dark"?"Light mode":"Dark mode",className:"size-8",children:p.jsx(n,{className:"h-4 w-4"})})}const bge=2e3;function xge(){const[e,t]=x.useState(null),[n,r]=x.useState(null),i=x.useRef(!1);return x.useEffect(()=>{let l,s=!1;async function u(){try{const f=await fetch("/api/state");if(!f.ok)throw new Error(`HTTP ${f.status}`);const d=await f.json();if(s)return;t(d),r(null),i.current=d.all_done}catch(f){if(s)return;r(f instanceof Error?f.message:"Unknown error")}}return 
u(),l=setInterval(()=>{i.current||u()},bge),()=>{s=!0,clearInterval(l)}},[]),{data:e,error:n}}function wge({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState({}),[l,s]=x.useState(null),u=x.useCallback(async()=>{try{const[g,h]=await Promise.all([e("/api/suites"),e("/api/endpoint-overrides")]);n(g);const v={};for(const b of h.overrides)v[b.endpoint_path]={baseUrl:b.base_url,autogptJwtSecret:b.autogpt_jwt_secret};i(v),s(null)}catch(g){s(g instanceof Error?g.message:String(g))}},[e]);if(x.useEffect(()=>{let g=!1;return(async()=>g||await u())(),()=>{g=!0}},[u]),l&&!t)return p.jsx(yt,{message:l});if(!t)return p.jsx(rf,{});const f=t.suites.filter(g=>g.schema==="endpoints").map(g=>({relativePath:g.relativePath})),d=f.filter(g=>{const h=r[g.relativePath];return!!(h?.baseUrl||h?.autogptJwtSecret)}).length;return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Endpoints",title:"Endpoint overrides",meta:`${f.length} endpoint${f.length===1?"":"s"} · ${d} with overrides`}),l?p.jsx(yt,{message:l}):null,p.jsxs("p",{className:"text-sm text-muted-foreground mb-4",children:["Override values from any endpoint YAML. 
Saved overrides are applied whenever the dashboard server uses that endpoint, taking precedence over the YAML defaults (and any ",p.jsxs("code",{children:["$","{VAR}"]})," placeholders)."]}),f.length===0?p.jsx(rt,{className:"p-6 text-sm text-muted-foreground",children:"No endpoint suites found in your data path."}):p.jsx("div",{className:"flex flex-col gap-4",children:f.map(g=>p.jsx(Sge,{relativePath:g.relativePath,request:e,onChanged:()=>{u()}},g.relativePath))})]})}function Sge({relativePath:e,request:t,onChanged:n}){const[r,i]=x.useState(null),[l,s]=x.useState(!0),[u,f]=x.useState(""),[d,g]=x.useState(""),[h,v]=x.useState(!1),[b,S]=x.useState(null),[j,E]=x.useState(null),O=x.useCallback(async()=>{s(!0);try{const k=await t(`/api/endpoint-overrides/${encodeURIComponent(e)}`);i(k),f(k.override?.base_url??""),g(k.override?.autogpt_jwt_secret??""),E(null)}catch(k){E(k instanceof Error?k.message:String(k))}finally{s(!1)}},[e,t]);x.useEffect(()=>{let k=!1;return(async()=>k||await O())(),()=>{k=!0}},[O]);const C=async k=>{k.preventDefault(),v(!0),S(null);try{const I=u.trim(),R=d.trim(),F=await t(`/api/endpoint-overrides/${encodeURIComponent(e)}`,Si("PUT",{base_url:I||null,autogpt_jwt_secret:R||null}));i(D=>D&&{...D,override:I||R?F.override:null}),S(I||R?"Saved.":"Cleared."),E(null),n()}catch(I){E(I instanceof Error?I.message:String(I))}finally{v(!1)}},_=async()=>{v(!0),S(null);try{await t(`/api/endpoint-overrides/${encodeURIComponent(e)}`,Si("DELETE")),f(""),g(""),i(k=>k&&{...k,override:null}),S("Cleared."),E(null),n()}catch(k){E(k instanceof Error?k.message:String(k))}finally{v(!1)}},P=!!(r?.override?.base_url||r?.override?.autogpt_jwt_secret),T=r?.defaults.preset==="autogpt";return p.jsxs(rt,{className:"p-4",children:[p.jsx("div",{className:"flex items-center justify-between gap-3 mb-3",children:p.jsxs("div",{className:"flex items-center gap-2 min-w-0",children:[p.jsx("span",{className:"font-mono text-sm 
break-all",children:e}),r?.defaults.transport?p.jsx(Pt,{tone:"info",children:r.defaults.transport}):null,P?p.jsx(Pt,{tone:"warn",children:"override saved"}):null]})}),l?p.jsx("div",{className:"text-xs text-muted-foreground",children:"Loading…"}):p.jsxs("form",{onSubmit:C,className:"flex flex-col gap-3",children:[p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-3",children:[p.jsx(Xe,{label:"YAML default",hint:r?.defaults.base_url&&r.defaults.base_url_resolved&&r.defaults.base_url!==r.defaults.base_url_resolved?`Resolves to ${r.defaults.base_url_resolved}`:"From the endpoint YAML's connection.base_url / connection.url",children:p.jsx(Kt,{value:r?.defaults.base_url??"",readOnly:!0,disabled:!0,className:"font-mono text-xs"})}),p.jsx(Xe,{label:"Override",hint:"Applied for every run that uses this endpoint. Leave blank to remove.",children:p.jsx(Kt,{value:u,onChange:k=>f(k.currentTarget.value),placeholder:r?.defaults.base_url_resolved??"https://staging.example",className:"font-mono text-xs"})})]}),T?p.jsx(Xe,{label:"AutoGPT JWT secret override",hint:"Overrides the secret used to forge AutoGPT bearer tokens for this endpoint. 
Leave blank to remove and fall back to AUTOGPT_JWT_SECRET, JWT_SECRET, or the built-in dev default.",children:p.jsx(Kt,{type:"password",value:d,onChange:k=>g(k.currentTarget.value),placeholder:"your-super-secret-jwt-token...",autoComplete:"off",className:"font-mono text-xs"})}):null,p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx(Le,{type:"submit",disabled:h,children:h?"Saving…":"Save"}),P?p.jsx(Le,{type:"button",variant:"ghost",disabled:h,onClick:()=>{_()},children:"Clear override"}):null,b?p.jsx("span",{className:"text-xs text-success",children:b}):null,j?p.jsx("span",{className:"text-xs text-destructive",children:j}):null]})]})]})}function kt(e){return typeof e=="string"&&e.trim()?e:null}function ra(e){return Array.isArray(e)?e:[]}function Xr(e){return e&&typeof e=="object"&&!Array.isArray(e)?e:null}function jge(e){switch(e){case"user":return{variant:"info",label:"User"};case"assistant":return{variant:"default",label:"Assistant"};case"checkpoint":return{variant:"warning",label:"Checkpoint"};case"inject":return{variant:"destructive",label:"Inject"};default:return{variant:"secondary",label:e}}}function Es({children:e}){return p.jsx("div",{className:"text-[10px] uppercase tracking-[0.12em] text-muted-foreground font-semibold mt-5 mb-2",children:e})}function Xl({label:e,value:t,mono:n}){return p.jsxs("div",{className:"grid grid-cols-[120px_1fr] gap-2 py-1.5 text-sm border-b border-border last:border-b-0",children:[p.jsx("div",{className:"text-xs text-muted-foreground",children:e}),p.jsx("div",{className:`min-w-0 break-words ${n?"font-mono text-xs":""}`,children:t})]})}function io({text:e}){return p.jsx("pre",{className:"whitespace-pre-wrap text-sm text-foreground bg-secondary/40 border border-border rounded-md px-3 py-2 font-mono leading-relaxed overflow-x-auto",children:e})}function s5({turn:e,ordinal:t}){const n=String(e.role??"turn"),r=jge(n),i=kt(e.content),l=e.useExactMessage===!0,s=ra(e.attachments),u=ra(e.assertions);return 
p.jsxs(rt,{className:"p-3",children:[p.jsx("div",{className:"flex items-center justify-between mb-2 gap-2",children:p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("span",{className:"font-mono text-xs text-muted-foreground",children:t.toString().padStart(2,"0")}),p.jsx(ea,{variant:r.variant,className:"uppercase tracking-wider",children:r.label}),l?p.jsx(Pt,{tone:"warn",children:"verbatim"}):null]})}),i?p.jsx(io,{text:i}):null,s.length>0?p.jsx("div",{className:"mt-2 flex flex-wrap gap-1.5",children:s.map((f,d)=>{const g=Xr(f),h=kt(g?.path)??"",v=kt(g?.name)??h.split("/").pop();return p.jsx(Pt,{tone:"info",children:v||h},`${h}-${d}`)})}):null,n==="checkpoint"&&u.length>0?p.jsx("ul",{className:"mt-2 space-y-2",children:u.map((f,d)=>{const g=Xr(f);return g?p.jsxs("li",{className:"text-xs bg-secondary/40 border border-border rounded-md p-2 space-y-1",children:[kt(g.toolCalled)?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"tool:"})," ",p.jsx("span",{className:"font-mono",children:g.toolCalled})]}):null,Array.isArray(g.responseContainsAny)&&g.responseContainsAny.length>0?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"contains any:"})," ",p.jsx("span",{className:"font-mono",children:g.responseContainsAny.join(" | ")})]}):null,Array.isArray(g.responseMustNotContain)&&g.responseMustNotContain.length>0?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"must not contain:"})," ",p.jsx("span",{className:"font-mono",children:g.responseMustNotContain.join(" | ")})]}):null,kt(g.responseMentions)?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"mentions:"})," ",p.jsx("span",{className:"font-mono",children:g.responseMentions})]}):null,Xr(g.withArgs)?p.jsx("pre",{className:"font-mono text-[11px] mt-1 whitespace-pre-wrap",children:JSON.stringify(g.withArgs,null,2)}):null]},`assert-${d}`):null})}):null]})}function 
/**
 * Renders one session of a scenario: a header with the 1-based session
 * number plus optional id / time offset / reset / max-turns pills, followed
 * by a card per turn (or a placeholder when the session has no turns).
 */
function Ege({ session, index }) {
  const turns = ra(session.turns);

  const pill = (prefix, value) => p.jsxs(Pt, { children: [prefix, value] });

  const header = p.jsxs("div", {
    className: "flex items-center gap-2 mb-2",
    children: [
      p.jsxs(ea, {
        variant: "secondary",
        className: "uppercase",
        children: ["Session ", index + 1],
      }),
      kt(session.id)
        ? p.jsx("span", {
            className: "font-mono text-xs text-muted-foreground",
            children: session.id,
          })
        : null,
      kt(session.timeOffset) ? pill("+", session.timeOffset) : null,
      kt(session.reset) ? pill("reset: ", session.reset) : null,
      typeof session.maxTurns === "number"
        ? pill("max turns: ", session.maxTurns)
        : null,
    ],
  });

  const turnCards = turns.map((entry, position) => {
    const turn = Xr(entry);
    if (!turn) {
      return null;
    }
    return p.jsx(s5, { turn, ordinal: position + 1 }, `t-${position}`);
  });

  const emptyNotice =
    turns.length === 0
      ? p.jsx("div", {
          className: "text-sm text-muted-foreground italic",
          children: "No turns in this session.",
        })
      : null;

  return p.jsxs("div", {
    children: [
      header,
      p.jsxs("div", {
        className: "flex flex-col gap-2",
        children: [turnCards, emptyNotice],
      }),
    ],
  });
}
/**
 * Card summarising a scenario's expectations (outcome, expected behaviour,
 * must / must-not include chips, expected tool calls, failure modes, ground
 * truth and tester note). Returns `null` when none of those are present.
 */
function Oge({ expectations }) {
  const mustInclude = ra(expectations.mustInclude);
  const mustNotInclude = ra(expectations.mustNotInclude);
  const expectedTools = ra(expectations.expectedTools);
  const failureModes = ra(expectations.failureModes);
  const expectedBehavior = kt(expectations.expectedBehavior);
  const expectedOutcome = kt(expectations.expectedOutcome);
  const groundTruth = kt(expectations.groundTruth);
  const testerNote = kt(expectations.testerNote);

  const hasAnyList = [mustInclude, mustNotInclude, expectedTools, failureModes].some(
    (list) => list.length > 0,
  );
  const hasAnyText = Boolean(
    expectedBehavior || expectedOutcome || groundTruth || testerNote,
  );
  if (!hasAnyList && !hasAnyText) {
    return null;
  }

  // Titled sub-section: small muted caption followed by the section body.
  const section = (title, body) =>
    p.jsxs("div", {
      children: [
        p.jsx("div", {
          className: "text-xs text-muted-foreground mb-1",
          children: title,
        }),
        body,
      ],
    });

  const chips = (items, tone, keyPrefix) =>
    p.jsx("div", {
      className: "flex flex-wrap gap-1.5",
      children: items.map((item, position) =>
        p.jsx(Pt, { tone, children: item }, `${keyPrefix}-${position}`),
      ),
    });

  const renderTool = (entry, position) => {
    const tool = Xr(entry);
    if (!tool) {
      return null;
    }
    return p.jsxs(
      "li",
      {
        className: "flex items-center gap-2 text-xs",
        children: [
          p.jsx("span", { children: kt(tool.name) ?? "?" }),
          tool.required ? p.jsx(Pt, { tone: "warn", children: "required" }) : null,
          typeof tool.callOrder === "number"
            ? p.jsxs(Pt, { children: ["order: ", tool.callOrder] })
            : null,
        ],
      },
      `tool-${position}`,
    );
  };

  const renderFailureMode = (entry, position) => {
    const mode = Xr(entry);
    if (!mode) {
      return null;
    }
    return p.jsxs(
      "li",
      {
        className: "bg-secondary/40 border border-border rounded-md p-2",
        children: [
          p.jsx("div", {
            className: "font-medium text-sm",
            children: kt(mode.name) ?? "(unnamed)",
          }),
          kt(mode.description)
            ? p.jsx("div", {
                className: "text-xs text-muted-foreground mt-0.5",
                children: kt(mode.description),
              })
            : null,
        ],
      },
      `fm-${position}`,
    );
  };

  return p.jsxs(rt, {
    className: "p-4 space-y-3",
    children: [
      expectedOutcome ? p.jsx(Xl, { label: "Outcome", value: expectedOutcome }) : null,
      expectedBehavior
        ? section("Expected behavior", p.jsx(io, { text: expectedBehavior }))
        : null,
      mustInclude.length > 0
        ? section("Response must include", chips(mustInclude, "success", "m"))
        : null,
      mustNotInclude.length > 0
        ? section("Response must NOT include", chips(mustNotInclude, "warn", "mn"))
        : null,
      expectedTools.length > 0
        ? section(
            "Expected tool calls",
            p.jsx("ul", {
              className: "text-sm font-mono space-y-1",
              children: expectedTools.map(renderTool),
            }),
          )
        : null,
      failureModes.length > 0
        ? section(
            "Failure modes",
            p.jsx("ul", {
              className: "space-y-2",
              children: failureModes.map(renderFailureMode),
            }),
          )
        : null,
      groundTruth ? section("Ground truth", p.jsx(io, { text: groundTruth })) : null,
      testerNote ? section("Tester note", p.jsx(io, { text: testerNote })) : null,
    ],
  });
}

/**
 * Card showing a scenario's context: user-name / copilot-mode pills, the
 * system prompt and any injected data (pretty-printed JSON). Returns `null`
 * when none of the four context fields is present.
 */
function Age({ context }) {
  const systemPrompt = kt(context.systemPrompt);
  const userName = kt(context.userName);
  const copilotMode = kt(context.copilotMode);
  const injectedData = Xr(context.injectedData);

  if (!systemPrompt && !userName && !copilotMode && !injectedData) {
    return null;
  }

  const section = (title, text) =>
    p.jsxs("div", {
      children: [
        p.jsx("div", {
          className: "text-xs text-muted-foreground mb-1",
          children: title,
        }),
        p.jsx(io, { text }),
      ],
    });

  const pills =
    userName || copilotMode
      ? p.jsxs("div", {
          className: "flex flex-wrap gap-2",
          children: [
            userName ? p.jsxs(Pt, { tone: "info", children: ["user: ", userName] }) : null,
            copilotMode
              ? p.jsxs(Pt, { tone: "info", children: ["mode: ", copilotMode] })
              : null,
          ],
        })
      : null;

  return p.jsxs(rt, {
    className: "p-4 space-y-3",
    children: [
      pills,
      systemPrompt ? section("System prompt", systemPrompt) : null,
      // An empty injected-data object is treated as "nothing to show".
      injectedData && Object.keys(injectedData).length > 0
        ? section("Injected data", JSON.stringify(injectedData, null, 2))
        : null,
    ],
  });
}
/**
 * Scenario detail dialog.
 *
 * While `open` is true and a `target` ({ file, id, name?, description?,
 * tags?, priority? }) is set, looks the scenario up via
 * `request("/api/scenarios/lookup?file=…&id=…")` and renders its metadata,
 * context, sessions (or bare turns when there are no sessions), expectations
 * and — on demand — the raw spec as JSON. Values from the looked-up scenario
 * win over the summary values carried by `target`.
 *
 * Returns `null` when there is no target.
 */
function u5({ open, target, request, onClose }) {
  const [lookup, setLookup] = x.useState(null);
  const [error, setError] = x.useState(null);
  const [loading, setLoading] = x.useState(false);
  const [showRaw, setShowRaw] = x.useState(false);

  x.useEffect(() => {
    if (!open || !target) {
      return;
    }
    // Guards against applying a response after the dialog closed or the
    // target changed.
    let cancelled = false;
    setLoading(true);
    setError(null);
    setLookup(null);
    setShowRaw(false);
    const url = `/api/scenarios/lookup?file=${encodeURIComponent(target.file)}&id=${encodeURIComponent(target.id)}`;
    request(url)
      .then((response) => {
        if (!cancelled) {
          setLookup(response);
        }
      })
      .catch((err) => {
        if (!cancelled) {
          setError(err instanceof Error ? err.message : String(err));
        }
      })
      .finally(() => {
        if (!cancelled) {
          setLoading(false);
        }
      });
    return () => {
      cancelled = true;
    };
  }, [open, target, request]);

  if (!target) {
    return null;
  }

  const scenario = lookup?.scenario ?? null;
  const title = kt(scenario?.name) ?? target.name ?? target.id;
  const description = kt(scenario?.description) ?? target.description ?? null;
  // The raw spec is untyped: every other field is shape-checked before use,
  // and `tags` must be too, otherwise a non-array value (e.g. a single string)
  // would make `.map` throw during render. Fall back to the target's tags.
  const tags = Array.isArray(scenario?.tags) ? scenario.tags : ra(target.tags);
  const priority = kt(scenario?.priority) ?? target.priority ?? null;
  const persona = kt(scenario?.persona);
  const rubric = kt(scenario?.rubric);
  const maxTurns = scenario?.maxTurns;
  const baseDate = kt(scenario?.baseDate);
  const sessions = ra(scenario?.sessions);
  const turns = ra(scenario?.turns);
  const expectations = Xr(scenario?.expectations);
  const context = Xr(scenario?.context);

  const titleNode = p.jsxs("div", {
    className: "flex items-baseline gap-2 flex-wrap pr-6",
    children: [
      p.jsx("span", { className: "text-foreground", children: title }),
      priority ? p.jsx(Pt, { tone: "info", children: priority }) : null,
    ],
  });

  const descriptionNode = p.jsxs("span", {
    className: "font-mono text-xs text-muted-foreground",
    children: [target.id, " · ", target.file],
  });

  const footer = p.jsxs(p.Fragment, {
    children: [
      p.jsx(Le, {
        variant: "ghost",
        size: "sm",
        onClick: () => setShowRaw((current) => !current),
        disabled: !scenario,
        children: showRaw ? "Hide raw spec" : "View raw spec",
      }),
      p.jsx(Le, { onClick: onClose, children: "Close" }),
    ],
  });

  const tagsRow =
    tags.length > 0
      ? p.jsx(Xl, {
          label: "Tags",
          value: p.jsx("div", {
            className: "flex flex-wrap gap-1.5",
            // Index-suffixed keys stay unique even if a tag is repeated.
            children: tags.map((tag, index) =>
              p.jsx(Pt, { children: tag }, `${tag}-${index}`),
            ),
          }),
        })
      : null;

  const metadataCard = p.jsxs(rt, {
    className: "px-4 py-2",
    children: [
      p.jsx(Xl, { label: "Suite", value: target.file, mono: true }),
      persona ? p.jsx(Xl, { label: "Persona", value: persona, mono: true }) : null,
      rubric ? p.jsx(Xl, { label: "Rubric", value: rubric, mono: true }) : null,
      typeof maxTurns === "number"
        ? p.jsx(Xl, { label: "Max turns", value: maxTurns, mono: true })
        : null,
      baseDate ? p.jsx(Xl, { label: "Base date", value: baseDate, mono: true }) : null,
      tagsRow,
    ],
  });

  const contextSection = context
    ? p.jsxs(p.Fragment, {
        children: [p.jsx(Es, { children: "Context" }), p.jsx(Age, { context })],
      })
    : null;

  const sessionsSection =
    sessions.length > 0
      ? p.jsxs(p.Fragment, {
          children: [
            p.jsx(Es, { children: "Sessions" }),
            p.jsx("div", {
              className: "space-y-4",
              children: sessions.map((entry, index) => {
                const session = Xr(entry);
                return session ? p.jsx(Ege, { session, index }, `s-${index}`) : null;
              }),
            }),
          ],
        })
      : null;

  // Top-level turns are only shown when the scenario has no sessions.
  const turnsSection =
    sessions.length === 0 && turns.length > 0
      ? p.jsxs(p.Fragment, {
          children: [
            p.jsx(Es, { children: "Turns" }),
            p.jsx("div", {
              className: "space-y-2",
              children: turns.map((entry, index) => {
                const turn = Xr(entry);
                return turn
                  ? p.jsx(s5, { turn, ordinal: index + 1 }, `t-${index}`)
                  : null;
              }),
            }),
          ],
        })
      : null;

  const expectationsSection = expectations
    ? p.jsxs(p.Fragment, {
        children: [
          p.jsx(Es, { children: "Expectations" }),
          p.jsx(Oge, { expectations }),
        ],
      })
    : null;

  const rawSection = showRaw
    ? p.jsxs(p.Fragment, {
        children: [
          p.jsx(T7, { className: "my-4" }),
          p.jsx(Es, { children: "Raw spec" }),
          p.jsx(io, { text: JSON.stringify(scenario, null, 2) }),
        ],
      })
    : null;

  const body = scenario
    ? p.jsxs("div", {
        className: "space-y-1",
        children: [
          description
            ? p.jsx("p", {
                className: "text-sm text-foreground leading-relaxed",
                children: description,
              })
            : null,
          p.jsx(Es, { children: "Metadata" }),
          metadataCard,
          contextSection,
          sessionsSection,
          turnsSection,
          expectationsSection,
          rawSection,
        ],
      })
    : null;

  return p.jsxs(M1, {
    open,
    onClose,
    size: "lg",
    title: titleNode,
    description: descriptionNode,
    footer,
    children: [
      loading ? p.jsx(rf, { label: "Loading scenario…" }) : null,
      error ? p.jsx(yt, { message: error }) : null,
      body,
    ],
  });
}
/** Builds the composite selection key for a scenario: `<file>::<id>`. */
function Kl(file, id) {
  return `${file}::${id}`;
}

/**
 * "Edit Preset" page.
 *
 * Loads the preset, the scenario index and the suite index in parallel via
 * `request`, lets the user edit the preset fields and the scenario selection
 * (with text / tag / priority / inclusion filters), and saves with a PUT to
 * `/api/presets/<presetId>` before navigating back to the preset page.
 */
function Cge({ presetId, request, navigate }) {
  // Loaded resources.
  const [preset, setPreset] = x.useState(null);
  const [scenarioIndex, setScenarioIndex] = x.useState(null);
  const [suiteIndex, setSuiteIndex] = x.useState(null);
  // Load / save status.
  const [error, setError] = x.useState(null);
  const [saving, setSaving] = x.useState(false);
  // Editable preset fields.
  const [name, setName] = x.useState("");
  const [description, setDescription] = x.useState("");
  const [endpoint, setEndpoint] = x.useState("");
  const [personas, setPersonas] = x.useState("");
  const [rubric, setRubric] = x.useState("");
  const [repeat, setRepeat] = x.useState(1);
  const [parallelEnabled, setParallelEnabled] = x.useState(false);
  const [parallelLimit, setParallelLimit] = x.useState(2);
  const [dryRun, setDryRun] = x.useState(false);
  const [selected, setSelected] = x.useState(new Set());
  // Scenario list filters and the scenario shown in the details dialog.
  const [query, setQuery] = x.useState("");
  const [tagFilter, setTagFilter] = x.useState("");
  const [priorityFilter, setPriorityFilter] = x.useState("");
  const [inclusionFilter, setInclusionFilter] = x.useState("all");
  const [detailTarget, setDetailTarget] = x.useState(null);

  x.useEffect(() => {
    let cancelled = false;
    Promise.all([
      request(`/api/presets/${encodeURIComponent(presetId)}`),
      request("/api/scenarios"),
      request("/api/suites"),
    ])
      .then(([presetResponse, scenarios, suites]) => {
        if (cancelled) {
          return;
        }
        const loaded = presetResponse.preset;
        setPreset(loaded);
        setScenarioIndex(scenarios);
        setSuiteIndex(suites);
        setName(loaded.name);
        setDescription(loaded.description ?? "");
        setEndpoint(loaded.endpoint);
        setPersonas(loaded.personas);
        setRubric(loaded.rubric);
        setRepeat(loaded.repeat);
        setParallelEnabled(loaded.parallel.enabled);
        setParallelLimit(loaded.parallel.limit ?? 2);
        setDryRun(loaded.dry_run);
        setSelected(new Set(loaded.selection.map((item) => Kl(item.file, item.id))));
      })
      .catch((err) => {
        if (!cancelled) {
          setError(err instanceof Error ? err.message : String(err));
        }
      });
    return () => {
      cancelled = true;
    };
  }, [request, presetId]);

  // Sorted unique tags across all scenarios.
  const availableTags = x.useMemo(() => {
    if (!scenarioIndex) {
      return [];
    }
    const unique = new Set();
    for (const scenario of scenarioIndex.scenarios) {
      for (const tag of scenario.tags) {
        unique.add(tag);
      }
    }
    return [...unique].sort();
  }, [scenarioIndex]);

  // Sorted unique (truthy) priorities across all scenarios.
  const availablePriorities = x.useMemo(() => {
    if (!scenarioIndex) {
      return [];
    }
    const unique = new Set();
    for (const scenario of scenarioIndex.scenarios) {
      if (scenario.priority) {
        unique.add(scenario.priority);
      }
    }
    return [...unique].sort();
  }, [scenarioIndex]);

  // Scenarios matching every active filter.
  const visibleScenarios = x.useMemo(() => {
    if (!scenarioIndex) {
      return [];
    }
    const needle = query.trim().toLowerCase();
    return scenarioIndex.scenarios.filter((scenario) => {
      if (needle) {
        const haystacks = [
          scenario.id,
          scenario.name,
          scenario.description ?? "",
          scenario.sourcePath,
        ];
        if (!haystacks.some((text) => text.toLowerCase().includes(needle))) {
          return false;
        }
      }
      if (tagFilter && !scenario.tags.includes(tagFilter)) {
        return false;
      }
      if (priorityFilter && scenario.priority !== priorityFilter) {
        return false;
      }
      if (inclusionFilter !== "all") {
        const isSelected = selected.has(Kl(scenario.sourcePath, scenario.id));
        if (inclusionFilter === "selected" && !isSelected) {
          return false;
        }
        if (inclusionFilter === "unselected" && isSelected) {
          return false;
        }
      }
      return true;
    });
  }, [scenarioIndex, query, tagFilter, priorityFilter, inclusionFilter, selected]);

  const suitesOfSchema = (schema) =>
    suiteIndex?.suites.filter((suite) => suite.schema === schema) ?? [];
  const endpointSuites = x.useMemo(() => suitesOfSchema("endpoints"), [suiteIndex]);
  const personaSuites = x.useMemo(() => suitesOfSchema("personas"), [suiteIndex]);
  const rubricSuites = x.useMemo(() => suitesOfSchema("rubrics"), [suiteIndex]);

  const toggleScenario = (file, id) => {
    const key = Kl(file, id);
    const next = new Set(selected);
    if (next.has(key)) {
      next.delete(key);
    } else {
      next.add(key);
    }
    setSelected(next);
  };

  const selectShown = () => {
    const next = new Set(selected);
    for (const scenario of visibleScenarios) {
      next.add(Kl(scenario.sourcePath, scenario.id));
    }
    setSelected(next);
  };

  const clearShown = () => {
    const next = new Set(selected);
    for (const scenario of visibleScenarios) {
      next.delete(Kl(scenario.sourcePath, scenario.id));
    }
    setSelected(next);
  };

  const save = async () => {
    setSaving(true);
    setError(null);
    try {
      if (!scenarioIndex) {
        throw new Error("Scenarios not loaded.");
      }
      // Only selections that still exist in the scenario index are saved,
      // in scenario-index order.
      const selection = scenarioIndex.scenarios
        .filter((scenario) => selected.has(Kl(scenario.sourcePath, scenario.id)))
        .map((scenario) => ({ file: scenario.sourcePath, id: scenario.id }));
      if (selection.length === 0) {
        throw new Error("Select at least one scenario.");
      }
      await request(
        `/api/presets/${encodeURIComponent(presetId)}`,
        Si("PUT", {
          name: name.trim(),
          description: description.trim() || null,
          endpoint,
          personas,
          rubric,
          selection,
          parallel: { enabled: parallelEnabled, limit: parallelEnabled ? parallelLimit : null },
          repeat,
          dry_run: dryRun,
        }),
      );
      navigate(`/presets/${encodeURIComponent(presetId)}`);
    } catch (err) {
      setError(err instanceof Error ? err.message : String(err));
    } finally {
      setSaving(false);
    }
  };

  if (error && !preset) {
    return p.jsx(yt, { message: error });
  }
  if (!preset || !scenarioIndex || !suiteIndex) {
    return p.jsx(rf, {});
  }

  const suiteOption = (suite) => ({ value: suite.relativePath, label: suite.relativePath });
  const plainOption = (value) => ({ value, label: value });

  const header = p.jsx(On, {
    eyebrow: "Edit Preset",
    title: name || preset.name,
    meta: `${selected.size} scenario${selected.size === 1 ? "" : "s"} selected`,
    actions: p.jsxs(p.Fragment, {
      children: [
        p.jsx(Le, {
          variant: "secondary",
          onClick: () => navigate(`/presets/${encodeURIComponent(presetId)}`),
          children: "Cancel",
        }),
        p.jsx(Le, {
          onClick: () => void save(),
          disabled: saving,
          children: saving ? "Saving…" : "Save changes",
        }),
      ],
    }),
  });

  const identityCard = p.jsxs(rt, {
    className: "p-4 flex flex-col gap-3",
    children: [
      p.jsxs("div", {
        className: "grid grid-cols-1 sm:grid-cols-2 gap-3",
        children: [
          p.jsx(Xe, {
            label: "Name",
            children: p.jsx(Kt, {
              value: name,
              onChange: (event) => setName(event.currentTarget.value),
              required: true,
            }),
          }),
          p.jsx(Xe, {
            label: "Description",
            children: p.jsx(Kt, {
              value: description,
              onChange: (event) => setDescription(event.currentTarget.value),
              placeholder: "Short summary",
            }),
          }),
        ],
      }),
      p.jsxs("div", {
        className: "grid grid-cols-1 sm:grid-cols-3 gap-3",
        children: [
          p.jsx(Xe, {
            label: "Endpoint",
            children: p.jsx(sr, {
              value: endpoint,
              onValueChange: setEndpoint,
              options: endpointSuites.map(suiteOption),
              emptyLabel: "No endpoint suites",
            }),
          }),
          p.jsx(Xe, {
            label: "Personas",
            children: p.jsx(sr, {
              value: personas,
              onValueChange: setPersonas,
              options: personaSuites.map(suiteOption),
              emptyLabel: "No persona suites",
            }),
          }),
          p.jsx(Xe, {
            label: "Rubric",
            children: p.jsx(sr, {
              value: rubric,
              onValueChange: setRubric,
              options: rubricSuites.map(suiteOption),
              emptyLabel: "No rubric suites",
            }),
          }),
        ],
      }),
    ],
  });

  const executionCard = p.jsxs(rt, {
    className: "p-4 flex flex-col gap-3",
    children: [
      p.jsx(Xe, {
        label: "Repeat",
        children: p.jsx(Kt, {
          type: "number",
          min: 1,
          value: repeat,
          onChange: (event) => setRepeat(Number(event.currentTarget.value)),
        }),
      }),
      p.jsx(Xe, {
        label: "Parallel",
        hint: "Run multiple scenarios concurrently. Limit caps concurrency (2–4 is typical; higher = faster but more LLM cost spikes).",
        children: p.jsxs("div", {
          className: "flex items-center gap-3",
          children: [
            p.jsx(bi, {
              checked: parallelEnabled,
              onChange: setParallelEnabled,
              label: "Enabled",
            }),
            p.jsx(Kt, {
              type: "number",
              min: 1,
              value: parallelLimit,
              disabled: !parallelEnabled,
              onChange: (event) => setParallelLimit(Number(event.currentTarget.value)),
              className: "w-20",
            }),
          ],
        }),
      }),
      p.jsx(Xe, {
        label: "Mode",
        hint: "Dry run records run + scenario rows but skips the live adapter, judge, and scorers. Use to validate config / preset shape without spending LLM tokens.",
        children: p.jsx(bi, {
          checked: dryRun,
          onChange: setDryRun,
          label: "Dry run by default",
        }),
      }),
    ],
  });

  const filterBar = p.jsxs("div", {
    className: "p-3 border-b border-border flex flex-wrap items-center gap-2",
    children: [
      p.jsx(Kt, {
        placeholder: "Filter by id, name, or path…",
        value: query,
        onChange: (event) => setQuery(event.currentTarget.value),
        className: "max-w-xs",
      }),
      // The select uses sentinel values because "" stands for "no filter".
      p.jsx(sr, {
        value: tagFilter || "__all_tags__",
        onValueChange: (value) => setTagFilter(value === "__all_tags__" ? "" : value),
        className: "max-w-xs",
        options: [{ value: "__all_tags__", label: "All tags" }, ...availableTags.map(plainOption)],
      }),
      p.jsx(sr, {
        value: priorityFilter || "__all_priorities__",
        onValueChange: (value) =>
          setPriorityFilter(value === "__all_priorities__" ? "" : value),
        className: "max-w-xs",
        options: [
          { value: "__all_priorities__", label: "All priorities" },
          ...availablePriorities.map(plainOption),
        ],
      }),
      p.jsx(sr, {
        value: inclusionFilter,
        onValueChange: (value) => setInclusionFilter(value),
        className: "max-w-xs",
        options: [
          { value: "all", label: "All scenarios" },
          { value: "selected", label: "Included only" },
          { value: "unselected", label: "Not included" },
        ],
      }),
      p.jsx("div", { className: "flex-1" }),
      p.jsxs("span", {
        className: "text-xs text-muted-foreground mr-2",
        children: [visibleScenarios.length, " matching · ", selected.size, " selected"],
      }),
      p.jsx(Le, { variant: "secondary", size: "sm", onClick: selectShown, children: "Select shown" }),
      p.jsx(Le, { variant: "ghost", size: "sm", onClick: clearShown, children: "Clear shown" }),
    ],
  });

  const renderScenarioRow = (scenario) => {
    const key = Kl(scenario.sourcePath, scenario.id);
    const isSelected = selected.has(key);

    const titleLine = p.jsxs("div", {
      className: "flex items-baseline gap-2 flex-wrap",
      children: [
        p.jsx("span", {
          className: "text-sm font-medium text-foreground",
          children: scenario.name || scenario.id,
        }),
        p.jsx("span", {
          className: "font-mono text-[11px] text-muted-foreground",
          children: scenario.id,
        }),
        scenario.priority ? p.jsx(Pt, { tone: "info", children: scenario.priority }) : null,
      ],
    });

    const tagLine = p.jsxs("div", {
      className: "flex items-center gap-1.5 mt-1 flex-wrap",
      children: [
        // At most five tags are shown per row.
        scenario.tags.slice(0, 5).map((tag) => p.jsx(Pt, { children: tag }, tag)),
        p.jsx("span", {
          className: "text-[10px] text-muted-foreground/70 font-mono",
          children: scenario.sourcePath,
        }),
      ],
    });

    const selectable = p.jsxs("label", {
      className: "flex items-start gap-3 flex-1 min-w-0 cursor-pointer",
      children: [
        p.jsx("input", {
          type: "checkbox",
          checked: isSelected,
          onChange: () => toggleScenario(scenario.sourcePath, scenario.id),
          className: "size-4 mt-0.5 accent-primary shrink-0",
        }),
        p.jsxs("div", {
          className: "flex-1 min-w-0",
          children: [
            titleLine,
            scenario.description
              ? p.jsx("div", {
                  className: "text-xs text-muted-foreground mt-0.5 line-clamp-2",
                  children: scenario.description,
                })
              : null,
            tagLine,
          ],
        }),
      ],
    });

    const detailsButton = p.jsx(Le, {
      type: "button",
      variant: "ghost",
      size: "sm",
      className: "shrink-0 self-start",
      onClick: () =>
        setDetailTarget({
          file: scenario.sourcePath,
          id: scenario.id,
          name: scenario.name,
          description: scenario.description,
          tags: scenario.tags,
          priority: scenario.priority,
        }),
      children: "Details",
    });

    return p.jsxs(
      "div",
      {
        className: `flex items-start gap-3 px-3 py-2.5 hover:bg-secondary ${isSelected ? "bg-primary/5" : ""}`,
        children: [selectable, detailsButton],
      },
      key,
    );
  };

  const scenarioList = p.jsx("div", {
    className: "max-h-[480px] overflow-y-auto divide-y divide-border",
    children:
      visibleScenarios.length === 0
        ? p.jsx("div", {
            className: "p-6 text-center text-muted-foreground text-sm",
            children: "No scenarios match.",
          })
        : visibleScenarios.map(renderScenarioRow),
  });

  return p.jsxs(p.Fragment, {
    children: [
      header,
      error ? p.jsx(yt, { message: error }) : null,
      p.jsxs("div", {
        className: "grid grid-cols-1 lg:grid-cols-[minmax(0,1fr)_320px] gap-4 mb-6",
        children: [identityCard, executionCard],
      }),
      p.jsxs(rt, { className: "overflow-hidden", children: [filterBar, scenarioList] }),
      p.jsx(u5, {
        open: detailTarget != null,
        target: detailTarget,
        request,
        onClose: () => setDetailTarget(null),
      }),
    ],
  });
}
/**
 * Formats a timestamp for display in the user's locale.
 *
 * @param {string|number|Date|null|undefined} e - value accepted by `new Date`.
 * @returns "—" for a missing value, the locale string for a valid date, and
 *   the input itself when it cannot be parsed.
 */
function _ge(e) {
  if (!e) {
    return "—";
  }
  try {
    const parsed = new Date(e);
    // An unparseable input does not throw: it yields an invalid Date whose
    // toLocaleString() is the literal "Invalid Date". Detect that explicitly
    // so the raw value is shown instead.
    if (Number.isNaN(parsed.getTime())) {
      return e;
    }
    return parsed.toLocaleString();
  } catch {
    return e;
  }
}

/**
 * Elapsed seconds between two timestamps.
 *
 * @param {string} e - start timestamp (parsed with `Date.parse`).
 * @param {string|null|undefined} t - end timestamp; "now" is used when falsy.
 * @returns {number|null} non-negative seconds, or `null` when either
 *   timestamp cannot be parsed.
 */
function kge(e, t) {
  const startMs = Date.parse(e);
  if (Number.isNaN(startMs)) {
    return null;
  }
  const endMs = t ? Date.parse(t) : Date.now();
  if (Number.isNaN(endMs)) {
    return null;
  }
  return Math.max(0, (endMs - startMs) / 1e3);
}

/**
 * Formats a duration in seconds as "Ns" (under a minute) or "Mm Ss".
 *
 * @param {number|null|undefined} e - duration in seconds.
 * @returns {string} "—" for a missing or non-finite duration.
 */
function Nge(e) {
  if (e == null || !Number.isFinite(e)) {
    return "—";
  }
  // Round the total once, then split. Rounding minutes and seconds
  // independently produced "1m 60s" for 119.6 and "60s" for 59.6.
  const totalSeconds = Math.round(e);
  if (totalSeconds < 60) {
    return `${totalSeconds}s`;
  }
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}m ${seconds}s`;
}
/**
 * Run history table for a preset, newest run first, with checkboxes for
 * picking 2–10 runs to compare and a shortcut for comparing the two most
 * recent runs. Shows an empty state when there are no runs.
 */
function Tge({ runs, navigate, presetName }) {
  const [picked, setPicked] = x.useState(new Set());

  const sortedRuns = x.useMemo(
    () =>
      [...runs].sort(
        (a, b) => Date.parse(b.startedAt) - Date.parse(a.startedAt),
      ),
    [runs],
  );

  const togglePicked = (runId) => {
    const next = new Set(picked);
    if (next.has(runId)) {
      next.delete(runId);
    } else {
      next.add(runId);
    }
    setPicked(next);
  };

  const toCompareUrl = (list) =>
    `/compare?run_ids=${list.map((run) => encodeURIComponent(run.runId)).join(",")}`;

  const comparePicked = () => {
    if (picked.size < 2) {
      return;
    }
    const chosen = sortedRuns.filter((run) => picked.has(run.runId)).slice(0, 10);
    navigate(toCompareUrl(chosen));
  };

  const compareLatestTwo = () => {
    if (sortedRuns.length < 2) {
      return;
    }
    navigate(toCompareUrl(sortedRuns.slice(0, 2)));
  };

  if (sortedRuns.length === 0) {
    return p.jsx(Jp, {
      title: "No runs yet",
      description: `Launch ${presetName} to see results here.`,
    });
  }

  const tooFew = picked.size < 2;
  const tooMany = picked.size > 10;

  const summary =
    picked.size === 0
      ? `${sortedRuns.length} run${sortedRuns.length === 1 ? "" : "s"} · select 2–10 to compare`
      : `${picked.size} selected${tooMany ? " (max 10)" : ""}`;

  const toolbar = p.jsxs("div", {
    className: "flex flex-wrap items-center gap-2 mb-3",
    children: [
      p.jsx("div", { className: "text-sm text-muted-foreground", children: summary }),
      p.jsx("div", { className: "flex-1" }),
      sortedRuns.length >= 2
        ? p.jsx(Le, {
            variant: "secondary",
            size: "sm",
            onClick: compareLatestTwo,
            children: "Compare latest two",
          })
        : null,
      p.jsxs(Le, {
        size: "sm",
        onClick: comparePicked,
        disabled: tooFew || tooMany,
        children: ["Compare ", picked.size > 0 ? `(${Math.min(picked.size, 10)})` : ""],
      }),
    ],
  });

  const headCell = (label, className = "px-3 py-2") =>
    p.jsx("th", { className, children: label });

  const tableHead = p.jsx("thead", {
    className: "bg-secondary",
    children: p.jsxs("tr", {
      className: "text-left text-muted-foreground text-xs uppercase tracking-wider",
      children: [
        p.jsx("th", { className: "px-3 py-2 w-8" }),
        headCell("Name"),
        headCell("Status"),
        headCell("Started"),
        headCell("Duration"),
        headCell("Pass / Total", "px-3 py-2 text-right"),
        headCell("Notes"),
      ],
    }),
  });

  const renderRow = (run) => {
    const isPicked = picked.has(run.runId);

    const nameCell = p.jsxs("td", {
      className: "px-3 py-2 align-top",
      children: [
        p.jsx("a", {
          href: `/runs/${encodeURIComponent(run.runId)}`,
          className: "text-foreground hover:text-primary block",
          // Labelled runs show the label; unlabelled ones a shortened id.
          children: run.label
            ? p.jsx("span", { className: "font-medium", children: run.label })
            : p.jsxs("span", {
                className: "font-mono text-xs text-muted-foreground",
                children: [run.runId.slice(0, 12), "…"],
              }),
        }),
        run.label
          ? p.jsx("span", {
              className: "font-mono text-[10px] text-muted-foreground/70",
              children: run.runId.slice(0, 12),
            })
          : null,
      ],
    });

    return p.jsxs(
      "tr",
      {
        className: isPicked ? "bg-primary/5 hover:bg-primary/10" : "hover:bg-secondary",
        children: [
          p.jsx("td", {
            className: "px-3 py-2 align-top",
            children: p.jsx(bi, {
              checked: isPicked,
              onChange: () => togglePicked(run.runId),
            }),
          }),
          nameCell,
          p.jsx("td", {
            className: "px-3 py-2 align-top",
            children: p.jsx(Qp, { run }),
          }),
          p.jsx("td", {
            className: "px-3 py-2 align-top text-muted-foreground",
            children: _ge(run.startedAt),
          }),
          p.jsx("td", {
            className: "px-3 py-2 align-top text-muted-foreground",
            children: Nge(kge(run.startedAt, run.completedAt)),
          }),
          p.jsxs("td", {
            className: "px-3 py-2 align-top text-right font-mono",
            children: [
              run.aggregateCounts.scenarioPassedCount,
              "/",
              run.aggregateCounts.scenarioTotal,
            ],
          }),
          p.jsx("td", {
            className: "px-3 py-2 align-top text-muted-foreground max-w-[280px] truncate",
            children: run.notes ?? "—",
          }),
        ],
      },
      run.runId,
    );
  };

  const table = p.jsxs("table", {
    className: "w-full text-sm",
    children: [
      tableHead,
      p.jsx("tbody", {
        className: "divide-y divide-border",
        children: sortedRuns.map(renderRow),
      }),
    ],
  });

  return p.jsxs(p.Fragment, {
    children: [
      toolbar,
      p.jsx(rt, {
        className: "overflow-hidden",
        children: p.jsx("div", { className: "overflow-x-auto", children: table }),
      }),
    ],
  });
}
/**
 * "Run <preset>" launch dialog.
 *
 * Seeds its form from `options.defaults` each time it opens, lets the user
 * override endpoint / base URL / personas / rubric / parallelism / repeat /
 * dry-run and add a label and notes, then POSTs to
 * `/api/presets/<presetId>/runs` and reports the new run id via `onLaunched`.
 * Only values that differ from the preset defaults are sent as overrides.
 * Returns `null` when no `options` are supplied.
 */
function c5({ open, options, request, onClose, onLaunched, suites }) {
  const [endpoint, setEndpoint] = x.useState("");
  const [baseUrl, setBaseUrl] = x.useState("");
  const [personas, setPersonas] = x.useState("");
  const [rubric, setRubric] = x.useState("");
  const [parallelEnabled, setParallelEnabled] = x.useState(false);
  const [parallelLimit, setParallelLimit] = x.useState(2);
  const [repeat, setRepeat] = x.useState(1);
  const [dryRun, setDryRun] = x.useState(false);
  const [label, setLabel] = x.useState("");
  const [notes, setNotes] = x.useState("");
  const [error, setError] = x.useState(null);
  const [submitting, setSubmitting] = x.useState(false);

  // Reset the form to the preset defaults whenever the dialog (re)opens.
  x.useEffect(() => {
    if (!open || !options) {
      return;
    }
    const { defaults } = options;
    setEndpoint(defaults.endpoint);
    setBaseUrl("");
    setPersonas(defaults.personas);
    setRubric(defaults.rubric);
    setParallelEnabled(defaults.parallelEnabled);
    setParallelLimit(defaults.parallelLimit ?? 2);
    setRepeat(defaults.repeat);
    setDryRun(defaults.dryRun);
    setLabel("");
    setNotes("");
    setError(null);
  }, [open, options]);

  const suitesOfSchema = (schema) =>
    suites?.suites.filter((suite) => suite.schema === schema) ?? [];
  const endpointSuites = x.useMemo(() => suitesOfSchema("endpoints"), [suites]);
  const personaSuites = x.useMemo(() => suitesOfSchema("personas"), [suites]);
  const rubricSuites = x.useMemo(() => suitesOfSchema("rubrics"), [suites]);

  // Adapter kind inferred from a substring of the endpoint path.
  const adapterKind = x.useMemo(() => {
    const lowered = endpoint.toLowerCase();
    for (const kind of ["autogpt", "openclaw", "opencode"]) {
      if (lowered.includes(kind)) {
        return kind;
      }
    }
    return "custom";
  }, [endpoint]);

  if (!options) {
    return null;
  }

  const handleSubmit = async (event) => {
    event.preventDefault();
    setSubmitting(true);
    setError(null);
    try {
      const overrides = {
        parallel: {
          enabled: parallelEnabled,
          limit: parallelEnabled ? parallelLimit : undefined,
        },
        repeat,
        dry_run: dryRun,
      };
      if (endpoint && endpoint !== options.defaults.endpoint) {
        overrides.endpoint = endpoint;
      }
      const trimmedBaseUrl = baseUrl.trim();
      if (trimmedBaseUrl) {
        overrides.base_url = trimmedBaseUrl;
      }
      if (personas && personas !== options.defaults.personas) {
        overrides.personas = personas;
      }
      if (rubric && rubric !== options.defaults.rubric) {
        overrides.rubric = rubric;
      }

      const payload = { overrides };
      if (label.trim()) {
        payload.label = label.trim();
      }
      if (notes.trim()) {
        payload.notes = notes.trim();
      }

      const response = await request(
        `/api/presets/${encodeURIComponent(options.presetId)}/runs`,
        Si("POST", payload),
      );
      onLaunched(response.run_id);
    } catch (err) {
      setError(err instanceof Error ? err.message : String(err));
    } finally {
      setSubmitting(false);
    }
  };

  const suiteOption = (suite) => ({ value: suite.relativePath, label: suite.relativePath });

  // Keep the current endpoint selectable even if it is not a known suite.
  const endpointOptions = endpointSuites.map(suiteOption);
  if (endpoint && !endpointSuites.find((suite) => suite.relativePath === endpoint)) {
    endpointOptions.push({ value: endpoint, label: endpoint });
  }

  const footer = p.jsxs(p.Fragment, {
    children: [
      p.jsx(Le, { variant: "ghost", onClick: onClose, disabled: submitting, children: "Cancel" }),
      p.jsx(Le, {
        type: "submit",
        form: "run-launch-form",
        disabled: submitting,
        children: submitting ? "Starting…" : "Start run",
      }),
    ],
  });

  const endpointSection = p.jsxs("div", {
    children: [
      p.jsx("div", {
        className: "text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",
        children: "Endpoint",
      }),
      p.jsxs("div", {
        className: "grid grid-cols-1 sm:grid-cols-[2fr_auto] gap-2 items-center",
        children: [
          p.jsx(sr, {
            value: endpoint,
            onValueChange: setEndpoint,
            options: endpointOptions,
            emptyLabel: "No endpoint suites found",
          }),
          p.jsx(Pt, {
            tone: adapterKind === "custom" ? "warn" : "info",
            children: adapterKind,
          }),
        ],
      }),
      p.jsx("div", {
        className: "text-xs text-muted-foreground mt-1",
        children:
          "Pick a different endpoint YAML to retarget the run (e.g. an autogpt staging endpoint vs. an openclaw gateway).",
      }),
    ],
  });

  const baseUrlField = p.jsx(Xe, {
    label: "Base URL override",
    hint: "Replaces connection.base_url (HTTP) or connection.url (WebSocket) from the endpoint YAML for this run only. Leave blank to use the YAML default.",
    children: p.jsx(Kt, {
      value: baseUrl,
      onChange: (event) => setBaseUrl(event.currentTarget.value),
      placeholder: "e.g. https://staging.autogpt.example or ws://10.0.0.5:18789",
    }),
  });

  const suiteOverrides = p.jsxs("details", {
    className: "rounded-md border border-border bg-secondary p-3",
    children: [
      p.jsx("summary", {
        className:
          "cursor-pointer text-sm font-medium text-muted-foreground hover:text-foreground",
        children: "Override personas / rubric",
      }),
      p.jsxs("div", {
        className: "grid grid-cols-1 sm:grid-cols-2 gap-3 mt-3",
        children: [
          p.jsx(Xe, {
            label: "Personas",
            children: p.jsx(sr, {
              value: personas,
              onValueChange: setPersonas,
              options: personaSuites.map(suiteOption),
              emptyLabel: "No persona suites",
            }),
          }),
          p.jsx(Xe, {
            label: "Rubric",
            children: p.jsx(sr, {
              value: rubric,
              onValueChange: setRubric,
              options: rubricSuites.map(suiteOption),
              emptyLabel: "No rubric suites",
            }),
          }),
        ],
      }),
    ],
  });

  const executionRow = p.jsxs("div", {
    className: "grid grid-cols-1 sm:grid-cols-3 gap-3",
    children: [
      p.jsx(Xe, {
        label: "Parallel",
        hint: "Run multiple scenarios concurrently. Scenarios still complete in order, but several at a time.",
        children: p.jsx("div", {
          className: "flex items-center gap-2",
          children: p.jsx(bi, {
            checked: parallelEnabled,
            onChange: setParallelEnabled,
            label: "Enabled",
          }),
        }),
      }),
      p.jsx(Xe, {
        label: "Parallel limit",
        hint: "Max concurrent scenarios when parallel is on. Higher = faster but more LLM cost spikes; 2–4 is typical.",
        children: p.jsx(Kt, {
          type: "number",
          min: 1,
          value: parallelLimit,
          disabled: !parallelEnabled,
          onChange: (event) => setParallelLimit(Number(event.currentTarget.value)),
        }),
      }),
      p.jsx(Xe, {
        label: "Repeat",
        children: p.jsx(Kt, {
          type: "number",
          min: 1,
          value: repeat,
          onChange: (event) => setRepeat(Number(event.currentTarget.value)),
        }),
      }),
    ],
  });

  const labelRow = p.jsxs("div", {
    className: "grid grid-cols-1 sm:grid-cols-2 gap-3",
    children: [
      p.jsx(Xe, {
        label: "Run name",
        hint: "Shows in the run list — useful for comparing runs later.",
        children: p.jsx(Kt, {
          value: label,
          onChange: (event) => setLabel(event.currentTarget.value),
          placeholder: "e.g. autogpt-staging baseline",
          maxLength: 200,
        }),
      }),
      p.jsx(Xe, {
        label: "Mode",
        hint: "Dry run records run + scenario rows but skips the live adapter, judge, and scorers. Use to validate config / preset shape without spending LLM tokens.",
        children: p.jsx(bi, { checked: dryRun, onChange: setDryRun, label: "Dry run" }),
      }),
    ],
  });

  const notesField = p.jsx(Xe, {
    label: "Notes",
    hint: "Optional context — diff vs. last run, hypotheses, etc.",
    children: p.jsx(eD, {
      value: notes,
      onChange: (event) => setNotes(event.currentTarget.value),
      rows: 3,
      maxLength: 4000,
      placeholder: "Why this run?",
    }),
  });

  return p.jsxs(M1, {
    open,
    onClose,
    title: `Run ${options.presetName}`,
    size: "lg",
    footer,
    children: [
      error ? p.jsx(yt, { message: error }) : null,
      p.jsxs("form", {
        id: "run-launch-form",
        onSubmit: handleSubmit,
        className: "flex flex-col gap-4",
        children: [
          endpointSection,
          baseUrlField,
          suiteOverrides,
          executionRow,
          labelRow,
          notesField,
        ],
      }),
    ],
  });
}
/**
 * Card for renaming a run and editing its notes in place.
 *
 * Each save PATCHes `/api/runs/<runId>` with either `{ label }` or
 * `{ notes }` (blank input is sent as `null`) and hands the updated run to
 * `onUpdated`. A failed save leaves the editor open and shows the error.
 */
function Pge({ run, request, onUpdated }) {
  const [editingLabel, setEditingLabel] = x.useState(false);
  const [labelDraft, setLabelDraft] = x.useState(run.label ?? "");
  const [editingNotes, setEditingNotes] = x.useState(false);
  const [notesDraft, setNotesDraft] = x.useState(run.notes ?? "");
  const [saving, setSaving] = x.useState(false);
  const [error, setError] = x.useState(null);

  // Re-sync the drafts when the run's stored values change.
  x.useEffect(() => {
    setLabelDraft(run.label ?? "");
    setNotesDraft(run.notes ?? "");
  }, [run.label, run.notes]);

  const patchRun = async (changes) => {
    setSaving(true);
    setError(null);
    try {
      const response = await request(
        `/api/runs/${encodeURIComponent(run.runId)}`,
        Si("PATCH", changes),
      );
      onUpdated(response.run);
    } catch (err) {
      setError(err instanceof Error ? err.message : String(err));
      // Re-thrown so the callers below keep their editor open.
      throw err;
    } finally {
      setSaving(false);
    }
  };

  const saveLabel = async () => {
    try {
      await patchRun({ label: labelDraft.trim() ? labelDraft.trim() : null });
      setEditingLabel(false);
    } catch {
      // Already surfaced through `error` by patchRun.
    }
  };

  const saveNotes = async () => {
    try {
      await patchRun({ notes: notesDraft.trim() ? notesDraft.trim() : null });
      setEditingNotes(false);
    } catch {
      // Already surfaced through `error` by patchRun.
    }
  };

  const cancelLabelEdit = () => {
    setLabelDraft(run.label ?? "");
    setEditingLabel(false);
  };

  const cancelNotesEdit = () => {
    setNotesDraft(run.notes ?? "");
    setEditingNotes(false);
  };

  const captionClass =
    "text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1";

  const labelEditor = p.jsxs("div", {
    className: "flex items-center gap-2",
    children: [
      p.jsx(Kt, {
        value: labelDraft,
        onChange: (event) => setLabelDraft(event.currentTarget.value),
        maxLength: 200,
        autoFocus: true,
        placeholder: "e.g. autogpt staging baseline",
        onKeyDown: (event) => {
          if (event.key === "Enter") {
            saveLabel();
          }
          if (event.key === "Escape") {
            cancelLabelEdit();
          }
        },
      }),
      p.jsx(Le, { onClick: () => void saveLabel(), disabled: saving, size: "sm", children: "Save" }),
      p.jsx(Le, {
        variant: "ghost",
        size: "sm",
        onClick: cancelLabelEdit,
        disabled: saving,
        children: "Cancel",
      }),
    ],
  });

  const labelDisplay = p.jsxs("div", {
    className: "flex items-center gap-2",
    children: [
      p.jsx("span", {
        className: "text-base text-foreground",
        children:
          run.label ??
          p.jsx("span", {
            className: "text-muted-foreground/70 italic",
            children: "Untitled run — click rename to add a name",
          }),
      }),
      p.jsx(Le, {
        variant: "ghost",
        size: "sm",
        onClick: () => setEditingLabel(true),
        children: "Rename",
      }),
    ],
  });

  const notesHeader = p.jsxs("div", {
    className: `${captionClass} flex items-center justify-between`,
    children: [
      p.jsx("span", { children: "Notes" }),
      !editingNotes &&
        p.jsx(Le, {
          variant: "ghost",
          size: "sm",
          onClick: () => setEditingNotes(true),
          children: run.notes ? "Edit" : "Add notes",
        }),
    ],
  });

  const notesEditor = p.jsxs("div", {
    className: "flex flex-col gap-2",
    children: [
      p.jsx(eD, {
        value: notesDraft,
        onChange: (event) => setNotesDraft(event.currentTarget.value),
        rows: 4,
        maxLength: 4000,
        autoFocus: true,
        placeholder: "Hypotheses, observations, comparison context…",
      }),
      p.jsxs("div", {
        className: "flex items-center gap-2 justify-end",
        children: [
          p.jsx(Le, {
            variant: "ghost",
            size: "sm",
            onClick: cancelNotesEdit,
            disabled: saving,
            children: "Cancel",
          }),
          p.jsx(Le, {
            onClick: () => void saveNotes(),
            disabled: saving,
            size: "sm",
            children: "Save notes",
          }),
        ],
      }),
    ],
  });

  let notesBody;
  if (editingNotes) {
    notesBody = notesEditor;
  } else if (run.notes) {
    notesBody = p.jsx("div", {
      className: "text-sm text-foreground whitespace-pre-wrap",
      children: run.notes,
    });
  } else {
    notesBody = p.jsx("div", {
      className: "text-sm text-muted-foreground/70 italic",
      children: "No notes yet.",
    });
  }

  return p.jsxs(rt, {
    className: "p-4 mb-4",
    children: [
      error ? p.jsx(yt, { message: error }) : null,
      p.jsxs("div", {
        className: "mb-4",
        children: [
          p.jsx("div", { className: captionClass, children: "Run name" }),
          editingLabel ? labelEditor : labelDisplay,
        ],
      }),
      p.jsxs("div", { children: [notesHeader, notesBody] }),
    ],
  });
}
/**
 * Badge `{ label, className }` for a correlation coefficient: green at
 * r >= 0.7, amber at r >= 0.3, red below that, and a neutral "—" badge when
 * the value is null or not a finite number.
 */
function Mge(e) {
  if (e === null || !Number.isFinite(e)) {
    return { label: "—", className: "bg-secondary text-muted-foreground" };
  }
  const label = `r=${e.toFixed(2)}`;
  let className;
  if (e >= 0.7) {
    className =
      "bg-green-500/15 text-green-600 dark:text-green-400 border border-green-500/30";
  } else if (e >= 0.3) {
    className =
      "bg-amber-500/15 text-amber-600 dark:text-amber-400 border border-amber-500/30";
  } else {
    className = "bg-red-500/15 text-red-600 dark:text-red-400 border border-red-500/30";
  }
  return { label, className };
}

/**
 * Expands a rubric scale into its selectable levels `[{ value, label }]`.
 *
 * Explicit `labels` (keyed by numeric value) win and are returned sorted by
 * value; otherwise a "binary" scale yields 0 and 1, and any other scale
 * yields 1..`points` (5 when `points` is not set), labelled with the number.
 */
function l1(e) {
  const labels = e.labels ?? {};
  const labelled = [];
  for (const key of Object.keys(labels)) {
    const value = Number(key);
    // Keys that are not numbers are ignored.
    if (Number.isFinite(value)) {
      labelled.push({ value, label: labels[key] ?? "" });
    }
  }
  if (labelled.length > 0) {
    labelled.sort((a, b) => a.value - b.value);
    return labelled;
  }

  if (e.type === "binary") {
    return [
      { value: 0, label: "0" },
      { value: 1, label: "1" },
    ];
  }

  const points = e.points ?? 5;
  return Array.from({ length: points }, (_unused, index) => {
    const level = index + 1;
    return { value: level, label: String(level) };
  });
}
/**
 * Human-scoring landing page: fetches `/api/human-scoring/rubrics` and lists
 * every rubric with its dimensions. Each dimension links to
 * `/score/<rubricId>/<dimensionId>` and shows a correlation badge plus the
 * number of scenarios still unscored.
 */
function Rge({ request, navigate }) {
  const [data, setData] = x.useState(null);
  const [error, setError] = x.useState(null);

  x.useEffect(() => {
    let cancelled = false;
    request("/api/human-scoring/rubrics")
      .then((response) => {
        if (!cancelled) {
          setData(response);
          setError(null);
        }
      })
      .catch((err) => {
        if (!cancelled) {
          setError(err instanceof Error ? err.message : String(err));
        }
      });
    return () => {
      cancelled = true;
    };
  }, [request]);

  if (error) {
    return p.jsx(yt, { message: error });
  }
  if (!data) {
    return p.jsx(Ci, { withMeta: true });
  }

  if (data.rubrics.length === 0) {
    return p.jsxs(p.Fragment, {
      children: [
        p.jsx(On, {
          eyebrow: "Human scoring",
          title: "Score completed runs",
          meta: "No completed runs found yet.",
        }),
        p.jsx(rt, {
          className: "p-6",
          children: p.jsx("div", {
            className: "text-sm text-muted-foreground",
            children: "Run an evaluation, then return here to score it by hand.",
          }),
        }),
      ],
    });
  }

  const correlationTooltip = (dimension) => {
    if (dimension.correlation !== null) {
      return `Pearson correlation between ${dimension.pairedCount} paired human and judge scores.`;
    }
    if (dimension.pairedCount === 0) {
      return "No human scores yet.";
    }
    return `Only ${dimension.pairedCount} paired score${dimension.pairedCount === 1 ? "" : "s"} — need 2+ for correlation.`;
  };

  const renderDimension = (rubric, dimension) => {
    const fullyScored = dimension.unscored === 0;
    const href = `/score/${encodeURIComponent(rubric.rubricId)}/${encodeURIComponent(dimension.id)}`;
    const badge = Mge(dimension.correlation);
    const tooltip = correlationTooltip(dimension);

    return p.jsxs(
      "a",
      {
        href,
        className: `flex items-center justify-between gap-3 px-3 py-2.5 rounded-md border border-border bg-secondary/40 hover:border-primary hover:bg-secondary no-underline transition-colors ${fullyScored ? "opacity-50" : ""}`,
        // Route through the in-app navigator rather than a full page load.
        onClick: (event) => {
          event.preventDefault();
          navigate(href);
        },
        children: [
          p.jsxs("div", {
            children: [
              p.jsx("div", {
                className: "text-sm font-medium text-foreground",
                children: dimension.name,
              }),
              p.jsxs("div", {
                className: "text-xs text-muted-foreground",
                children: ["weight ", dimension.weight, " · ", dimension.scale.type],
              }),
            ],
          }),
          p.jsxs("div", {
            className: "flex items-center gap-2 whitespace-nowrap",
            children: [
              p.jsx("span", {
                title: tooltip,
                className: `px-2 py-0.5 rounded-full font-mono text-[11px] tabular-nums ${badge.className}`,
                children: badge.label,
              }),
              p.jsxs("span", {
                className: "font-mono text-xs text-foreground",
                children: [
                  dimension.unscored,
                  " ",
                  p.jsxs("span", {
                    className: "text-muted-foreground",
                    children: ["/ ", rubric.totalScenarios, " unscored"],
                  }),
                ],
              }),
            ],
          }),
        ],
      },
      dimension.id,
    );
  };

  const renderRubric = (rubric) =>
    p.jsxs(
      rt,
      {
        className: "p-4 mb-3",
        children: [
          p.jsxs("div", {
            className: "flex items-baseline justify-between mb-1",
            children: [
              p.jsx("h3", {
                className: "text-base font-semibold tracking-tight m-0",
                children: rubric.rubricName,
              }),
              p.jsx("span", {
                className: "text-xs text-muted-foreground font-mono",
                children: rubric.rubricId,
              }),
            ],
          }),
          p.jsxs("div", {
            className: "text-xs text-muted-foreground mb-3",
            children: [
              rubric.totalScenarios,
              " completed scenario",
              rubric.totalScenarios === 1 ? "" : "s",
            ],
          }),
          p.jsx("div", {
            className: "space-y-2",
            children: rubric.dimensions.map((dimension) => renderDimension(rubric, dimension)),
          }),
        ],
      },
      rubric.rubricId,
    );

  return p.jsxs(p.Fragment, {
    children: [
      p.jsx(On, {
        eyebrow: "Human scoring",
        title: "Score completed runs",
        meta: "Pick a rubric dimension. You'll click through completed runs scoring just that dimension, one chat at a time.",
      }),
      data.rubrics.map(renderRubric),
    ],
  });
}
/** Returns the trimmed string when `e` is a non-blank string, else `null`. */
function PP(e) {
  if (typeof e !== "string") {
    return null;
  }
  const trimmed = e.trim();
  return trimmed ? trimmed : null;
}

/**
 * Normalises a list of facts to trimmed, non-empty strings. Entries may be
 * plain strings or objects with a string `fact` property; anything else is
 * dropped. A non-array input yields an empty list.
 */
function MP(e) {
  if (!Array.isArray(e)) {
    return [];
  }
  const facts = [];
  for (const entry of e) {
    let text = "";
    if (typeof entry === "string") {
      text = entry.trim();
    } else if (entry && typeof entry === "object" && typeof entry.fact === "string") {
      text = entry.fact.trim();
    }
    if (text.length > 0) {
      facts.push(text);
    }
  }
  return facts;
}

/**
 * Builds the "objective" summary shown to a human scorer from a scored
 * scenario record, reading the snake_case fields of `e.expectations`.
 */
function Dge(e) {
  const raw = e.expectations;
  const expectations = raw && typeof raw === "object" && !Array.isArray(raw) ? raw : {};

  const expectedTools = [];
  if (Array.isArray(expectations.expected_tools)) {
    for (const tool of expectations.expected_tools) {
      // Tools may be given as a bare name or as an object with a `name`.
      let toolName = "";
      if (typeof tool === "string") {
        toolName = tool;
      } else if (tool && typeof tool === "object" && typeof tool.name === "string") {
        toolName = tool.name;
      }
      if (toolName.length > 0) {
        expectedTools.push(toolName);
      }
    }
  }

  return {
    scenarioName: e.scenarioName,
    scenarioId: e.scenarioId,
    description: e.scenarioDescription,
    expectedBehavior: PP(expectations.expected_behavior),
    expectedOutcome: PP(expectations.expected_outcome),
    mustInclude: MP(expectations.must_include),
    mustNotInclude: MP(expectations.must_not_include),
    expectedTools,
  };
}

/**
 * Adapts a human-scoring record to the snake_case scenario-detail shape.
 * `passed`, `status`, `checkpoints` and `judge_dimension_scores` are fixed
 * placeholder values here.
 */
function zge(e) {
  const { scenarioId, scenarioName, overallScore, passThreshold } = e;
  return {
    scenario_id: scenarioId,
    scenario_name: scenarioName,
    passed: false,
    overall_score: overallScore,
    pass_threshold: passThreshold,
    status: "completed",
    turns: e.turns ?? [],
    tool_calls: e.toolCalls ?? [],
    target_events: e.targetEvents ?? [],
    checkpoints: [],
    judge_dimension_scores: [],
  };
}

// Applied in pixels as right-hand body padding by the scoring side panel.
const RP = 420;

/**
 * Scoring panel for one rubric dimension: dimension summary, a keyboard hint
 * and one button per scale level. Clicking a level calls `onSelect(value)`;
 * all buttons are disabled while `submitting`.
 */
function Ige({ dimension, onSelect, submitting }) {
  const levels = l1(dimension.scale);

  const summary = p.jsxs("div", {
    children: [
      p.jsx("div", {
        className:
          "text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",
        children: "Dimension",
      }),
      p.jsx("div", { className: "text-foreground font-medium", children: dimension.name }),
      p.jsxs("div", {
        className: "text-[11px] text-muted-foreground font-mono mt-0.5",
        children: [
          "weight ",
          dimension.weight,
          " · ",
          dimension.scale.type,
          dimension.scale.points ? ` · 1–${dimension.scale.points}` : "",
        ],
      }),
    ],
  });

  const keyboardHint = p.jsxs("div", {
    className: "text-[11px] text-muted-foreground",
    children: [
      "Press ",
      p.jsx("span", { className: "font-mono", children: "1" }),
      "–",
      p.jsx("span", { className: "font-mono", children: levels.length }),
      " on your keyboard, or click a level below.",
    ],
  });

  const levelButton = (level) =>
    p.jsxs(
      "button",
      {
        type: "button",
        disabled: submitting,
        onClick: () => onSelect(level.value),
        className:
          "w-full grid grid-cols-[40px_1fr] gap-3 items-start px-3 py-2.5 rounded-md border border-border bg-secondary/40 hover:border-primary hover:bg-secondary text-left disabled:opacity-60 disabled:cursor-progress transition-colors",
        children: [
          p.jsx("span", {
            className:
              "text-xl font-bold font-mono text-primary text-center leading-tight pt-0.5",
            children: level.value,
          }),
          p.jsx("span", {
            className: "text-xs whitespace-pre-wrap leading-relaxed",
            children: level.label,
          }),
        ],
      },
      level.value,
    );

  return p.jsxs("div", {
    className: "space-y-3 text-sm",
    children: [
      summary,
      keyboardHint,
      p.jsx("div", { className: "space-y-2", children: levels.map(levelButton) }),
    ],
  });
}
/**
 * Read-only summary of what a scenario was meant to achieve: name and id,
 * then — only where present — description, expected behaviour, expected
 * outcome, must-include / must-not-include lists and expected tools.
 */
function Lge({ objective }) {
  const caption = (text) =>
    p.jsx("div", {
      className:
        "text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",
      children: text,
    });

  // Captioned block of free text, preserving line breaks.
  const textSection = (title, text) =>
    p.jsxs("div", {
      children: [
        caption(title),
        p.jsx("div", { className: "text-foreground whitespace-pre-wrap", children: text }),
      ],
    });

  // Captioned bullet list.
  const listSection = (title, items) =>
    p.jsxs("div", {
      children: [
        caption(title),
        p.jsx("ul", {
          className: "list-disc list-inside text-foreground space-y-0.5",
          children: items.map((item, index) => p.jsx("li", { children: item }, index)),
        }),
      ],
    });

  const scenarioSection = p.jsxs("div", {
    children: [
      caption("Scenario"),
      p.jsx("div", {
        className: "text-foreground font-medium",
        children: objective.scenarioName,
      }),
      p.jsx("div", {
        className: "text-[11px] text-muted-foreground font-mono mt-0.5",
        children: objective.scenarioId,
      }),
    ],
  });

  const toolsSection =
    objective.expectedTools.length > 0
      ? p.jsxs("div", {
          children: [
            caption("Expected tools"),
            p.jsx("div", {
              className: "font-mono text-foreground",
              children: objective.expectedTools.join(", "),
            }),
          ],
        })
      : null;

  return p.jsxs("div", {
    className: "space-y-4 text-sm",
    children: [
      scenarioSection,
      objective.description ? textSection("Description", objective.description) : null,
      objective.expectedBehavior
        ? textSection("Expected behavior", objective.expectedBehavior)
        : null,
      objective.expectedOutcome
        ? textSection("Expected outcome", objective.expectedOutcome)
        : null,
      objective.mustInclude.length > 0
        ? listSection("Must include", objective.mustInclude)
        : null,
      objective.mustNotInclude.length > 0
        ? listSection("Must not include", objective.mustNotInclude)
        : null,
      toolsSection,
    ],
  });
}
space-y-0.5",children:e.mustInclude.map((t,n)=>p.jsx("li",{children:t},n))})]}):null,e.mustNotInclude.length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",children:"Must not include"}),p.jsx("ul",{className:"list-disc list-inside text-foreground space-y-0.5",children:e.mustNotInclude.map((t,n)=>p.jsx("li",{children:t},n))})]}):null,e.expectedTools.length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",children:"Expected tools"}),p.jsx("div",{className:"font-mono text-foreground",children:e.expectedTools.join(", ")})]}):null]})}function Bge({toolCalls:e}){const[t,n]=x.useState(null);return e.length===0?p.jsx("div",{className:"text-xs text-muted-foreground",children:"No tool calls recorded for this run."}):p.jsx("div",{className:"space-y-2",children:e.map((r,i)=>{const l=t===i,s=r.args===void 0||r.args===null?"—":JSON.stringify(r.args,null,2),u=r.raw===void 0||r.raw===null?null:JSON.stringify(r.raw,null,2);return p.jsxs("div",{className:"rounded-md border border-border bg-secondary/30",children:[p.jsxs("button",{type:"button",onClick:()=>n(l?null:i),className:"w-full flex items-center justify-between gap-2 px-3 py-2 text-left hover:bg-secondary",children:[p.jsxs("div",{className:"min-w-0",children:[p.jsx("div",{className:"text-sm font-mono truncate",children:r.name??"(unnamed)"}),p.jsxs("div",{className:"text-[11px] text-muted-foreground",children:["turn ",r.turn_index,r.call_order!==null?` · order ${r.call_order}`:""]})]}),p.jsx("span",{className:"text-muted-foreground text-xs shrink-0",children:l?"▾":"▸"})]}),l?p.jsxs("div",{className:"border-t border-border px-3 py-2 space-y-2",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground mb-1",children:"Args"}),p.jsx("pre",{className:"text-[11px] font-mono whitespace-pre-wrap break-words 
bg-background rounded p-2 max-h-72 overflow-auto",children:s})]}),u?p.jsxs("div",{children:[p.jsx("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground mb-1",children:"Raw"}),p.jsx("pre",{className:"text-[11px] font-mono whitespace-pre-wrap break-words bg-background rounded p-2 max-h-72 overflow-auto",children:u})]}):null]}):null]},`${r.turn_index}-${r.call_order??i}-${i}`)})})}/* $ge: fixed bottom-left pair of buttons that smooth-scroll the window to the top (top: 0) or to the bottom (document.documentElement.scrollHeight). Takes no props. */function $ge(){return p.jsxs("div",{className:"fixed bottom-4 left-4 z-40 flex flex-col gap-2",children:[p.jsx("button",{type:"button",onClick:()=>window.scrollTo({top:0,behavior:"smooth"}),className:"size-10 rounded-md border border-border bg-background shadow-lg hover:bg-secondary text-foreground flex items-center justify-center transition-colors","aria-label":"Jump to top",title:"Jump to top",children:p.jsx("span",{className:"text-lg leading-none",children:"↑"})}),p.jsx("button",{type:"button",onClick:()=>window.scrollTo({top:document.documentElement.scrollHeight,behavior:"smooth"}),className:"size-10 rounded-md border border-border bg-background shadow-lg hover:bg-secondary text-foreground flex items-center justify-center transition-colors","aria-label":"Jump to bottom",title:"Jump to bottom",children:p.jsx("span",{className:"text-lg leading-none",children:"↓"})})]})}/* Uge: floating right-hand side panel with Rubric / Objective / Tool calls views. State `l` is the open panel key ("rubric" initially) or null when the panel is closed; `u` is true when there is at least one tool call, which gates the Tool calls toggle and tab. While a panel is open the effect sets document.body's inline padding-right to RP px (with a 150ms transition) and restores the previous inline padding-right / transition values on cleanup. NOTE(review): the tab-button component `f` is created inside the render body, so it is a new component type on every render; confirm that remounting the tab buttons is acceptable. */function Uge({objective:e,dimension:t,toolCalls:n,onSubmitScore:r,submitting:i}){const[l,s]=x.useState("rubric"),u=n.length>0;x.useEffect(()=>{if(l===null)return;const d=document.body.style.paddingRight,g=document.body.style.transition;return document.body.style.transition="padding-right 150ms ease",document.body.style.paddingRight=`${RP}px`,()=>{document.body.style.paddingRight=d,document.body.style.transition=g}},[l]);const f=({panelKey:d,label:g})=>p.jsx("button",{type:"button",onClick:()=>s(d),className:`px-3 py-1 rounded-md text-sm transition-colors ${l===d?"bg-secondary text-foreground font-medium":"text-muted-foreground hover:text-foreground"}`,children:g});return 
p.jsxs(p.Fragment,{children:[/* bottom-right toggle buttons: clicking the active one closes the panel, otherwise switches to that view */p.jsxs("div",{className:"fixed bottom-4 right-4 z-40 flex flex-col items-end gap-2",children:[p.jsx("button",{type:"button",onClick:()=>s(d=>d==="rubric"?null:"rubric"),className:`px-4 py-2 rounded-md border text-sm font-medium shadow-lg transition-colors ${l==="rubric"?"bg-primary text-primary-foreground border-primary":"bg-background border-border hover:bg-secondary"}`,children:"Rubric"}),p.jsx("button",{type:"button",onClick:()=>s(d=>d==="objective"?null:"objective"),className:`px-4 py-2 rounded-md border text-sm font-medium shadow-lg transition-colors ${l==="objective"?"bg-primary text-primary-foreground border-primary":"bg-background border-border hover:bg-secondary"}`,children:"Objective"}),u?p.jsxs("button",{type:"button",onClick:()=>s(d=>d==="tools"?null:"tools"),className:`px-4 py-2 rounded-md border text-sm font-medium shadow-lg transition-colors ${l==="tools"?"bg-primary text-primary-foreground border-primary":"bg-background border-border hover:bg-secondary"}`,children:["Tool calls (",n.length,")"]}):null]}),/* the panel itself: RP px wide, header with tabs and a close button, scrollable body */l!==null?p.jsxs("div",{className:"fixed top-0 right-0 bottom-0 z-50 border-l border-border bg-background shadow-2xl flex flex-col",style:{width:`${RP}px`},children:[p.jsxs("div",{className:"px-4 py-3 border-b border-border flex items-center gap-2",children:[p.jsxs("div",{className:"flex-1 flex items-center gap-1 flex-wrap",children:[p.jsx(f,{panelKey:"rubric",label:"Rubric"}),p.jsx(f,{panelKey:"objective",label:"Objective"}),u?p.jsx(f,{panelKey:"tools",label:p.jsxs(p.Fragment,{children:["Tool calls"," ",p.jsxs("span",{className:"text-muted-foreground font-normal",children:["· ",n.length]})]})}):null]}),p.jsx("button",{type:"button",onClick:()=>s(null),className:"text-muted-foreground hover:text-foreground text-lg leading-none px-1","aria-label":"Close panel",children:"×"})]}),p.jsxs("div",{className:"flex-1 min-h-0 overflow-y-auto overscroll-contain 
p-4",children:[l==="rubric"?p.jsx(Ige,{dimension:t,onSelect:r,submitting:i}):null,l==="objective"&&e?p.jsx(Lge,{objective:e}):null,l==="tools"?p.jsx(Bge,{toolCalls:n}):null]})]}):null]})}function Fge({rubricId:e,dimensionId:t,request:n,navigate:r}){const[i,l]=x.useState(null),[s,u]=x.useState(null),[f,d]=x.useState(null),[g,h]=x.useState(!0),[v,b]=x.useState(!1),[S,j]=x.useState(null);x.useEffect(()=>{let T=!1;return h(!0),Promise.all([n("/api/human-scoring/rubrics"),n(`/api/human-scoring/next?rubric_id=${encodeURIComponent(e)}&dimension_id=${encodeURIComponent(t)}`)]).then(([k,I])=>{if(T)return;const R=k.rubrics.find(D=>D.rubricId===e)??null,F=R?.dimensions.find(D=>D.id===t)??null;l(R),u(F),d(I.item),j(null),h(!1)}).catch(k=>{T||(j(k instanceof Error?k.message:String(k)),h(!1))}),()=>{T=!0}},[n,e,t]);const E=x.useCallback(async T=>{if(!(!f||v)){b(!0),j(null);try{const k=await n("/api/human-scoring/scores",Si("POST",{scenario_run_id:f.scenarioRunId,rubric_id:e,dimension_id:t,raw_score:T}));d(k.next)}catch(k){j(k instanceof Error?k.message:String(k))}finally{b(!1)}}},[n,f,v,e,t]);if(x.useEffect(()=>{if(!f||v||!s)return;const T=l1(s.scale),k=new Set(T.map(R=>R.value));function I(R){const F=R.target;if(F instanceof HTMLElement&&/input|textarea|select/i.test(F.tagName))return;const D=Number(R.key);!Number.isFinite(D)||!k.has(D)||(R.preventDefault(),E(D))}return window.addEventListener("keydown",I),()=>window.removeEventListener("keydown",I)},[f,s,v,E]),g)return p.jsx(Ci,{withMeta:!0});if(S&&!f)return p.jsx(yt,{message:S});if(!i||!s)return p.jsxs(p.Fragment,{children:[p.jsx(yt,{message:"Unknown rubric or dimension."}),p.jsx(Le,{variant:"secondary",onClick:()=>r("/score"),children:"Back to scoring"})]});if(!f)return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:`${i.rubricName} · ${s.name}`,title:"Queue empty",meta:"No more unscored chats for this dimension."}),p.jsx("div",{className:"flex gap-2",children:p.jsx(Le,{onClick:()=>r("/score"),children:"Back to 
scoring"})})]});const O=zge(f),C=l1(s.scale),_=f.judgeDimensionRawScore!==null&&f.judgeDimensionRawScore!==void 0?`Judge scored this dimension ${f.judgeDimensionRawScore}`+(f.overallScore!==null&&f.overallScore!==void 0?` · overall ${f.overallScore.toFixed(2)}`:""):null,P=Dge(f);return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:`${i.rubricName} · ${s.name}`,title:f.scenarioName,meta:p.jsxs("span",{children:[p.jsx("span",{className:"font-mono",children:f.scenarioId})," · run ",p.jsx("a",{href:`/runs/${encodeURIComponent(f.runId)}`,children:f.runId.slice(0,8)})," · ",p.jsx("a",{href:`/runs/${encodeURIComponent(f.runId)}/scenarios/${f.ordinal}`,children:"detail"})," · ",p.jsx("a",{href:"/score",onClick:T=>{T.preventDefault(),r("/score")},children:"back"})]})}),p.jsxs("div",{className:"text-xs text-muted-foreground mb-3 font-mono",children:[p.jsx("strong",{className:"text-foreground text-base",children:f.remaining})," ","remaining"]}),_?p.jsx(rt,{className:"p-3 mb-4 border-l-4 border-l-muted-foreground/50",children:p.jsx("div",{className:"text-xs text-foreground",children:_})}):null,S?p.jsx(yt,{message:S}):null,p.jsx(rt,{className:"p-4 mb-4",children:p.jsx(zS,{detail:O})}),p.jsx("div",{className:"space-y-2 pb-24",children:C.map(T=>p.jsxs("button",{type:"button",disabled:v,onClick:()=>void E(T.value),className:"w-full grid grid-cols-[56px_1fr] gap-3 items-center px-4 py-3 rounded-md border border-border bg-secondary/40 hover:border-primary hover:bg-secondary text-left disabled:opacity-60 disabled:cursor-progress transition-colors",children:[p.jsx("span",{className:"text-2xl font-bold font-mono text-primary text-center",children:T.value}),p.jsx("span",{className:"text-sm whitespace-pre-wrap",children:T.label})]},T.value))}),p.jsx($ge,{}),p.jsx(Uge,{objective:P,dimension:s,toolCalls:f.toolCalls??[],onSubmitScore:T=>void E(T),submitting:v})]})}function Ip(e){if(!e)return null;const t=Date.parse(e);return Number.isNaN(t)?null:t/1e3}function Hge(e,t){const 
n=Ip(e);if(n==null)return 0;const r=Ip(t)??Date.now()/1e3;return Math.max(0,r-n)}function qge(e){return e.status==="running"?"running":e.status==="pending"?"pending":e.status==="error"||e.status==="runtime_error"?"error":e.passed===!0?"pass":e.passed===!1?"fail":"pending"}function Vge(e){if(!e)return null;if(typeof e=="string")return e;if(typeof e=="object"){const t=e.message;return typeof t=="string"?t:JSON.stringify(e)}return String(e)}function Kge(e){return{dimension_id:String(e.dimension_id??""),dimension_name:String(e.dimension_name??e.dimension_id??""),raw_score:typeof e.raw_score=="number"?e.raw_score:Number(e.raw_score),scale_points:e.scale_points==null?null:Number(e.scale_points),normalized_score:e.normalized_score==null?null:Number(e.normalized_score),weight:e.weight==null?null:Number(e.weight),reasoning:typeof e.reasoning=="string"?e.reasoning:"",evidence:Array.isArray(e.evidence)?e.evidence.map(String):[]}}function f5(e){return{scenario_id:e.scenarioId,scenario_name:e.scenarioName,user_id:e.userId??void 0,passed:e.passed===!0,overall_score:e.overallScore??null,pass_threshold:e.passThreshold??null,status:e.status,judge:e.judge?{provider:e.judge.provider??void 0,model:e.judge.model??void 0,temperature:e.judge.temperature??void 0,max_tokens:e.judge.maxTokens??void 0,overall_notes:e.judge.overallNotes??void 0,output:e.judge.output&&typeof e.judge.output=="object"&&!Array.isArray(e.judge.output)?e.judge.output:void 0}:void 0,turns:e.turns??[],tool_calls:e.toolCalls??[],target_events:e.targetEvents??[],checkpoints:e.checkpoints??[],judge_dimension_scores:(e.judgeDimensionScores??[]).map(Kge),retrieval_scores:e.retrievalScores??[],demotion_scores:e.demotionScores??[],procedure_scores:e.procedureScores??[],dedup_scores:e.dedupScores??[],expectations:e.expectations,error:e.error,counts:e.counts?{turn_count:e.counts.turnCount,assistant_turn_count:e.counts.assistantTurnCount,tool_call_count:e.counts.toolCallCount,checkpoint_count:e.counts.checkpointCount}:void 
0}}function Yge(e){const t=e.scenarios.map(l=>({scenario_id:l.scenarioId,scenario_name:l.scenarioName,status:qge(l),score:l.overallScore??null,error:Vge(l.error),started_at:Ip(l.startedAt),finished_at:Ip(l.completedAt)})),n={};for(const l of e.scenarios)n[l.ordinal]=f5(l);const r=t.filter(l=>l.status==="running").length,i=t.filter(l=>l.status!=="running"&&l.status!=="pending").length;return{total:e.aggregateCounts.scenarioTotal||t.length,elapsed:Hge(e.startedAt,e.completedAt),passed:e.aggregateCounts.scenarioPassedCount,failed:e.aggregateCounts.scenarioFailedCount,errored:e.aggregateCounts.scenarioErroredCount,running:r,done:i,all_done:!!e.completedAt||r===0,scenarios:t,details:n,averages:[]}}function Gge(){const[e,t]=x.useState(window.location.pathname);x.useEffect(()=>{const r=()=>t(window.location.pathname);return window.addEventListener("popstate",r),()=>window.removeEventListener("popstate",r)},[]);const n=x.useCallback(r=>{window.history.pushState({},"",r),t(window.location.pathname)},[]);return{pathname:e,navigate:n}}function Wge(e){x.useEffect(()=>{const t=n=>{if(n.defaultPrevented||!(n.target instanceof Element))return;const r=n.target.closest("a");if(!r)return;const i=r.getAttribute("href");!i?.startsWith("/")||i.startsWith("//")||i.startsWith("/api/")||r.target||(n.preventDefault(),e(i))};return document.addEventListener("click",t),()=>document.removeEventListener("click",t)},[e])}function IS(e){if(!e)return"—";try{return new Date(e).toLocaleString()}catch{return e}}const xc=10;function d5({runs:e,navigate:t,selectable:n=!0}){const[r,i]=x.useState(()=>new Set);if(x.useEffect(()=>{i(h=>{const v=new Set(e.map(S=>S.runId)),b=new Set;return h.forEach(S=>{v.has(S)&&b.add(S)}),b.size===h.size?h:b})},[e]),e.length===0)return p.jsx(Jp,{title:"No runs recorded",description:"Launch a preset or start an ad-hoc run to populate this table."});const l=(h,v)=>{i(b=>{const S=new Set(b);if(v){if(S.size>=xc&&!S.has(h))return b;S.add(h)}else S.delete(h);return 
S})},s=e.map(h=>h.runId).filter(h=>r.has(h)),/* u: Compare is disabled unless between 2 and xc runs are selected; f: helper text for the current selection size; d: compare URL with the selected run ids comma-joined; g: navigates via the `navigate` prop when given, otherwise pushes history state and dispatches a synthetic popstate */u=s.length<2||s.length>xc,f=s.length===0?"Pick 2+ runs to compare":s.length===1?"Pick at least one more run":s.length>xc?`Maximum ${xc} runs at a time`:`${s.length} runs selected`,d=`/compare?run_ids=${s.join(",")}`,g=()=>{u||(t?t(d):(window.history.pushState({},"",d),window.dispatchEvent(new PopStateEvent("popstate"))))};return p.jsxs(p.Fragment,{children:[n?p.jsxs("div",{className:"flex items-center gap-3 mb-3",children:[p.jsx(Le,{variant:"primary",size:"sm",onClick:g,disabled:u,children:"Compare selected"}),p.jsx("span",{className:"text-xs text-muted-foreground",children:f}),s.length>0?p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>i(new Set),children:"Clear"}):null]}):null,p.jsx(rt,{className:"overflow-hidden",children:p.jsx("div",{className:"overflow-x-auto",children:p.jsxs("table",{className:"w-full text-sm",children:[p.jsx("thead",{className:"bg-secondary",children:p.jsxs("tr",{className:"text-left text-muted-foreground text-xs uppercase tracking-wider",children:[n?p.jsx("th",{className:"px-3 py-2 w-8"}):null,p.jsx("th",{className:"px-3 py-2",children:"Name"}),p.jsx("th",{className:"px-3 py-2",children:"Status"}),p.jsx("th",{className:"px-3 py-2",children:"Preset"}),p.jsx("th",{className:"px-3 py-2",children:"Started"}),p.jsx("th",{className:"px-3 py-2 text-right",children:"Pass / Total"})]})}),p.jsx("tbody",{className:"divide-y divide-border",children:e.map(h=>{const v=r.has(h.runId),b=!v&&s.length>=xc;return p.jsxs("tr",{className:"hover:bg-secondary",children:[n?p.jsx("td",{className:"px-3 py-2 align-middle",children:p.jsx(bi,{checked:v,onChange:S=>{b&&S||l(h.runId,S)}})}):null,p.jsx("td",{className:"px-3 py-2",children:p.jsx("a",{href:`/runs/${encodeURIComponent(h.runId)}`,className:"text-foreground hover:text-primary",children:h.label?p.jsx("span",{className:"font-medium",children:h.label}):p.jsxs("span",{className:"font-mono text-xs 
text-muted-foreground",children:[h.runId.slice(0,12),"…"]})})}),p.jsx("td",{className:"px-3 py-2",children:p.jsx(Qp,{run:h})}),p.jsx("td",{className:"px-3 py-2 text-muted-foreground",children:h.preset??"—"}),p.jsx("td",{className:"px-3 py-2 text-muted-foreground",children:IS(h.startedAt)}),p.jsxs("td",{className:"px-3 py-2 text-right font-mono",children:[h.aggregateCounts.scenarioPassedCount,"/",h.aggregateCounts.scenarioTotal]})]},h.runId)})})]})})})]})}/* Xge: overview page. Requests /api/runs?limit=5 and /api/suites in parallel; renders the error banner if either fails, skeleton placeholders until both have resolved, then four stat tiles (total runs, passed and failed counts among the fetched runs, number of suites) followed by the latest-runs table. */function Xge({request:e,navigate:t}){const[n,r]=x.useState(null),[i,l]=x.useState(null),[s,u]=x.useState(null);if(x.useEffect(()=>{let g=!1;return Promise.all([e("/api/runs?limit=5"),e("/api/suites")]).then(([h,v])=>{g||(r(h),l(v),u(null))}).catch(h=>{g||u(h instanceof Error?h.message:String(h))}),()=>{g=!0}},[e]),s)return p.jsx(yt,{message:s});if(!n||!i)return p.jsxs(p.Fragment,{children:[p.jsx(Ci,{}),p.jsx(JR,{}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Latest Runs"}),p.jsx(Zp,{rows:5})]});const f=n.runs.filter(g=>g.passed===!0).length,d=n.runs.filter(g=>g.passed===!1).length;return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Overview",title:"AgentProbe"}),p.jsxs("div",{className:"grid grid-cols-2 sm:grid-cols-4 gap-3 mb-6",children:[p.jsx(yn,{label:"Total Runs",value:n.total}),p.jsx(yn,{label:"Recent Passed",tone:"success",value:f}),p.jsx(yn,{label:"Recent Failed",tone:"danger",value:d}),p.jsx(yn,{label:"Suites",tone:"accent",value:i.suites.length})]}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Latest Runs"}),p.jsx(d5,{runs:n.runs,navigate:t})]})}/* DP: `limit` sent with every run-history request. */const DP=50;/* Qge: paginated run history page. State: n = loaded runs, i = total reported by the API, s = next_cursor (null when there is no further page), f = "load more" in flight, g = first page loaded, v = last error message. */function Qge({request:e,navigate:t}){const[n,r]=x.useState([]),[i,l]=x.useState(0),[s,u]=x.useState(null),[f,d]=x.useState(!1),[g,h]=x.useState(!1),[v,b]=x.useState(null);x.useEffect(()=>{let j=!1;return 
e(`/api/runs?limit=${DP}&offset=0`).then(E=>{j||(r(E.runs),l(E.total),u(E.next_cursor??null),h(!0),b(null))}).catch(E=>{j||b(E instanceof Error?E.message:String(E))}),()=>{j=!0}},[e]);/* S: "Load more" handler. No-op without a cursor or while a load is in flight; sends the stored next_cursor as the `offset` query parameter and appends the returned runs. */const S=x.useCallback(()=>{!s||f||(d(!0),e(`/api/runs?limit=${DP}&offset=${encodeURIComponent(s)}`).then(j=>{r(E=>[...E,...j.runs]),l(j.total),u(j.next_cursor??null),b(null)}).catch(j=>{b(j instanceof Error?j.message:String(j))}).finally(()=>{d(!1)}))},[s,f,e]);return v&&!g?p.jsx(yt,{message:v}):g?p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"History",title:"Runs",meta:`${i} total`}),v?p.jsx(yt,{message:v}):null,p.jsx(d5,{runs:n,navigate:t}),s?p.jsx("div",{className:"mt-4 flex justify-center",children:p.jsx(Le,{variant:"secondary",onClick:S,disabled:f,children:f?"Loading...":`Load more (${n.length} of ${i})`})}):null]}):p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Zp,{rows:8})]})}/* Zge: run detail page. Fetches /api/runs/:runId, re-fetches whenever the EventSource on /api/runs/:runId/events emits a progress event, and closes the stream after run_finished / run_cancelled / run_error. Also offers Cancel (POST .../cancel) and a "Save as preset" modal (POST .../save-as-preset). Refs: _ = latest request fn, P = latest run id, T = mounted flag, I = latest fetch callback; the fetch ignores responses that arrive after unmount or after the run id changed. */function Zge({runId:e,request:t,navigate:n}){const[r,i]=x.useState(null),[l,s]=x.useState(null),[u,f]=x.useState(!1),[d,g]=x.useState(!1),[h,v]=x.useState(""),[b,S]=x.useState(""),[j,E]=x.useState(!1),[O,C]=x.useState(null),_=x.useRef(t),P=x.useRef(e),T=x.useRef(!0);_.current=t,P.current=e,x.useEffect(()=>(T.current=!0,()=>{T.current=!1}),[]);const k=x.useCallback(async()=>{const J=e;try{const G=await _.current(`/api/runs/${encodeURIComponent(J)}`);if(!T.current||P.current!==J)return;i(G.run),s(null)}catch(G){if(!T.current||P.current!==J)return;s(G instanceof Error?G.message:String(G))}},[e]),I=x.useRef(k);I.current=k,x.useEffect(()=>{i(null),s(null),k()},[k]),x.useEffect(()=>{const J=new EventSource(`/api/runs/${encodeURIComponent(e)}/events`),G=()=>{I.current()},Q=()=>{G(),J.close()};return 
J.addEventListener("snapshot",G),J.addEventListener("suite_started",G),J.addEventListener("scenario_started",G),J.addEventListener("scenario_finished",G),J.addEventListener("scenario_error",G),J.addEventListener("run_finished",Q),J.addEventListener("run_cancelled",Q),J.addEventListener("run_error",Q),()=>J.close()},[e]);/* R: cancel the run, then refresh */const R=async()=>{f(!0),s(null);try{await t(`/api/runs/${encodeURIComponent(e)}/cancel`,{method:"POST"}),await k()}catch(J){s(J instanceof Error?J.message:String(J))}finally{f(!1)}},F=x.useMemo(()=>r?Yge(r):null,[r]),D=x.useCallback(J=>{!n||!r||n(`/runs/${encodeURIComponent(r.runId)}/scenarios/${J}`)},[n,r]),/* H: open the save-as-preset modal, pre-filling the name from the run label, then the preset name, then "Run <first 8 chars of id>" */H=()=>{C(null);const J=r?.label??r?.preset??`Run ${r?.runId.slice(0,8)??""}`.trim();v(J),S(""),g(!0)},/* $: submit the modal; requires a non-blank name and navigates to the new preset when the response carries preset.id */$=async()=>{if(!r)return;const J=h.trim();if(J.length===0){C("Name is required.");return}E(!0),C(null);try{const G=await t(`/api/runs/${encodeURIComponent(r.runId)}/save-as-preset`,{method:"POST",headers:{"content-type":"application/json"},body:JSON.stringify({name:J,description:b.trim()||null})});g(!1),n&&G?.preset?.id&&n(`/presets/${encodeURIComponent(G.preset.id)}`)}catch(G){C(G instanceof Error?G.message:String(G))}finally{E(!1)}};return l?p.jsx(yt,{message:l}):!r||!F?p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(JR,{}),p.jsx(Qr,{lines:4,className:"mb-4"}),p.jsx(Zp,{rows:6,selectable:!1})]}):p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:r.presetId?p.jsxs("span",{children:["Run from preset"," ",p.jsx("a",{href:`/presets/${encodeURIComponent(r.presetId)}`,className:"text-primary hover:underline",children:r.preset??r.presetId})]}):"Run",title:p.jsx("span",{className:"font-mono text-base text-muted-foreground break-all",children:r.runId}),meta:p.jsxs("span",{children:["Started ",IS(r.startedAt)," · trigger ",r.trigger??"—"]}),actions:p.jsxs(p.Fragment,{children:[r.status==="running"&&p.jsx(Le,{variant:"secondary",onClick:()=>void 
R(),disabled:u,children:u?"Cancelling…":"Cancel"}),p.jsx(Le,{variant:"secondary",onClick:H,disabled:!r.scenarios||r.scenarios.length===0,title:"Create a preset that reuses this run's endpoint, personas, rubric, and scenarios",children:"Save as preset"}),p.jsx("a",{href:`/api/runs/${encodeURIComponent(r.runId)}/report.html`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-secondary text-foreground border-border hover:bg-primary hover:border-border no-underline",children:"HTML report"})]})}),/* save-as-preset modal; onClose is ignored while a save is in flight */p.jsx(M1,{open:d,onClose:()=>{j||g(!1)},title:"Save as preset",description:"Captures this run's endpoint, personas, rubric, and the exact scenario selection so you can re-run it later.",size:"md",footer:p.jsxs("div",{className:"flex items-center justify-end gap-2",children:[p.jsx(Le,{variant:"ghost",onClick:()=>g(!1),disabled:j,children:"Cancel"}),p.jsx(Le,{variant:"primary",onClick:()=>void $(),disabled:j||h.trim().length===0,children:j?"Saving…":"Save preset"})]}),children:p.jsxs("div",{className:"flex flex-col gap-4 py-2",children:[O?p.jsx(yt,{message:O}):null,p.jsx(Xe,{label:"Name",htmlFor:"preset-from-run-name",children:p.jsx(Kt,{id:"preset-from-run-name",value:h,onChange:J=>v(J.target.value),placeholder:"e.g. 
Nightly smoke (gpt-4o)",autoFocus:!0})}),p.jsx(Xe,{label:"Description",htmlFor:"preset-from-run-description",children:p.jsx(Kt,{id:"preset-from-run-description",value:b,onChange:J=>S(J.target.value),placeholder:"Optional"})})]})}),p.jsx(Pge,{run:r,request:t,onUpdated:J=>i(G=>G&&{...G,...J})}),p.jsx(l5,{data:F}),p.jsx(i5,{data:F}),p.jsx(a5,{data:F,runId:r.runId,onSelect:D}),p.jsx(YP,{averages:F.averages,onSelectRun:D})]})}/* Jge: scenario detail page. Loads /api/runs/:runId/scenarios/:ordinal, shows the error banner on failure and skeletons while loading, then renders the conversation and rubric side by side, plus an "Eval scores" card when t5(detail) is truthy. The status badge is fed a synthetic one-scenario aggregate derived from this scenario's passed/status fields. */function Jge({runId:e,ordinal:t,request:n}){const[r,i]=x.useState(null),[l,s]=x.useState(null);if(x.useEffect(()=>{let f=!1;return n(`/api/runs/${encodeURIComponent(e)}/scenarios/${encodeURIComponent(t)}`).then(d=>{f||(i(d),s(null))}).catch(d=>{f||s(d instanceof Error?d.message:String(d))}),()=>{f=!0}},[n,e,t]),l)return p.jsx(yt,{message:l});if(!r)return p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Qr,{lines:4,className:"mb-3"}),p.jsx(Qr,{lines:6})]});const u=f5(r.scenario);return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:p.jsxs("a",{href:`/runs/${encodeURIComponent(r.run.runId)}`,className:"text-primary hover:underline",children:["← Back to run"," ",p.jsxs("span",{className:"font-mono",children:[r.run.runId.slice(0,12),"…"]})]}),title:u.scenario_name,meta:p.jsxs("span",{children:["Scenario #",t," · ",u.scenario_id]}),actions:p.jsx(Qp,{run:{...r.run,exitCode:null,preset:null,aggregateCounts:{scenarioTotal:1,scenarioPassedCount:u.passed?1:0,scenarioFailedCount:u.passed?0:1,scenarioErroredCount:u.status==="error"?1:0}}})}),p.jsxs("div",{className:"grid grid-cols-1 lg:grid-cols-[minmax(0,1fr)_minmax(320px,0.65fr)] gap-4",children:[p.jsxs(rt,{className:"p-4",children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-3",children:"Conversation"}),p.jsx(zS,{detail:u})]}),p.jsxs(rt,{className:"p-4",children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-3",children:"Rubric"}),p.jsx(r5,{detail:u})]})]}),t5(u)&&p.jsxs(rt,{className:"mt-4 
p-4",children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-3",children:"Eval scores"}),p.jsx(e5,{detail:u})]})]})}/* eve: data catalogue page. Requests /api/suites and /api/scenarios in parallel; renders the error banner on failure, skeletons until both resolve, then a header showing the data path with suite/scenario counts, a banner with the number of suite files that had validation errors (when any), and the Suites and Scenarios tables. */function eve({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState(null),[l,s]=x.useState(null);return x.useEffect(()=>{let u=!1;return Promise.all([e("/api/suites"),e("/api/scenarios")]).then(([f,d])=>{u||(n(f),i(d),s(null))}).catch(f=>{u||s(f instanceof Error?f.message:String(f))}),()=>{u=!0}},[e]),l?p.jsx(yt,{message:l}):!t||!r?p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Qr,{lines:5,className:"mb-3"}),p.jsx(Qr,{lines:5})]}):p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Data root",title:p.jsx("span",{className:"font-mono text-base text-foreground break-all",children:t.data_path}),meta:`${t.suites.length} suite${t.suites.length===1?"":"s"} · ${r.scenarios.length} scenario${r.scenarios.length===1?"":"s"}`}),t.errors.length>0&&p.jsx(yt,{message:`${t.errors.length} suite files had validation errors.`}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Suites"}),p.jsx(rt,{className:"overflow-hidden mb-6",children:p.jsx("div",{className:"overflow-x-auto",children:p.jsxs("table",{className:"w-full text-sm",children:[p.jsx("thead",{className:"bg-secondary",children:p.jsxs("tr",{className:"text-left text-muted-foreground text-xs uppercase tracking-wider",children:[p.jsx("th",{className:"px-3 py-2",children:"Suite"}),p.jsx("th",{className:"px-3 py-2",children:"Schema"}),p.jsx("th",{className:"px-3 py-2",children:"Path"}),p.jsx("th",{className:"px-3 py-2 text-right",children:"Objects"})]})}),p.jsx("tbody",{className:"divide-y divide-border",children:t.suites.map(u=>p.jsxs("tr",{className:"hover:bg-secondary",children:[p.jsx("td",{className:"px-3 py-2 font-mono text-xs",children:u.id}),p.jsx("td",{className:"px-3 py-2",children:p.jsx(Pt,{tone:"info",children:u.schema})}),p.jsx("td",{className:"px-3 py-2 font-mono text-xs 
text-muted-foreground break-all",children:u.relativePath}),p.jsx("td",{className:"px-3 py-2 text-right font-mono",children:u.objectCount})]},u.id))})]})})}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Scenarios"}),p.jsx(rt,{className:"overflow-hidden",children:p.jsx("div",{className:"overflow-x-auto",children:p.jsxs("table",{className:"w-full text-sm",children:[p.jsx("thead",{className:"bg-secondary",children:p.jsxs("tr",{className:"text-left text-muted-foreground text-xs uppercase tracking-wider",children:[p.jsx("th",{className:"px-3 py-2",children:"Scenario"}),p.jsx("th",{className:"px-3 py-2",children:"Name"}),p.jsx("th",{className:"px-3 py-2",children:"Suite"}),p.jsx("th",{className:"px-3 py-2",children:"Tags"}),p.jsx("th",{className:"px-3 py-2",children:"Rubric"})]})}),p.jsx("tbody",{className:"divide-y divide-border",children:r.scenarios.map(u=>p.jsxs("tr",{className:"hover:bg-secondary",children:[p.jsx("td",{className:"px-3 py-2 font-mono text-xs",children:u.id}),p.jsx("td",{className:"px-3 py-2",children:u.name}),p.jsx("td",{className:"px-3 py-2 text-muted-foreground",children:u.suiteId}),p.jsx("td",{className:"px-3 py-2",children:p.jsxs("div",{className:"flex flex-wrap gap-1",children:[u.tags.map(f=>p.jsx(Pt,{children:f},f)),u.tags.length===0?p.jsx("span",{className:"text-muted-foreground/70",children:"—"}):null]})}),p.jsx("td",{className:"px-3 py-2 text-muted-foreground",children:u.rubric??"—"})]},`${u.suiteId}:${u.id}`))})]})})})]})}/* tve: run builder page. State, in declaration order: suites, scenarios and presets responses; selected endpoint / personas / rubric suite paths; selected preset id ("" = ad-hoc); Set of selected scenario keys ("sourcePath::id"); label; parallel enabled; parallel limit (initially 2); repeat (initially 1); dry run (initially on); save-as-preset toggle and name; error message; submitting flag; scenario whose details dialog is open. */function tve({request:e,navigate:t}){const[n,r]=x.useState(null),[i,l]=x.useState(null),[s,u]=x.useState(null),[f,d]=x.useState(""),[g,h]=x.useState(""),[v,b]=x.useState(""),[S,j]=x.useState(""),[E,O]=x.useState(new Set),[C,_]=x.useState(""),[P,T]=x.useState(!1),[k,I]=x.useState(2),[R,F]=x.useState(1),[D,H]=x.useState(!0),[$,J]=x.useState(!1),[G,Q]=x.useState(""),[B,Y]=x.useState(null),[ie,ae]=x.useState(!1),[M,L]=x.useState(null);x.useEffect(()=>{let 
W=!1;return Promise.all([e("/api/suites"),e("/api/scenarios"),e("/api/presets")]).then(([ge,ne,se])=>{W||(r(ge),l(ne),u(se),/* default each selector to the first suite of the matching schema, or "" when there is none */d(ge.suites.find(ye=>ye.schema==="endpoints")?.relativePath??""),h(ge.suites.find(ye=>ye.schema==="personas")?.relativePath??""),b(ge.suites.find(ye=>ye.schema==="rubrics")?.relativePath??""))}).catch(ge=>{W||Y(ge instanceof Error?ge.message:String(ge))}),()=>{W=!0}},[e]);/* te: the selected scenarios as {file, id} pairs, in the order they appear in the scenarios response */const te=x.useMemo(()=>i?i.scenarios.filter(W=>E.has(`${W.sourcePath}::${W.id}`)).map(W=>({file:W.sourcePath,id:W.id})):[],[i,E]);if(B)return p.jsx(yt,{message:B});if(!n||!i||!s)return p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Qr,{lines:6,className:"mb-3"}),p.jsx(Qr,{lines:4})]});const z=n.suites.filter(W=>W.schema==="endpoints"),he=n.suites.filter(W=>W.schema==="personas"),ve=n.suites.filter(W=>W.schema==="rubrics"),/* pe: submit handler. With a preset selected it POSTs only label + overrides to /api/presets/:id/runs; otherwise it POSTs the full ad-hoc configuration to /api/runs. On success it navigates to the new run's page using the response's run_id. */pe=async W=>{W.preventDefault(),ae(!0),Y(null);try{const ge={enabled:P,limit:P?k:void 0},ne=S?await e(`/api/presets/${encodeURIComponent(S)}/runs`,Si("POST",{label:C||void 0,overrides:{parallel:ge,repeat:R,dry_run:D}})):await e("/api/runs",Si("POST",{endpoint:f,personas:g,rubric:v,selection:te,parallel:ge,repeat:R,dry_run:D,label:C||void 0,save_as_preset:$&&G.trim()?{name:G.trim()}:void 0}));t(`/runs/${encodeURIComponent(ne.run_id)}`)}catch(ge){Y(ge instanceof Error?ge.message:String(ge))}finally{ae(!1)}};return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Start",title:"Run builder",meta:S?"Launching from preset — overrides only":`${E.size} scenario${E.size===1?"":"s"} selected`,actions:p.jsx(Le,{onClick:W=>pe(W),disabled:ie,children:ie?"Starting…":"Start run"})}),B?p.jsx(yt,{message:B}):null,p.jsxs("form",{onSubmit:pe,className:"flex flex-col gap-4",children:[p.jsxs(rt,{className:"p-4 flex flex-col gap-3",children:[p.jsx(Xe,{label:"Preset",children:p.jsx(sr,{value:S||"__adhoc__",onValueChange:W=>j(W==="__adhoc__"?"":W),options:[{value:"__adhoc__",label:"Ad-hoc (build from 
scratch)"},...s.presets.map(W=>({value:W.id,label:W.name}))]})}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Endpoint",children:p.jsx(sr,{value:f,onValueChange:d,disabled:!!S,options:z.map(W=>({value:W.relativePath,label:W.relativePath})),emptyLabel:"No endpoint suites found"})}),p.jsx(Xe,{label:"Personas",children:p.jsx(sr,{value:g,onValueChange:h,disabled:!!S,options:he.map(W=>({value:W.relativePath,label:W.relativePath})),emptyLabel:"No persona suites found"})}),p.jsx(Xe,{label:"Rubric",children:p.jsx(sr,{value:v,onValueChange:b,disabled:!!S,options:ve.map(W=>({value:W.relativePath,label:W.relativePath})),emptyLabel:"No rubric suites found"})})]})]}),/* the scenario picker is only shown for ad-hoc runs (no preset selected) */!S&&p.jsxs(rt,{className:"overflow-hidden",children:[p.jsxs("div",{className:"p-3 border-b border-border flex items-center justify-between",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold",children:"Scenarios"}),p.jsxs("div",{className:"text-xs text-muted-foreground mt-0.5",children:[i.scenarios.length," available"]})]}),p.jsx(Le,{variant:"secondary",size:"sm",onClick:()=>O(new Set(i.scenarios.map(W=>`${W.sourcePath}::${W.id}`))),children:"Select all"})]}),p.jsx("div",{className:"max-h-[420px] overflow-y-auto divide-y divide-border",children:i.scenarios.map(W=>{const ge=`${W.sourcePath}::${W.id}`,ne=E.has(ge);return p.jsxs("div",{className:`flex items-start gap-3 px-3 py-2.5 hover:bg-secondary ${ne?"bg-primary/5":""}`,children:[p.jsxs("label",{className:"flex items-start gap-3 flex-1 min-w-0 cursor-pointer",children:[p.jsx("input",{type:"checkbox",checked:ne,onChange:se=>{const ye=new Set(E);se.currentTarget.checked?ye.add(ge):ye.delete(ge),O(ye)},className:"size-4 mt-0.5 accent-primary shrink-0"}),p.jsxs("div",{className:"flex-1 min-w-0",children:[p.jsxs("div",{className:"flex items-baseline gap-2 flex-wrap",children:[p.jsx("span",{className:"text-sm font-medium 
text-foreground",children:W.name||W.id}),p.jsx("span",{className:"font-mono text-[11px] text-muted-foreground",children:W.id}),W.priority?p.jsx(Pt,{tone:"info",children:W.priority}):null]}),W.description?p.jsx("div",{className:"text-xs text-muted-foreground mt-0.5 line-clamp-2",children:W.description}):null,p.jsxs("div",{className:"flex items-center gap-1.5 mt-1 flex-wrap",children:[W.tags.slice(0,5).map(se=>p.jsx(Pt,{children:se},se)),p.jsx("span",{className:"font-mono text-[10px] text-muted-foreground/70",children:W.sourcePath})]})]})]}),p.jsx(Le,{type:"button",variant:"ghost",size:"sm",className:"shrink-0 self-start",onClick:()=>L({file:W.sourcePath,id:W.id,name:W.name,description:W.description,tags:W.tags,priority:W.priority}),children:"Details"})]},ge)})})]}),p.jsxs(rt,{className:"p-4 flex flex-col gap-3",children:[p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Label",hint:"Shown in the run list.",children:p.jsx(Kt,{value:C,onChange:W=>_(W.currentTarget.value),maxLength:200})}),p.jsx(Xe,{label:"Repeat",children:p.jsx(Kt,{type:"number",min:1,value:R,onChange:W=>F(Number(W.currentTarget.value))})}),p.jsx(Xe,{label:"Parallel limit",hint:"Max concurrent scenarios when parallel is on. 2-4 is typical; higher = faster but more LLM cost spikes.",children:p.jsx(Kt,{type:"number",min:1,value:k,onChange:W=>I(Number(W.currentTarget.value)),disabled:!P})})]}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Dry run",hint:"Records run + scenario rows but skips the live adapter, judge, and scorers. Use to validate config without spending LLM tokens.",children:p.jsx(bi,{checked:D,onChange:H,label:"Enabled"})}),p.jsx(Xe,{label:"Parallel",hint:"Run multiple scenarios concurrently. 
Scenarios still complete in order, but several run at a time (set the limit above).",children:p.jsx(bi,{checked:P,onChange:T,label:"Enabled"})}),S?null:p.jsx(Xe,{label:"Save as preset",hint:"Save this scenario selection + settings as a reusable preset visible on the Presets page.",children:p.jsx(bi,{checked:$,onChange:J,label:"Enabled"})})]}),$&&!S?p.jsx(Xe,{label:"Preset name",children:p.jsx(Kt,{value:G,onChange:W=>Q(W.currentTarget.value),placeholder:"e.g. Smoke suite"})}):null]}),p.jsx("div",{className:"flex justify-end",children:p.jsx(Le,{type:"submit",disabled:ie,children:ie?"Starting…":"Start run"})})]}),p.jsx(u5,{open:M!=null,target:M,request:e,onClose:()=>L(null)})]})}/* h5: maps a preset record to { presetId, presetName, defaults }, where defaults carries endpoint, personas, rubric, parallelEnabled / parallelLimit (read from e.parallel), repeat and dryRun (from e.dry_run). */function h5(e){return{presetId:e.id,presetName:e.name,defaults:{endpoint:e.endpoint,personas:e.personas,rubric:e.rubric,parallelEnabled:e.parallel.enabled,parallelLimit:e.parallel.limit,repeat:e.repeat,dryRun:e.dry_run}}}/* p5: classifies a string by case-insensitive substring match, checked in this order: "autogpt", "openclaw", "opencode"; anything else is "custom". */function p5(e){const t=e.toLowerCase();return t.includes("autogpt")?"autogpt":t.includes("openclaw")?"openclaw":t.includes("opencode")?"opencode":"custom"}/* nve: presets list page. Requests /api/presets and /api/suites in parallel and shows the error banner if either fails. */function nve({request:e,navigate:t}){const[n,r]=x.useState(null),[i,l]=x.useState(null),[s,u]=x.useState(null),[f,d]=x.useState(null);return x.useEffect(()=>{let g=!1;return Promise.all([e("/api/presets"),e("/api/suites")]).then(([h,v])=>{g||(r(h),l(v))}).catch(h=>{g||u(h instanceof Error?h.message:String(h))}),()=>{g=!0}},[e]),s?p.jsx(yt,{message:s}):n?p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Presets",title:"Saved Configurations",meta:`${n.presets.length} preset${n.presets.length===1?"":"s"}`,actions:p.jsx("a",{href:"/start",className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-primary text-background border-primary hover:bg-primary/90 hover:border-primary no-underline",children:"New preset"})}),n.presets.length===0?p.jsx(Jp,{title:"No presets yet",description:"Build a run on the Start tab and save it as a preset to make it 
repeatable.",action:p.jsx("a",{href:"/start",className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-primary text-background border-primary hover:bg-primary/90 hover:border-primary no-underline",children:"Build your first preset"})}):p.jsx("div",{className:"grid grid-cols-1 md:grid-cols-2 xl:grid-cols-3 gap-3",children:n.presets.map(g=>{const h=p5(g.endpoint);return p.jsxs(rt,{className:"p-4 hover:border-border transition-colors flex flex-col",children:[p.jsxs("div",{className:"flex items-start justify-between gap-2 mb-2",children:[p.jsx("a",{href:`/presets/${encodeURIComponent(g.id)}`,className:"text-base font-semibold text-foreground hover:text-primary no-underline truncate",title:g.name,children:g.name}),p.jsx(Pt,{tone:h==="custom"?"default":"info",children:h})]}),g.description?p.jsx("div",{className:"text-sm text-muted-foreground mb-3 line-clamp-2",children:g.description}):null,p.jsxs("div",{className:"grid grid-cols-3 gap-2 mb-3 text-xs",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-muted-foreground/70 uppercase tracking-wider text-[10px]",children:"Scenarios"}),p.jsx("div",{className:"font-mono text-foreground",children:g.selection.length})]}),p.jsxs("div",{children:[p.jsx("div",{className:"text-muted-foreground/70 uppercase tracking-wider text-[10px]",children:"Repeat"}),p.jsx("div",{className:"font-mono text-foreground",children:g.repeat})]}),p.jsxs("div",{children:[p.jsx("div",{className:"text-muted-foreground/70 uppercase tracking-wider text-[10px]",children:"Parallel"}),p.jsx("div",{className:"font-mono text-foreground",children:g.parallel.enabled?`×${g.parallel.limit??"?"}`:"off"})]})]}),p.jsx("div",{className:"text-xs text-muted-foreground mb-3 flex items-center gap-2 min-h-[1.25rem]",children:g.last_run?p.jsxs(p.Fragment,{children:[p.jsx(Qp,{run:g.last_run}),p.jsx("span",{children:IS(g.last_run.startedAt)})]}):p.jsx("span",{className:"italic 
text-muted-foreground/70",children:"Never run"})}),p.jsxs("div",{className:"flex items-center gap-2 mt-auto pt-3 border-t border-border",children:[p.jsx(Le,{size:"sm",onClick:()=>d(h5(g)),children:"Launch run"}),p.jsx("a",{href:`/presets/${encodeURIComponent(g.id)}/edit`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-2.5 py-1 text-xs bg-secondary text-foreground border-border hover:bg-primary hover:border-border no-underline",children:"Edit"}),p.jsx("a",{href:`/presets/${encodeURIComponent(g.id)}`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-2.5 py-1 text-xs bg-transparent text-muted-foreground border-transparent hover:bg-secondary hover:text-foreground no-underline",children:"History"})]})]},g.id)})}),p.jsx(c5,{open:f!=null,options:f,request:e,suites:i,onClose:()=>d(null),onLaunched:g=>{d(null),t(`/runs/${encodeURIComponent(g)}`)}})]}):p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-3",children:[p.jsx(Qr,{lines:4}),p.jsx(Qr,{lines:4}),p.jsx(Qr,{lines:4})]})]})}function rve({presetId:e,request:t,navigate:n}){const[r,i]=x.useState(null),[l,s]=x.useState(null),[u,f]=x.useState(null),[d,g]=x.useState(null),[h,v]=x.useState(null);x.useEffect(()=>{let j=!1;return Promise.all([t(`/api/presets/${encodeURIComponent(e)}`),t(`/api/presets/${encodeURIComponent(e)}/runs`),t("/api/suites")]).then(([E,O,C])=>{j||(i(E),s(O),f(C))}).catch(E=>{j||g(E instanceof Error?E.message:String(E))}),()=>{j=!0}},[t,e]);const b=async()=>{if(confirm("Delete this preset? 
Past runs will remain in history."))try{await t(`/api/presets/${encodeURIComponent(e)}`,{method:"DELETE"}),n("/presets")}catch(j){g(j instanceof Error?j.message:String(j))}};if(d)return p.jsx(yt,{message:d});if(!r||!l)return p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Qr,{lines:4,className:"mb-3"}),p.jsx(Zp,{rows:5,selectable:!1})]});const S=p5(r.preset.endpoint);return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Preset",title:r.preset.name,meta:r.preset.description??void 0,actions:p.jsxs(p.Fragment,{children:[p.jsx(Le,{onClick:()=>v(h5(r.preset)),children:"Launch run"}),p.jsx("a",{href:`/presets/${encodeURIComponent(e)}/edit`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-secondary text-foreground border-border hover:bg-primary hover:border-border no-underline",children:"Edit"}),p.jsx(Le,{variant:"danger",onClick:()=>void b(),children:"Delete"})]})}),r.warnings.map(j=>p.jsx(yt,{message:j.message},`${j.file}:${j.id}`)),p.jsxs("div",{className:"grid grid-cols-2 sm:grid-cols-4 gap-3 mb-6",children:[p.jsx(yn,{label:"Scenarios",value:r.preset.selection.length}),p.jsx(yn,{label:"Repeat",value:r.preset.repeat}),p.jsx(yn,{label:"Parallel",value:r.preset.parallel.enabled?`×${r.preset.parallel.limit??"?"}`:"off"}),p.jsx(yn,{label:"Endpoint",tone:S==="custom"?"default":"accent",value:p.jsx("span",{className:"text-base font-mono",children:S})})]}),p.jsxs(rt,{className:"p-4 mb-6",children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Configuration"}),p.jsxs("dl",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3 text-sm",children:[p.jsxs("div",{children:[p.jsx("dt",{className:"text-muted-foreground/70 text-xs",children:"Endpoint"}),p.jsx("dd",{className:"font-mono text-foreground break-all",children:r.preset.endpoint})]}),p.jsxs("div",{children:[p.jsx("dt",{className:"text-muted-foreground/70 
text-xs",children:"Personas"}),p.jsx("dd",{className:"font-mono text-foreground break-all",children:r.preset.personas})]}),p.jsxs("div",{children:[p.jsx("dt",{className:"text-muted-foreground/70 text-xs",children:"Rubric"}),p.jsx("dd",{className:"font-mono text-foreground break-all",children:r.preset.rubric})]})]})]}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Run history"}),p.jsx(Tge,{runs:l.runs,navigate:n,presetName:r.preset.name}),p.jsx(c5,{open:h!=null,options:h,request:t,suites:u,onClose:()=>v(null),onLaunched:j=>{v(null),n(`/runs/${encodeURIComponent(j)}`)}})]})}function ive(e){return e?e.configured?e.source==="db"?"stored on server":e.source==="env"?"from environment variable":"configured":"not set":"loading…"}function ave({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState(""),[l,s]=x.useState(!1),[u,f]=x.useState(null),[d,g]=x.useState(null),h=x.useCallback(async()=>{try{const j=await e("/api/settings/secrets/open_router_api_key");n(j.open_router_api_key),f(null)}catch(j){f(j instanceof Error?j.message:String(j))}},[e]);x.useEffect(()=>{let j=!1;return(async()=>j||await h())(),()=>{j=!0}},[h]);const v=async j=>{j.preventDefault();const E=r.trim();if(E){s(!0),g(null);try{const O=await e("/api/settings/secrets/open_router_api_key",Si("PUT",{value:E}));n(O.open_router_api_key),i(""),f(null),g("Saved.")}catch(O){f(O instanceof Error?O.message:String(O))}finally{s(!1)}}},b=async()=>{s(!0),g(null);try{const j=await e("/api/settings/secrets/open_router_api_key",Si("DELETE"));n(j.open_router_api_key),i(""),f(null),g("Cleared.")}catch(j){f(j instanceof Error?j.message:String(j))}finally{s(!1)}},S=t?.source==="db";return p.jsx(rt,{className:"p-4 mb-4",children:p.jsxs("form",{className:"flex flex-col gap-2",onSubmit:v,children:[p.jsx("label",{htmlFor:"open-router-api-key",className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold",children:"OpenRouter API 
key"}),p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx(Kt,{id:"open-router-api-key",type:"password",value:r,onChange:j=>i(j.currentTarget.value),placeholder:"sk-or-...",autoComplete:"off"}),p.jsx(Le,{type:"submit",disabled:l||!r.trim(),children:"Save"}),S?p.jsx(Le,{type:"button",variant:"ghost",disabled:l,onClick:()=>{b()},children:"Clear"}):null]}),p.jsxs("div",{className:"text-xs text-muted-foreground",children:["Status: ",ive(t)]}),d?p.jsx("div",{className:"text-xs text-success",children:d}):null,u?p.jsx("div",{className:"text-xs text-destructive",children:u}):null]})})}function lve({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState(null),[l,s]=x.useState(null);return x.useEffect(()=>{let u=!1;return Promise.all([fetch("/healthz").then(f=>f.json()),fetch("/readyz").then(f=>f.json())]).then(([f,d])=>{u||(n(f),i(d),s(null))}).catch(f=>{u||s(f instanceof Error?f.message:String(f))}),()=>{u=!0}},[]),p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Settings",title:"Server"}),l?p.jsx(yt,{message:l}):null,p.jsxs("div",{className:"grid grid-cols-2 sm:grid-cols-4 gap-3 mb-6",children:[p.jsx(yn,{label:"Health",tone:t?.status==="ok"?"success":"default",value:p.jsx("span",{className:"text-base font-mono",children:t?.status??"—"})}),p.jsx(yn,{label:"Readiness",tone:r?.status==="ready"?"success":"default",value:p.jsx("span",{className:"text-base font-mono",children:r?.status??"—"})}),p.jsx(yn,{label:"Version",value:p.jsx("span",{className:"text-base font-mono",children:t?.version??"—"})}),p.jsx(yn,{label:"Database",value:p.jsx("span",{className:"text-base font-mono",children:r?.db_url?"sqlite":"—"})})]}),p.jsx(ave,{request:e})]})}function ove(){const{pathname:e,navigate:t}=Gge(),n=K6();Wge(t);const r=(()=>{if(e==="/"||e==="/index.html")return p.jsx(Xge,{request:n,navigate:t});if(e==="/runs")return p.jsx(Qge,{request:n,navigate:t});if(e==="/start")return p.jsx(tve,{request:n,navigate:t});if(e==="/presets")return 
p.jsx(nve,{request:n,navigate:t});if(e==="/suites")return p.jsx(eve,{request:n});if(e==="/endpoints")return p.jsx(wge,{request:n});if(e==="/settings")return p.jsx(lve,{request:n});if(e==="/compare")return p.jsx(Zse,{});if(e==="/score")return p.jsx(Rge,{request:n,navigate:t});const l=e.match(/^\/score\/([^/]+)\/([^/]+)$/);if(l)return p.jsx(Fge,{rubricId:decodeURIComponent(l[1]??""),dimensionId:decodeURIComponent(l[2]??""),request:n,navigate:t});const s=e.match(/^\/runs\/([^/]+)\/scenarios\/([0-9]+)$/);if(s)return p.jsx(Jge,{runId:decodeURIComponent(s[1]??""),ordinal:s[2]??"0",request:n});const u=e.match(/^\/runs\/([^/]+)$/);if(u)return p.jsx(Zge,{runId:decodeURIComponent(u[1]??""),request:n,navigate:t});const f=e.match(/^\/presets\/([^/]+)\/edit$/);if(f)return p.jsx(Cge,{presetId:decodeURIComponent(f[1]??""),request:n,navigate:t});const d=e.match(/^\/presets\/([^/]+)$/);return d?p.jsx(rve,{presetId:decodeURIComponent(d[1]??""),request:n,navigate:t}):p.jsx(yt,{message:"Page not found."})})(),i=[{href:"/",label:"Overview",isActive:l=>l==="/"||l==="/index.html"},{href:"/start",label:"Start",isActive:l=>l==="/start"},{href:"/runs",label:"Runs",isActive:l=>l==="/runs"||l.startsWith("/runs/")},{href:"/score",label:"Score",isActive:l=>l==="/score"||l.startsWith("/score/")},{href:"/presets",label:"Presets",isActive:l=>l==="/presets"||l.startsWith("/presets/")},{href:"/suites",label:"Suites",isActive:l=>l.startsWith("/suites")},{href:"/endpoints",label:"Endpoints",isActive:l=>l.startsWith("/endpoints")},{href:"/settings",label:"Settings",isActive:l=>l==="/settings"}];return p.jsxs("div",{className:"min-h-screen bg-background",children:[p.jsx("header",{className:"sticky top-0 z-30 border-b border-border bg-background/85 backdrop-blur supports-[backdrop-filter]:bg-background/65",children:p.jsxs("div",{className:"mx-auto max-w-[1280px] px-6 h-14 flex items-center justify-between gap-6",children:[p.jsxs("a",{href:"/",className:"flex items-center gap-2.5 no-underline 
text-foreground",children:[p.jsx("span",{className:"inline-block size-2 rounded-full bg-primary shadow-[0_0_0_4px_hsl(var(--primary)/0.12)]"}),p.jsx("span",{className:"text-sm font-semibold tracking-tight",children:"AgentProbe"})]}),p.jsxs("div",{className:"flex items-center gap-1",children:[p.jsx("nav",{className:"hidden md:flex items-center gap-0.5",children:i.map(l=>{const s=l.isActive(e);return p.jsxs("a",{href:l.href,className:`relative px-3 h-14 inline-flex items-center text-sm transition-colors no-underline ${s?"text-foreground font-medium":"text-muted-foreground hover:text-foreground"}`,children:[l.label,s?p.jsx("span",{className:"absolute bottom-[-1px] left-3 right-3 h-px bg-primary"}):null]},l.href)})}),p.jsx("nav",{className:"md:hidden flex items-center gap-1 overflow-x-auto",children:i.map(l=>{const s=l.isActive(e);return p.jsx("a",{href:l.href,className:`px-2.5 h-8 inline-flex items-center rounded-md text-xs transition-colors no-underline ${s?"bg-secondary text-foreground":"text-muted-foreground hover:text-foreground"}`,children:l.label},l.href)})}),p.jsx("div",{className:"ml-2 pl-2 border-l border-border",children:p.jsx(yge,{})})]})]})}),p.jsx("main",{className:"mx-auto max-w-[1280px] px-6 py-8",children:r})]})}function sve(){const{data:e,error:t}=xge(),[n,r]=x.useState(null);if(t&&!e)return p.jsxs("div",{style:{padding:48,textAlign:"center",color:"var(--muted)"},children:[p.jsx("div",{style:{fontSize:16,marginBottom:8},children:"Waiting for run to start..."}),p.jsx("div",{style:{fontSize:12},children:t})]});if(!e)return p.jsx("div",{style:{padding:48,textAlign:"center",color:"var(--muted)"},children:"Loading..."});const i=n!=null?e.details[n]??null:null;return p.jsxs(p.Fragment,{children:[p.jsxs("div",{className:"header",children:[p.jsx("h1",{children:"AgentProbe Live 
Dashboard"}),p.jsxs("span",{className:"live-badge",children:[p.jsx("span",{className:e.all_done?"done-dot":"live-dot"}),e.all_done?"COMPLETE":"LIVE"]})]}),p.jsx(l5,{data:e}),p.jsx(i5,{data:e}),p.jsx(a5,{data:e,onSelect:r}),p.jsx(YP,{averages:e.averages,onSelectRun:r}),p.jsxs("div",{className:"footer",children:["AgentProbe Dashboard · ",e.done,"/",e.total," scenarios"]}),i&&p.jsx(cge,{detail:i,onClose:()=>r(null)})]})}function uve(){const[e,t]=x.useState("detecting"),[n,r]=x.useState(typeof window<"u"?window.location.pathname:"/");return x.useEffect(()=>{const i=()=>r(window.location.pathname);return window.addEventListener("popstate",i),()=>window.removeEventListener("popstate",i)},[]),x.useEffect(()=>{let i=!1;if(window.__AGENTPROBE_LIVE__){t("live");return}return fetch("/api/session",{headers:{accept:"application/json"}}).finally(()=>{i||t("server")}),()=>{i=!0}},[]),e==="detecting"?p.jsx(rf,{label:"Starting dashboard…"}):e==="live"?p.jsx(sve,{}):p.jsx(ove,{})}function cve(){return p.jsx(R$,{children:p.jsx(uve,{})})}const m5=document.getElementById("root");if(!m5)throw new Error("Missing #root element");B6.createRoot(m5).render(p.jsx(x.StrictMode,{children:p.jsx(cve,{})}));
+
diff --git a/dashboard/package.json b/dashboard/package.json
index 5ae9db8..385bf7f 100644
--- a/dashboard/package.json
+++ b/dashboard/package.json
@@ -20,14 +20,16 @@
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"lucide-react": "^1.11.0",
- "react": "^19.1.0",
- "react-dom": "^19.1.0",
"react-markdown": "^10.1.0",
"recharts": "^3.8.1",
"remark-gfm": "^4.0.1",
"tailwind-merge": "^3.5.0",
"tw-animate-css": "^1.4.0"
},
+ "peerDependencies": {
+ "react": "^19.1.0",
+ "react-dom": "^19.1.0"
+ },
"devDependencies": {
"@tailwindcss/vite": "^4.2.4",
"@types/react": "^19.1.6",
diff --git a/dashboard/src/App.tsx b/dashboard/src/App.tsx
index ad48dd7..c134415 100644
--- a/dashboard/src/App.tsx
+++ b/dashboard/src/App.tsx
@@ -31,6 +31,7 @@ import { AveragesTable } from "./components/AveragesTable.tsx";
import { CompareView } from "./components/CompareView.tsx";
import { ConversationView } from "./components/ConversationView.tsx";
import { DetailPanel } from "./components/DetailPanel.tsx";
+import { EvalScoresView, hasEvalScores } from "./components/EvalScoresView.tsx";
import { ProgressBar } from "./components/ProgressBar.tsx";
import { RubricView } from "./components/RubricView.tsx";
import { ScenarioTable } from "./components/ScenarioTable.tsx";
@@ -186,6 +187,14 @@ function scenarioDetail(scenario: ServerScenario): ScenarioDetail {
judge_dimension_scores: (scenario.judgeDimensionScores ?? []).map(
normalizeDimension,
),
+ retrieval_scores: (scenario.retrievalScores ??
+ []) as unknown as ScenarioDetail["retrieval_scores"],
+ demotion_scores: (scenario.demotionScores ??
+ []) as unknown as ScenarioDetail["demotion_scores"],
+ procedure_scores: (scenario.procedureScores ??
+ []) as unknown as ScenarioDetail["procedure_scores"],
+ dedup_scores: (scenario.dedupScores ??
+ []) as unknown as ScenarioDetail["dedup_scores"],
expectations: scenario.expectations,
error: scenario.error,
counts: scenario.counts
@@ -992,6 +1001,14 @@ function ScenarioDetailView({
+ {hasEvalScores(detail) && (
+
+
+ Eval scores
+
+
+
+ )}
>
);
}
@@ -1463,7 +1480,10 @@ export function StartRunView({
onChange={(e) => setRepeat(Number(e.currentTarget.value))}
/>
-
+
-
-
-
+
+
+
+
- {!presetId ? (
+ hint="Run multiple scenarios concurrently. Scenarios still complete in order, but several run at a time (set the limit above)."
+ >
+
+ {!presetId ? (
+
+
+
) : null}
{saveAsPreset && !presetId ? (
diff --git a/dashboard/src/api/types.ts b/dashboard/src/api/types.ts
index 42aa988..1970720 100644
--- a/dashboard/src/api/types.ts
+++ b/dashboard/src/api/types.ts
@@ -46,6 +46,10 @@ export type ServerScenario = {
targetEvents?: Array>;
checkpoints?: Array>;
judgeDimensionScores?: Array>;
+ retrievalScores?: Array>;
+ demotionScores?: Array>;
+ procedureScores?: Array>;
+ dedupScores?: Array>;
expectations?: unknown;
error?: unknown;
counts?: {
diff --git a/dashboard/src/components/DetailPanel.tsx b/dashboard/src/components/DetailPanel.tsx
index b28d8d7..937251c 100644
--- a/dashboard/src/components/DetailPanel.tsx
+++ b/dashboard/src/components/DetailPanel.tsx
@@ -2,6 +2,7 @@ import { useState } from "react";
import { scorePct } from "../helpers.ts";
import type { ScenarioDetail } from "../types.ts";
import { ConversationView } from "./ConversationView.tsx";
+import { EvalScoresView, hasEvalScores } from "./EvalScoresView.tsx";
import { RubricView } from "./RubricView.tsx";
interface Props {
@@ -9,8 +10,11 @@ interface Props {
onClose: () => void;
}
+type TabKey = "conversation" | "rubric" | "evals";
+
export function DetailPanel({ detail, onClose }: Props) {
- const [tab, setTab] = useState<"conversation" | "rubric">("conversation");
+ const evalsVisible = hasEvalScores(detail);
+ const [tab, setTab] = useState("conversation");
const isRunning = detail.status === "running";
const scoreLabel =
@@ -109,14 +113,21 @@ export function DetailPanel({ detail, onClose }: Props) {
>
Rubric
+ {evalsVisible && (
+ setTab("evals")}
+ >
+ Eval scores
+
+ )}
- {tab === "conversation" ? (
-
- ) : (
-
- )}
+ {tab === "conversation" && }
+ {tab === "rubric" && }
+ {tab === "evals" && }
diff --git a/dashboard/src/components/EvalScoresView.tsx b/dashboard/src/components/EvalScoresView.tsx
new file mode 100644
index 0000000..301e6dd
--- /dev/null
+++ b/dashboard/src/components/EvalScoresView.tsx
@@ -0,0 +1,395 @@
+import {
+ CheckCircle2,
+ ListOrdered,
+ Network,
+ Sparkles,
+ Target,
+ XCircle,
+} from "lucide-react";
+import type { ReactNode } from "react";
+import { cn } from "../lib/utils.ts";
+import type {
+ DedupMetricScore,
+ DemotionMetricScore,
+ ProcedureMetricScore,
+ RetrievalMetricScore,
+ ScenarioDetail,
+} from "../types.ts";
+
+interface Props {
+ detail: ScenarioDetail;
+}
+
+type AnyMetric =
+ | RetrievalMetricScore
+ | DemotionMetricScore
+ | ProcedureMetricScore
+ | DedupMetricScore;
+
+function formatNumber(value: number): string { // Formats a number with at most 3 decimals, dropping trailing zeros and a dangling ".".
+ if (!Number.isFinite(value)) return "n/a"; // NaN and +/-Infinity get a textual placeholder
+ return value.toFixed(3).replace(/0+$/, "").replace(/\.$/, ""); // e.g. 0.5 -> "0.500" -> "0.5"; 2 -> "2.000" -> "2." -> "2"
+}
+
+function pct(value: number): string { // Renders a fraction as a whole-number percentage string.
+ if (!Number.isFinite(value)) return "n/a"; // NaN and +/-Infinity get a textual placeholder
+ return `${Math.round(value * 100)}%`; // e.g. 0.256 -> "26%"; the input is not clamped, so values above 1 print above 100%
+}
+
+function MetricBar({ value }: { value: number }) {
+ const clamped = Math.max(0, Math.min(1, value));
+ return (
+
+ );
+}
+
+function PassPill({ passed }: { passed: boolean }) {
+ return passed ? (
+
+
+ Pass
+
+ ) : (
+
+
+ Fail
+
+ );
+}
+
+function SectionLabel({
+ children,
+ count,
+}: {
+ children: ReactNode;
+ count?: number | string;
+}) {
+ return (
+
+
+ {children}
+
+ {count != null && (
+
+ {count}
+
+ )}
+
+
+ );
+}
+
+function MetricCard({ m }: { m: AnyMetric }) {
+ return (
+
+
+ {m.metric}
+
+
+
+
+
+ {formatNumber(m.value)}
+
+
+ ×{formatNumber(m.weight)}
+
+
+ );
+}
+
+function ScorerHeader({
+ icon,
+ title,
+ subtitle,
+ weightedScore,
+ passThreshold,
+ passed,
+ source,
+}: {
+ icon: ReactNode;
+ title: string;
+ subtitle?: string;
+ weightedScore: number;
+ passThreshold: number;
+ passed: boolean;
+ source: string;
+}) {
+ return (
+
+
+
+ {icon}
+
+
+
{title}
+ {subtitle && (
+
{subtitle}
+ )}
+
+
+
+
+
+ {pct(weightedScore)}
+
+ {" "}
+ / {pct(passThreshold)}
+
+
+
+ source: {source}
+
+
+
+
+
+ );
+}
+
+function aggregate(metrics: AnyMetric[]) { // Builds the scorer-level header values (score, threshold, pass flag, source) from per-metric rows.
+ if (metrics.length === 0) { // defensive: the *Section components in this file return null on an empty list before calling this
+ return {
+ weightedScore: 0,
+ passThreshold: 0,
+ passed: false,
+ source: "missing",
+ };
+ }
+ const first = metrics[0]; // only row 0 is read — presumably every row of one scorer repeats the same totals; TODO confirm with the backend
+ return {
+ weightedScore: first.weighted_score,
+ passThreshold: first.pass_threshold,
+ passed: first.passed,
+ source: first.source,
+ };
+}
+
+function RetrievalSection({ metrics }: { metrics: RetrievalMetricScore[] }) {
+ if (metrics.length === 0) return null;
+ const agg = aggregate(metrics);
+ const first = metrics[0];
+ return (
+
+ }
+ title="Retrieval ranking"
+ subtitle={`k=${first?.k ?? "n/a"}, ${first?.hit_count ?? 0}/${first?.total_relevant ?? 0} hits, ${first?.forbidden_hits ?? 0} forbidden`}
+ weightedScore={agg.weightedScore}
+ passThreshold={agg.passThreshold}
+ passed={agg.passed}
+ source={agg.source}
+ />
+
+ {metrics.map((m, i) => (
+
+ ))}
+
+
+ );
+}
+
+function DemotionSection({ metrics }: { metrics: DemotionMetricScore[] }) {
+ if (metrics.length === 0) return null;
+ const agg = aggregate(metrics);
+ const first = metrics[0];
+ const subtitleBits: string[] = [];
+ if (first) {
+ subtitleBits.push(
+ `timestamp violations: ${first.timestamp_violation_count}`,
+ );
+ if (first.cascade_bounded === true) subtitleBits.push("cascade: bounded");
+ else if (first.cascade_bounded === false)
+ subtitleBits.push("cascade: RUNAWAY");
+ }
+ return (
+
+ }
+ title="Demotion correctness"
+ subtitle={subtitleBits.join(" · ")}
+ weightedScore={agg.weightedScore}
+ passThreshold={agg.passThreshold}
+ passed={agg.passed}
+ source={agg.source}
+ />
+
+ {metrics.map((m, i) => (
+
+ ))}
+
+
+ );
+}
+
+function ProcedureSection({ metrics }: { metrics: ProcedureMetricScore[] }) {
+ if (metrics.length === 0) return null;
+ const agg = aggregate(metrics);
+ const first = metrics[0];
+ const predicted = Array.isArray(first?.predicted)
+ ? (first.predicted as string[])
+ : [];
+ const golden = Array.isArray(first?.golden) ? (first.golden as string[]) : [];
+ return (
+
+ }
+ title="Procedure extraction"
+ subtitle={`predicted ${predicted.length} steps · golden ${golden.length} steps`}
+ weightedScore={agg.weightedScore}
+ passThreshold={agg.passThreshold}
+ passed={agg.passed}
+ source={agg.source}
+ />
+
+ {metrics.map((m, i) => (
+
+ ))}
+
+ {(predicted.length > 0 || golden.length > 0) && (
+
+
+
+
+ )}
+
+ );
+}
+
+function StepList({ title, items }: { title: string; items: string[] }) {
+ return (
+
+
+ {title}
+
+ {items.length === 0 ? (
+
(empty)
+ ) : (
+
+ {items.map((step, i) => (
+
+ {i + 1}.
+ {step}
+
+ ))}
+
+ )}
+
+ );
+}
+
+function DedupSection({ metrics }: { metrics: DedupMetricScore[] }) {
+ if (metrics.length === 0) return null;
+ const agg = aggregate(metrics);
+ const first = metrics[0];
+ const predicted = Array.isArray(first?.predicted)
+ ? (first.predicted as string[][])
+ : [];
+ const golden = Array.isArray(first?.golden)
+ ? (first.golden as string[][])
+ : [];
+ return (
+
+ }
+ title="Deduplication"
+ subtitle={`items: ${first?.item_count ?? 0} · predicted ${predicted.length} clusters · golden ${golden.length} clusters`}
+ weightedScore={agg.weightedScore}
+ passThreshold={agg.passThreshold}
+ passed={agg.passed}
+ source={agg.source}
+ />
+
+ {metrics.map((m, i) => (
+
+ ))}
+
+ {(predicted.length > 0 || golden.length > 0) && (
+
+
+
+
+ )}
+
+ );
+}
+
+function ClusterList({
+ title,
+ clusters,
+}: {
+ title: string;
+ clusters: string[][];
+}) {
+ return (
+
+
+ {title}
+
+ {clusters.length === 0 ? (
+
(empty)
+ ) : (
+
+ {clusters.map((cluster, i) => (
+
+ {cluster.join(", ")}
+
+ ))}
+
+ )}
+
+ );
+}
+
+export function EvalScoresView({ detail }: Props) {
+ const retrieval = detail.retrieval_scores ?? [];
+ const demotion = detail.demotion_scores ?? [];
+ const procedure = detail.procedure_scores ?? [];
+ const dedup = detail.dedup_scores ?? [];
+ const totalCount =
+ retrieval.length + demotion.length + procedure.length + dedup.length;
+
+ if (totalCount === 0) {
+ return (
+
+
+
+ No quantitative eval scores
+
+
+ This scenario didn't declare a retrieval, demotion, procedure, or
+ dedup block. Add one to its YAML to get IR-style metrics here.
+
+
+ );
+ }
+
+ return (
+
+ );
+}
+
+export function hasEvalScores(detail: ScenarioDetail): boolean { // True when at least one of the four eval score arrays on `detail` is non-empty.
+ return (
+ (detail.retrieval_scores?.length ?? 0) > 0 || // a missing (undefined) array counts as length 0
+ (detail.demotion_scores?.length ?? 0) > 0 ||
+ (detail.procedure_scores?.length ?? 0) > 0 ||
+ (detail.dedup_scores?.length ?? 0) > 0
+ );
+}
diff --git a/dashboard/src/components/copilot/Markdown.tsx b/dashboard/src/components/copilot/Markdown.tsx
index 38280f4..9c560bb 100644
--- a/dashboard/src/components/copilot/Markdown.tsx
+++ b/dashboard/src/components/copilot/Markdown.tsx
@@ -1,4 +1,4 @@
-import { type ComponentProps, memo } from "react";
+import { type ComponentType, type JSX, memo } from "react";
import ReactMarkdown from "react-markdown";
import remarkGfm from "remark-gfm";
import { cn } from "../../lib/utils.ts";
@@ -8,7 +8,17 @@ export type MarkdownProps = {
className?: string;
};
-const components: ComponentProps["components"] = {
+// Re-type react-markdown's `components` prop against the project's React 19
+// JSX namespace. Upstream react-markdown ships its own bundled React types,
+// which collide with the project's @types/react@19. The component handlers
+// themselves are unchanged — we just remap the JSX intrinsic table.
+type MarkdownComponents = {
+ [Key in keyof JSX.IntrinsicElements]?: ComponentType<
+ JSX.IntrinsicElements[Key]
+ >;
+};
+
+const components: MarkdownComponents = {
p: ({ className, ...props }) => (
["components"] = {
function MarkdownInner({ children, className }: MarkdownProps) {
return (
-
+
{children}
diff --git a/dashboard/src/types.ts b/dashboard/src/types.ts
index ab94f17..c119abe 100644
--- a/dashboard/src/types.ts
+++ b/dashboard/src/types.ts
@@ -55,6 +55,61 @@ export interface DimensionScore {
evidence?: string[];
}
+export interface RetrievalMetricScore { // One metric row of the retrieval scorer, as consumed by EvalScoresView.
+ metric: string; // metric name displayed on the metric card
+ value: number; // displayed through formatNumber
+ weight: number; // displayed as "×<weight>"
+ k: number; // displayed as "k=<k>" in the section subtitle; presumably the ranking cutoff — TODO confirm with the scorer
+ weighted_score: number; // header score, displayed as a percentage
+ pass_threshold: number; // header threshold, displayed as a percentage
+ passed: boolean; // scorer-level pass flag forwarded to the section header
+ total_relevant: number; // denominator of the "<hit_count>/<total_relevant> hits" subtitle
+ total_returned: number; // NOTE(review): not referenced in the visible parts of EvalScoresView
+ hit_count: number; // numerator of the "<hit_count>/<total_relevant> hits" subtitle
+ forbidden_hits: number; // displayed as "<n> forbidden" in the subtitle
+ source: string; // displayed as "source: <source>" in the header
+ returned: unknown; // NOTE(review): payload shape is not visible here — narrow before use
+}
+
+export interface DemotionMetricScore { // One metric row of the demotion scorer, as consumed by EvalScoresView.
+ metric: string; // metric name displayed on the metric card
+ value: number; // displayed through formatNumber
+ weight: number; // displayed as "×<weight>"
+ weighted_score: number; // header score, displayed as a percentage
+ pass_threshold: number; // header threshold, displayed as a percentage
+ passed: boolean; // scorer-level pass flag forwarded to the section header
+ timestamp_violation_count: number; // displayed as "timestamp violations: <n>" in the subtitle
+ cascade_bounded: boolean | null; // true -> "cascade: bounded", false -> "cascade: RUNAWAY", null -> no cascade text
+ source: string; // displayed as "source: <source>" in the header
+ observed: unknown; // NOTE(review): payload shape is not visible here — narrow before use
+ expected: unknown; // NOTE(review): payload shape is not visible here — narrow before use
+}
+
+export interface ProcedureMetricScore { // One metric row of the procedure scorer, as consumed by EvalScoresView.
+ metric: string; // metric name displayed on the metric card
+ value: number; // displayed through formatNumber
+ weight: number; // displayed as "×<weight>"
+ weighted_score: number; // header score, displayed as a percentage
+ pass_threshold: number; // header threshold, displayed as a percentage
+ passed: boolean; // scorer-level pass flag forwarded to the section header
+ source: string; // displayed as "source: <source>" in the header
+ predicted: unknown; // ProcedureSection casts this to string[] when Array.isArray passes, else uses []; elements are not validated
+ golden: unknown; // handled the same way as `predicted`
+}
+
+export interface DedupMetricScore { // One metric row of the dedup scorer, as consumed by EvalScoresView.
+ metric: string; // metric name displayed on the metric card
+ value: number; // displayed through formatNumber
+ weight: number; // displayed as "×<weight>"
+ weighted_score: number; // header score, displayed as a percentage
+ pass_threshold: number; // header threshold, displayed as a percentage
+ passed: boolean; // scorer-level pass flag forwarded to the section header
+ item_count: number; // displayed as "items: <n>" in the subtitle
+ source: string; // displayed as "source: <source>" in the header
+ predicted: unknown; // DedupSection casts this to string[][] (clusters) when Array.isArray passes, else uses []; elements are not validated
+ golden: unknown; // handled the same way as `predicted`
+}
+
export interface ScenarioDetail {
scenario_id: string;
scenario_name: string;
@@ -76,6 +131,10 @@ export interface ScenarioDetail {
target_events?: Array>;
checkpoints?: Checkpoint[];
judge_dimension_scores?: DimensionScore[];
+ retrieval_scores?: RetrievalMetricScore[];
+ demotion_scores?: DemotionMetricScore[];
+ procedure_scores?: ProcedureMetricScore[];
+ dedup_scores?: DedupMetricScore[];
expectations?: unknown;
error?: unknown;
counts?: {
diff --git a/dashboard/src/views/PresetEditorView.tsx b/dashboard/src/views/PresetEditorView.tsx
index 3c9a2a0..ba81591 100644
--- a/dashboard/src/views/PresetEditorView.tsx
+++ b/dashboard/src/views/PresetEditorView.tsx
@@ -311,7 +311,10 @@ export function PresetEditorView({
onChange={(e) => setRepeat(Number(e.currentTarget.value))}
/>
-
+
-
+
diff --git a/data/dream-validation.yaml b/data/dream-validation.yaml
new file mode 100644
index 0000000..24dd9f0
--- /dev/null
+++ b/data/dream-validation.yaml
@@ -0,0 +1,436 @@
+version: "1.0"
+id: "dream-validation-v1"
+name: "Dream-System Validation Pack (P-1 -> P2)"
+
+defaults:
+ max_turns: 6
+ timeout_seconds: 30
+ category: "Dream"
+ persona: smb-founder
+ rubric: multi-session-memory
+ user_name: "Jordan Rivera"
+ copilot_mode: "fast"
+
+# Dream-system validation pack.
+#
+# This pack pairs the existing conversational rubric with three new
+# quantitative scorers — `demotion`, `procedure`, `dedup` — that exercise
+# the dream-system roadmap items from `dream/TODO.md`:
+#
+# - P-1.3 retract-vs-soft-delete split (Snodgrass bi-temporal)
+# - P0.3a stale-fact deprecation
+# - P0.3b scoped cascading expiry (single-hop discipline)
+# - P1 procedure synthesis + Save-as-Skill
+# - P2 memory dedup / near-duplicate merge
+#
+# Each scenario carries one of the new scorer blocks plus a fixture
+# under `data/fixtures/dream/` so the scorer math is exercised offline.
+# When AutoGPT's dream pass starts emitting structured payloads inline
+# on the chat-stream response, swap `source.fixture` for
+# `source.raw_exchange_key` and the same scenarios drive live runs.
+
+scenarios:
+
+ # =========================================================================
+ # P-1.3 — retract sets only expired_at (Snodgrass system retraction)
+ # =========================================================================
+ - id: dream-demotion-retract-discipline
+ name: "Demotion: user-initiated forget sets expired_at only (not invalid_at)"
+ description: |
+ Validates `_retract_edges` from P-1.3: a user-initiated forget is a
+ system retraction (only `expired_at`), not a world change
+ (`invalid_at` must remain null). Conflating the two is the Snodgrass
+ bi-temporal bug the audit flagged.
+ tags: [dream, demotion, snodgrass, p_minus_1]
+ priority: critical
+ rubric: memory-abstention
+
+ context:
+ system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations."
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Forget that I mentioned the Q2 budget."
+
+ expectations:
+ expected_behavior: |
+ The forget operation flips `expired_at` on the budget edge but
+ leaves `invalid_at` null — that fact didn't stop being true, the
+ system just stopped tracking it.
+ expected_outcome: resolved
+ failure_modes:
+ - snodgrass_violation: Both expired_at and invalid_at were set
+ - missed_demotion: The budget edge was not touched at all
+ - wrong_edge: A different edge was retracted
+
+ demotion:
+ expected_demotions: ["edge-budget-q2"]
+ expected_retracts: ["edge-budget-q2"]
+ pass_threshold: 0.6
+ weight:
+ set_precision: 1.0
+ set_recall: 1.0
+ set_f1: 1.0
+ timestamp_discipline: 3.0
+ cascade_bounded: 0.0
+ cascade_direct_f1: 0.0
+ source:
+ fixture: fixtures/dream/demotion-snodgrass-retract.json
+
+ # =========================================================================
+ # P-1.3 negative — retract that *also* set invalid_at (the bug)
+ # =========================================================================
+ - id: dream-demotion-snodgrass-violation
+ name: "Demotion: a retract that sets invalid_at is a hard fail"
+ description: |
+ Negative-test sibling: the fixture deliberately violates the
+ Snodgrass discipline (both timestamps set). The scorer must flag
+ this and force a fail regardless of the set-membership score.
+ tags: [dream, demotion, snodgrass, negative, p_minus_1]
+ priority: critical
+ rubric: memory-abstention
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Forget that I mentioned the Q2 budget."
+
+ expectations:
+ expected_behavior: |
+ This scenario exists to prove the scorer flags a Snodgrass
+ violation. The fixture is intentionally wrong; the scenario is
+ expected to FAIL with `timestampViolationCount > 0`.
+ expected_outcome: resolved
+ failure_modes:
+ - false_positive: Scorer marks this as passing despite the violation
+ - missed_violation: timestampViolationCount stays at 0
+
+ demotion:
+ expected_demotions: ["edge-budget-q2"]
+ expected_retracts: ["edge-budget-q2"]
+ pass_threshold: 0.6
+ weight:
+ set_precision: 1.0
+ set_recall: 1.0
+ set_f1: 1.0
+ timestamp_discipline: 3.0
+ cascade_bounded: 0.0
+ cascade_direct_f1: 0.0
+ source:
+ fixture: fixtures/dream/demotion-snodgrass-violation.json
+
+ # =========================================================================
+ # P0.3a — stale fact demotion
+ # =========================================================================
+ - id: dream-demotion-stale-fact
+ name: "Demotion: stale pricing flagged by dream pass is correctly demoted"
+ description: |
+ Validates P0.3a stale-fact deprecation. The dream pass identified
+ that old pricing was stale; the demotion sets the expected edge
+ to status='superseded' with expired_at-only timestamps.
+ tags: [dream, demotion, stale, p0_3a]
+ priority: high
+ rubric: memory-temporal
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "What's our current pricing?"
+
+ expectations:
+ expected_behavior: |
+ The dream pass flipped the old pricing edge to superseded. The
+ edge is gone from active search results but lives in the audit
+ trail.
+ expected_outcome: resolved
+ failure_modes:
+ - missed: Stale edge was not demoted
+ - over_demoted: Active edges were demoted alongside
+
+ demotion:
+ expected_demotions: ["edge-pricing-old"]
+ pass_threshold: 0.6
+ source:
+ fixture: fixtures/dream/demotion-stale-fact.json
+
+ # =========================================================================
+ # P0.3b — cascading expiry (bounded to 1-hop, no runaway)
+ # =========================================================================
+ - id: dream-demotion-cascade-bounded
+ name: "Demotion: invalidating an entity touches direct neighbors only"
+ description: |
+ Validates P0.3b cascading expiry. When the NorthStar lead is
+ invalidated, the cascade touches only the entity's direct
+ attachments (Marcus, CTO role) and not adjacent infrastructure
+ (HubSpot CRM). This is the single-hop discipline the spec
+ explicitly calls out.
+ tags: [dream, demotion, cascade, p0_3b]
+ priority: critical
+ rubric: memory-hygiene
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "The NorthStar deal fell through. Drop them from our system."
+
+ expectations:
+ expected_behavior: |
+ Marcus and the CTO role edge are retracted. HubSpot, fiscal year,
+ and invoicing rule edges stay active. The cascade is bounded.
+ expected_outcome: resolved
+ failure_modes:
+ - runaway_cascade: HubSpot (or other 2+ hop edges) were also touched
+ - under_cascade: Only one of the two direct-neighbor edges was touched
+
+ demotion:
+ expected_demotions: ["edge-northstar-marcus", "edge-northstar-cto"]
+ cascade:
+ expected_direct_neighbors:
+ - "edge-northstar-marcus"
+ - "edge-northstar-cto"
+ tangential_edges:
+ - "edge-hubspot-crm"
+ - "edge-fiscal-year"
+ - "edge-invoicing-1st"
+ pass_threshold: 0.6
+ source:
+ fixture: fixtures/dream/demotion-cascade-bounded.json
+
+ # =========================================================================
+ # P0.3b negative — runaway cascade (the bug to prevent)
+ # =========================================================================
+ - id: dream-demotion-cascade-runaway
+ name: "Demotion: cascade that touched a 2-hop edge is a hard fail"
+ description: |
+ Negative test: fixture shows the cascade touching HubSpot (which is
+ 2+ hops away from NorthStar via the user). Per p0-spec.md §4 this
+ is the runaway-demotion bug that single-hop discipline exists to
+ prevent. The scorer MUST flag this as a hard fail on its own, not rely on the set_precision drop.
+ tags: [dream, demotion, cascade, negative, p0_3b]
+ priority: critical
+ rubric: memory-hygiene
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "The NorthStar deal fell through. Drop them from our system."
+
+ expectations:
+ expected_behavior: |
+ The fixture intentionally shows a runaway cascade. The scenario
+ is expected to FAIL with cascadeBounded=false.
+ expected_outcome: resolved
+ failure_modes:
+ - false_positive: Scorer marks this as passing despite the runaway
+
+ demotion:
+ expected_demotions: ["edge-northstar-marcus", "edge-northstar-cto"]
+ cascade:
+ expected_direct_neighbors:
+ - "edge-northstar-marcus"
+ - "edge-northstar-cto"
+ tangential_edges:
+ - "edge-hubspot-crm"
+ - "edge-fiscal-year"
+ - "edge-invoicing-1st"
+ pass_threshold: 0.6
+ source:
+ fixture: fixtures/dream/demotion-cascade-runaway.json
+
+ # =========================================================================
+ # P1 — procedure synthesis: weekly report workflow
+ # =========================================================================
+ - id: dream-procedure-weekly-report
+ name: "Procedure: dream pass extracts the weekly-report workflow"
+ description: |
+ Validates P1 procedure synthesis. After three weeks of the user
+ doing the same weekly-report sequence, the dream pass should
+ extract a ProcedureMemory with the four canonical steps in order
+ plus two parameters (recipient list, week window).
+ tags: [dream, procedure, p1]
+ priority: high
+ rubric: multi-session-memory
+
+ sessions:
+ - id: s1-week-1
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Pull this week's metrics from analytics, summarize per channel, then draft an email to the leadership list. Send when ready."
+
+ - id: s2-week-2
+ time_offset: "168h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Same weekly report. Pull this week's numbers, channel breakdown, email to leadership."
+
+ - id: s3-week-3
+ time_offset: "336h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Run the weekly report again."
+
+ expectations:
+ expected_behavior: |
+ Dream pass extracts a ProcedureMemory with the four ordered steps
+ and the two recurring parameters. Step order matters: pull
+ before summarize before draft before send.
+ expected_outcome: resolved
+ failure_modes:
+ - missing_step: Procedure has < 4 steps
+ - wrong_order: Steps present but in the wrong order
+ - missing_parameters: recipient_list or week_window not captured
+
+ procedure:
+ golden_steps:
+ - "pull metrics from analytics"
+ - "summarize per channel"
+ - "draft email to stakeholders"
+ - "send"
+ golden_parameters:
+ - "recipient_list"
+ - "week_window"
+ pass_threshold: 0.7
+ weight:
+ step_coverage: 1.0
+ step_order: 2.0
+ parameter_coverage: 1.0
+ source:
+ fixture: fixtures/dream/procedure-weekly-report.json
+
+ # =========================================================================
+ # P1 — procedure synthesis: client onboarding
+ # =========================================================================
+ - id: dream-procedure-client-onboarding
+ name: "Procedure: dream pass extracts the client-onboarding workflow"
+ description: |
+ Second P1 procedure scenario: a different repeated workflow (client
+ onboarding) with its own step set and parameters. Validates the
+ procedure scorer against a non-degenerate-similarity case.
+ tags: [dream, procedure, p1]
+ priority: high
+ rubric: multi-session-memory
+
+ sessions:
+ - id: s1-client-a
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "New client onboarding: create their folder in the workspace, draft welcome email, schedule kickoff call, share the onboarding doc."
+
+ - id: s2-client-b
+ time_offset: "72h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Onboarding another new client. Same steps."
+
+ expectations:
+ expected_behavior: |
+ Procedure has all four steps in order with the two parameters
+ (client name, kickoff date).
+ expected_outcome: resolved
+ failure_modes:
+ - missing_step: < 4 steps captured
+ - wrong_order: Steps present but reordered
+ - missing_parameters: client_name or kickoff_date not captured
+
+ procedure:
+ golden_steps:
+ - "create folder in workspace"
+ - "draft welcome email"
+ - "schedule kickoff call"
+ - "share onboarding doc"
+ golden_parameters:
+ - "client_name"
+ - "kickoff_date"
+ pass_threshold: 0.7
+ source:
+ fixture: fixtures/dream/procedure-onboarding.json
+
+ # =========================================================================
+ # P2 — dedup: clean near-duplicate merge
+ # =========================================================================
+ - id: dream-dedup-near-duplicates
+ name: "Dedup: near-duplicate Sarah-billing facts are merged"
+ description: |
+ Validates P2 memory dedup. Two near-duplicate facts about Sarah's
+ billing role ("Sarah moved to billing" vs "Sarah is on the billing
+ team") should be merged into one cluster; HubSpot and fiscal-year
+ facts stay as singletons.
+ tags: [dream, dedup, p2]
+ priority: high
+ rubric: multi-session-memory
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Show me what you know."
+
+ expectations:
+ expected_behavior: |
+ Dedup pass clusters the two Sarah-billing facts together. ARI
+ approaches 1.0; pairwise F1 = 1.0.
+ expected_outcome: resolved
+ failure_modes:
+ - under_merge: Each Sarah-billing fact stayed in its own cluster
+ - over_merge: Sarah-billing got merged with an unrelated fact
+
+ dedup:
+ golden_clusters:
+ - ["fact-sarah-billing-1", "fact-sarah-billing-2"]
+ - ["fact-hubspot-crm-1"]
+ - ["fact-fiscal-year-1"]
+ pass_threshold: 0.7
+ source:
+ fixture: fixtures/dream/dedup-near-duplicates.json
+
+ # =========================================================================
+ # P2 negative — dedup that over-merges (false positive)
+ # =========================================================================
+ - id: dream-dedup-false-positive
+ name: "Dedup: false-positive merge (Sarah-billing + Sarah-manager) fails"
+ description: |
+ Negative test: fixture over-merges by sweeping a Sarah-manager
+ fact into the Sarah-billing cluster. ARI drops; pairwise precision
+ drops; the scenario is expected to FAIL.
+ tags: [dream, dedup, negative, p2]
+ priority: high
+ rubric: multi-session-memory
+
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Show me what you know."
+
+ expectations:
+ expected_behavior: |
+ Fixture intentionally has an over-merge. Scenario is expected to
+ FAIL on the dedup score.
+ expected_outcome: resolved
+ failure_modes:
+ - false_positive: Scorer marked the over-merge as passing
+
+ dedup:
+ golden_clusters:
+ - ["fact-sarah-billing-1", "fact-sarah-billing-2"]
+ - ["fact-sarah-manager-1"]
+ - ["fact-hubspot-crm-1"]
+ pass_threshold: 0.7
+ source:
+ fixture: fixtures/dream/dedup-false-positive.json
diff --git a/data/fixtures/dream/dedup-false-positive.json b/data/fixtures/dream/dedup-false-positive.json
new file mode 100644
index 0000000..16a52b9
--- /dev/null
+++ b/data/fixtures/dream/dedup-false-positive.json
@@ -0,0 +1,10 @@
+{
+ "clusters": [
+ [
+ "fact-sarah-billing-1",
+ "fact-sarah-billing-2",
+ "fact-sarah-manager-1"
+ ],
+ ["fact-hubspot-crm-1"]
+ ]
+}
diff --git a/data/fixtures/dream/dedup-near-duplicates.json b/data/fixtures/dream/dedup-near-duplicates.json
new file mode 100644
index 0000000..1a154c4
--- /dev/null
+++ b/data/fixtures/dream/dedup-near-duplicates.json
@@ -0,0 +1,7 @@
+{
+ "clusters": [
+ ["fact-sarah-billing-1", "fact-sarah-billing-2"],
+ ["fact-hubspot-crm-1"],
+ ["fact-fiscal-year-1"]
+ ]
+}
diff --git a/data/fixtures/dream/demotion-cascade-bounded.json b/data/fixtures/dream/demotion-cascade-bounded.json
new file mode 100644
index 0000000..5cfc44b
--- /dev/null
+++ b/data/fixtures/dream/demotion-cascade-bounded.json
@@ -0,0 +1,8 @@
+{
+ "observed": ["edge-northstar-marcus", "edge-northstar-cto"],
+ "cascade_touched": ["edge-northstar-marcus", "edge-northstar-cto"],
+ "retract_actions": [
+ { "uuid": "edge-northstar-marcus", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" },
+ { "uuid": "edge-northstar-cto", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" }
+ ]
+}
diff --git a/data/fixtures/dream/demotion-cascade-runaway.json b/data/fixtures/dream/demotion-cascade-runaway.json
new file mode 100644
index 0000000..5f0c0e5
--- /dev/null
+++ b/data/fixtures/dream/demotion-cascade-runaway.json
@@ -0,0 +1,13 @@
+{
+ "observed": ["edge-northstar-marcus", "edge-northstar-cto", "edge-hubspot-crm"],
+ "cascade_touched": [
+ "edge-northstar-marcus",
+ "edge-northstar-cto",
+ "edge-hubspot-crm"
+ ],
+ "retract_actions": [
+ { "uuid": "edge-northstar-marcus", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" },
+ { "uuid": "edge-northstar-cto", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" },
+ { "uuid": "edge-hubspot-crm", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" }
+ ]
+}
diff --git a/data/fixtures/dream/demotion-snodgrass-retract.json b/data/fixtures/dream/demotion-snodgrass-retract.json
new file mode 100644
index 0000000..7debae7
--- /dev/null
+++ b/data/fixtures/dream/demotion-snodgrass-retract.json
@@ -0,0 +1,6 @@
+{
+ "observed": ["edge-budget-q2"],
+ "retract_actions": [
+ { "uuid": "edge-budget-q2", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" }
+ ]
+}
diff --git a/data/fixtures/dream/demotion-snodgrass-violation.json b/data/fixtures/dream/demotion-snodgrass-violation.json
new file mode 100644
index 0000000..241f69b
--- /dev/null
+++ b/data/fixtures/dream/demotion-snodgrass-violation.json
@@ -0,0 +1,11 @@
+{
+ "observed": ["edge-budget-q2"],
+ "retract_actions": [
+ {
+ "uuid": "edge-budget-q2",
+ "expired_at_set": true,
+ "invalid_at_set": true,
+ "status": "superseded"
+ }
+ ]
+}
diff --git a/data/fixtures/dream/demotion-stale-fact.json b/data/fixtures/dream/demotion-stale-fact.json
new file mode 100644
index 0000000..dcd2abe
--- /dev/null
+++ b/data/fixtures/dream/demotion-stale-fact.json
@@ -0,0 +1,11 @@
+{
+ "observed": ["edge-pricing-old"],
+ "retract_actions": [
+ {
+ "uuid": "edge-pricing-old",
+ "expired_at_set": true,
+ "invalid_at_set": false,
+ "status": "superseded"
+ }
+ ]
+}
diff --git a/data/fixtures/dream/procedure-onboarding.json b/data/fixtures/dream/procedure-onboarding.json
new file mode 100644
index 0000000..e2d0bce
--- /dev/null
+++ b/data/fixtures/dream/procedure-onboarding.json
@@ -0,0 +1,9 @@
+{
+ "steps": [
+ "create folder in workspace",
+ "draft welcome email",
+ "schedule kickoff call",
+ "share onboarding doc"
+ ],
+ "parameters": ["client_name", "kickoff_date"]
+}
diff --git a/data/fixtures/dream/procedure-weekly-report.json b/data/fixtures/dream/procedure-weekly-report.json
new file mode 100644
index 0000000..f829f0c
--- /dev/null
+++ b/data/fixtures/dream/procedure-weekly-report.json
@@ -0,0 +1,9 @@
+{
+ "steps": [
+ "pull metrics from analytics",
+ "summarize per channel",
+ "draft email to stakeholders",
+ "send"
+ ],
+ "parameters": ["recipient_list", "week_window"]
+}
diff --git a/data/fixtures/retrieval/cascade-after-northstar-invalidated.json b/data/fixtures/retrieval/cascade-after-northstar-invalidated.json
new file mode 100644
index 0000000..d6c0073
--- /dev/null
+++ b/data/fixtures/retrieval/cascade-after-northstar-invalidated.json
@@ -0,0 +1,17 @@
+[
+ {
+ "id": "internal-process-invoicing",
+ "label": "Standing rule: invoices on the 1st",
+ "entity": "policy"
+ },
+ {
+ "id": "hubspot-config",
+ "label": "HubSpot is the CRM",
+ "entity": "tool"
+ },
+ {
+ "id": "fiscal-year",
+ "label": "Fiscal year ends in March",
+ "entity": "policy"
+ }
+]
diff --git a/data/fixtures/retrieval/forget-budget-after.json b/data/fixtures/retrieval/forget-budget-after.json
new file mode 100644
index 0000000..0b1a6f3
--- /dev/null
+++ b/data/fixtures/retrieval/forget-budget-after.json
@@ -0,0 +1,14 @@
+[
+ {
+ "id": "fact-marketing-channel-plan",
+ "label": "Marketing channel plan: balance paid and organic"
+ },
+ {
+ "id": "fact-q2-sales-cycle",
+ "label": "Q2 sales cycle averages six weeks"
+ },
+ {
+ "id": "fact-finance-runway",
+ "label": "Runway through Q3 funded from existing revenue"
+ }
+]
diff --git a/data/fixtures/retrieval/scope-project-atlas.json b/data/fixtures/retrieval/scope-project-atlas.json
new file mode 100644
index 0000000..6bbba1f
--- /dev/null
+++ b/data/fixtures/retrieval/scope-project-atlas.json
@@ -0,0 +1,17 @@
+[
+ {
+ "id": "atlas-spec-doc",
+ "label": "Atlas project specification: v2 metrics overhaul",
+ "scope": "project:atlas"
+ },
+ {
+ "id": "atlas-status",
+ "label": "Atlas status: green, on track for Q3",
+ "scope": "project:atlas"
+ },
+ {
+ "id": "atlas-owner",
+ "label": "Atlas owner: Sarah",
+ "scope": "project:atlas"
+ }
+]
diff --git a/data/fixtures/retrieval/stale-fact-supersession.json b/data/fixtures/retrieval/stale-fact-supersession.json
new file mode 100644
index 0000000..7f9afd8
--- /dev/null
+++ b/data/fixtures/retrieval/stale-fact-supersession.json
@@ -0,0 +1,20 @@
+[
+ {
+ "id": "fact-pricing-current",
+ "label": "Pricing: $79/seat starter, $129/seat pro (current)",
+ "status": "active",
+ "valid_at": "2026-04-01"
+ },
+ {
+ "id": "fact-roadmap-current",
+ "label": "Roadmap: shipping the metrics overhaul this quarter",
+ "status": "active",
+ "valid_at": "2026-04-15"
+ },
+ {
+ "id": "fact-pricing-old",
+ "label": "Pricing: $49/seat flat (old, superseded)",
+ "status": "superseded",
+ "valid_at": "2025-09-01"
+ }
+]
diff --git a/data/fixtures/retrieval/warm-context-sarah.json b/data/fixtures/retrieval/warm-context-sarah.json
new file mode 100644
index 0000000..75f1d81
--- /dev/null
+++ b/data/fixtures/retrieval/warm-context-sarah.json
@@ -0,0 +1,27 @@
+[
+ {
+ "id": "fact-1",
+ "label": "Sarah's email: sarah@acme.co",
+ "scope": "personal"
+ },
+ {
+ "id": "fact-2",
+ "label": "Atlas project status: green, shipping Q3",
+ "scope": "project:atlas"
+ },
+ {
+ "id": "fact-3",
+ "label": "HubSpot is the CRM",
+ "scope": "personal"
+ },
+ {
+ "id": "fact-4",
+ "label": "Marcus Lee, marcus@northstar.io, CTO",
+ "scope": "personal"
+ },
+ {
+ "id": "fact-5",
+ "label": "Standing rule: invoices on the 1st",
+ "scope": "personal"
+ }
+]
diff --git a/data/retrieval-memory.yaml b/data/retrieval-memory.yaml
new file mode 100644
index 0000000..854976c
--- /dev/null
+++ b/data/retrieval-memory.yaml
@@ -0,0 +1,398 @@
+version: "1.0"
+id: "retrieval-memory-v1"
+name: "Retrieval-scored Memory Evaluation"
+
+defaults:
+ max_turns: 6
+ timeout_seconds: 30
+ category: "Memory"
+ persona: smb-founder
+ rubric: multi-session-memory
+ user_name: "Jordan Rivera"
+ copilot_mode: "fast"
+
+# Retrieval-scored memory pack.
+#
+# This pack pairs the existing conversational rubric with a quantitative
+# ranking scorer that grades the top-k of a returned memory list against a
+# curated golden set. Each scenario carries a `retrieval:` block that
+# declares:
+# - `golden`: the items the memory system should surface
+# - `forbidden` (optional): items that MUST NOT be in the top-k; a forbidden
+# hit forces a fail regardless of weighted score (used for forget,
+# scope-filter, and demotion-correctness probes)
+# - `k`: rank cutoff
+# - `weight`: per-metric weights for the weighted aggregate
+# - `pass_threshold`: pass cutoff on the weighted aggregate
+# - `match`: comparison policy (substring | exact | regex)
+# - `source`: where to read the returned items at scoring time
+# * `raw_exchange_key`: read from `last_reply.rawExchange[<key>]`
+# when the adapter returns retrieval payloads inline (production
+# AutoGPT-backed runs)
+# * `fixture`: load a JSON file relative to this YAML (offline mode,
+# exercising the scorer math against a known graph snapshot)
+#
+# Scenarios use fixtures here so they can be run against the scorer math
+# without a live AutoGPT backend. Swap `source.fixture` for `source.raw_exchange_key`
+# when running against a backend that emits retrieval payloads inline.
+
+scenarios:
+
+ # =========================================================================
+ # 1. Forget-on-request (ranking sibling of mem-negative-forget-on-request)
+ # =========================================================================
+ - id: mem-retrieval-forget-on-request
+ name: "Retrieval: explicit forget command removes the fact from top-k"
+ description: |
+ Ranking-scored sibling of `mem-negative-forget-on-request` in
+ multi-session-memory.yaml (line 1710). Where the parent scenario uses
+ the LLM-judge to assert that the assistant doesn't return $50K, this
+ scenario asserts directly that the forbidden item is not in the
+ top-k of the retrieval response.
+ tags: [multi_session, memory, negative, retrieval]
+ priority: high
+ rubric: memory-abstention
+
+ context:
+ system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations."
+
+ sessions:
+ - id: s1-teach
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Quick planning note: our budget for the Q2 marketing push is $50K. Help me sketch out how to allocate it across channels."
+
+ - id: s2-forget
+ time_offset: "48h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Actually, scratch what I said about the Q2 marketing budget. I don't want that number floating around in your memory. Forget it."
+
+ - id: s3-probe
+ time_offset: "120h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "What's our Q2 marketing budget?"
+
+ expectations:
+ expected_behavior: |
+ Agent's retrieval response in S3 must not include the $50K figure or
+ the Q2 marketing budget fact. Other adjacent marketing-related
+ memories may still surface.
+ expected_outcome: resolved
+ failure_modes:
+ - forget_ignored: $50K appears in the top-k
+ - forget_acknowledged_but_leaked: $50K appears in the top-k even though the assistant text says it was forgotten (proves forget didn't clear storage -- worst failure)
+ - fabrication: A different number appears in the top-k
+ tester_note: |
+ Pair this with the existing `mem-negative-forget-on-request` judge
+ scenario in multi-session-memory.yaml. The judge checks the agent's
+ text reply; this scenario checks the retrieval payload directly.
+
+ retrieval:
+ golden:
+ - "Marketing channel plan"
+ - "Q2 sales cycle"
+ forbidden:
+ - "$50K"
+ - "Q2 marketing budget"
+ k: 5
+ pass_threshold: 0.2
+ match: substring
+ weight:
+ precision_at_k: 0.5
+ recall_at_k: 0.5
+ mrr: 1.0
+ ndcg_at_k: 1.0
+ source:
+ fixture: fixtures/retrieval/forget-budget-after.json
+
+ # =========================================================================
+ # 2. Warm-context relevance — given a query, does the right set surface?
+ # =========================================================================
+ - id: mem-retrieval-warm-context-sarah
+ name: "Retrieval: warm-context relevance for a typical knowledge query"
+ description: |
+ Given a working session that establishes Sarah, the Atlas project, the
+ CRM choice, and several incidental facts, a follow-up query about
+ Sarah should surface her email and the Atlas project status near the
+ top of the returned set. Tangential facts (invoicing rule, contact
+ info) may co-occur lower; this is graded by NDCG@k.
+ tags: [multi_session, memory, retention, retrieval]
+ priority: high
+ rubric: multi-session-memory
+
+ context:
+ system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations."
+
+ sessions:
+ - id: s1-seed
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Quick housekeeping: my assistant is Sarah -- CC her on client stuff. Our CRM is HubSpot, our standing rule is invoices on the 1st, and Atlas is our flagship project (status: green, shipping Q3). New contact: Marcus Lee <marcus@northstar.io>, NorthStar, CTO."
+
+ - id: s2-probe
+ time_offset: "48h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "What do you have on Sarah and the Atlas project?"
+
+ expectations:
+ expected_behavior: |
+ Sarah's email and Atlas project status appear in the top-k of the
+ retrieval response. Adjacent identity facts may co-surface, but the
+ two gold items should dominate the early ranks.
+ expected_outcome: resolved
+ failure_modes:
+ - partial_recall: Only one of the two gold facts surfaces
+ - missing: Neither gold fact surfaces (retention failure)
+ - poor_ranking: Gold facts present but buried below tangential ones (low NDCG)
+
+ retrieval:
+ golden:
+ - "Sarah's email"
+ - "Atlas project status"
+ k: 5
+ pass_threshold: 0.5
+ match: substring
+ weight:
+ precision_at_k: 1.0
+ recall_at_k: 1.0
+ mrr: 1.0
+ ndcg_at_k: 1.0
+ source:
+ fixture: fixtures/retrieval/warm-context-sarah.json
+
+ # =========================================================================
+ # 3. Stale-fact demotion — superseded fact must not appear in top-k
+ # =========================================================================
+ - id: mem-retrieval-stale-fact-demotion
+ name: "Retrieval: superseded pricing must not appear in top-k"
+ description: |
+ Pricing is updated mid-flight; the old pricing should be demoted
+ (marked `superseded`) and excluded from retrieval. The new pricing
+ and the current roadmap should surface. This is the dream-pass
+ "stale-fact deprecation" probe expressed as a ranking assertion.
+ tags: [multi_session, memory, temporal, retrieval]
+ priority: high
+ rubric: memory-temporal
+
+ context:
+ system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations."
+
+ sessions:
+ - id: s1-old-pricing
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Quick reference for proposals: our pricing is $49/seat flat. Use that when drafting deals."
+
+ - id: s2-pricing-update
+ time_offset: "240h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Pricing change: we're moving to tiered pricing. $79/seat starter, $129/seat pro. Use the new tiers from here forward."
+
+ - id: s3-probe
+ time_offset: "480h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "What's our current pricing for new deals?"
+
+ expectations:
+ expected_behavior: |
+ Retrieval surfaces only the new tiered pricing. The old $49/seat
+ figure should be demoted (superseded) and absent from the top-k.
+ expected_outcome: resolved
+ failure_modes:
+ - stale_returned: Old $49/seat figure appears in top-k
+ - both_returned: Both old and new appear (no demotion happened)
+ - missing: Neither appears (retention regression)
+
+ retrieval:
+ golden:
+ - "$79/seat starter, $129/seat pro"
+ forbidden:
+ - "$49/seat"
+ k: 3
+ pass_threshold: 0.5
+ match: substring
+ weight:
+ precision_at_k: 1.0
+ recall_at_k: 1.0
+ mrr: 1.0
+ ndcg_at_k: 1.0
+ source:
+ fixture: fixtures/retrieval/stale-fact-supersession.json
+
+ # =========================================================================
+ # 4. Scope filtering — project:atlas query must not surface project:other
+ # =========================================================================
+ - id: mem-retrieval-scope-filter-project
+ name: "Retrieval: scope-filtered query stays inside its scope"
+ description: |
+ A query scoped to project:atlas should surface only Atlas memories;
+ memories scoped to other projects (or personal scope) must not
+ appear in the top-k. Validates typed-edge filtering in the dream
+ memory graph.
+ tags: [multi_session, memory, retrieval, scope]
+ priority: high
+ rubric: memory-crossdomain
+
+ context:
+ system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations."
+ injected_data:
+ query_scope: "project:atlas"
+
+ sessions:
+ - id: s1-mixed-context
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Working notes: Atlas project is green (v2 metrics overhaul). Sarah owns it. By the way, the Beacon project is on pause and the standard invoicing rule is the 1st of the month."
+
+ - id: s2-scoped-probe
+ time_offset: "72h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Tell me about the Atlas project specifically -- only Atlas, not anything else."
+
+ expectations:
+ expected_behavior: |
+ Retrieval returns only Atlas-scoped facts. Beacon, invoicing rule,
+ and other unrelated facts must not appear in the top-k.
+ expected_outcome: resolved
+ failure_modes:
+ - scope_leak_other_project: Beacon or another project's memory appears
+ - scope_leak_personal: Personal-scope memory appears in a project-scoped query
+ - missing: No Atlas facts surface
+
+ retrieval:
+ golden:
+ - "Atlas project specification"
+ - "Atlas status"
+ - "Atlas owner"
+ forbidden:
+ - "Beacon"
+ - "invoices on the 1st"
+ k: 3
+ pass_threshold: 0.5
+ match: substring
+ weight:
+ precision_at_k: 1.0
+ recall_at_k: 1.0
+ mrr: 1.0
+ ndcg_at_k: 1.0
+ source:
+ fixture: fixtures/retrieval/scope-project-atlas.json
+
+ # =========================================================================
+ # 5. Cascading expiry — invalidated entity removes its facts, not the
+ # tangentially-related ones
+ # =========================================================================
+ - id: mem-retrieval-cascading-expiry
+ name: "Retrieval: invalidating an entity removes its facts but not adjacent ones"
+ description: |
+ The user retracts the NorthStar lead (entity invalidated). Retrieval
+ after the retract must NOT surface Marcus Lee or any NorthStar fact,
+ but adjacent operational memory (CRM choice, fiscal year, standing
+ invoicing rule) should remain. Validates that the cascade is bounded
+ to the entity's own facts.
+ tags: [multi_session, memory, retrieval, cascading]
+ priority: medium
+ rubric: memory-hygiene
+
+ context:
+ system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations."
+
+ sessions:
+ - id: s1-seed-northstar
+ time_offset: "0h"
+ reset: none
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Add a new contact: Marcus Lee <marcus@northstar.io>, NorthStar, CTO. Also: HubSpot is our CRM, our fiscal year ends in March, and invoices go out on the 1st."
+
+ - id: s2-retract-entity
+ time_offset: "72h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Actually, scratch NorthStar entirely. The deal fell through, the lead's dead, please retract everything we have on them or Marcus."
+
+ - id: s3-probe-adjacent
+ time_offset: "168h"
+ reset: fresh_agent
+ max_turns: 1
+ turns:
+ - role: user
+ use_exact_message: true
+ content: "Refresh me on our operational basics -- CRM, fiscal year, billing cadence."
+
+ expectations:
+ expected_behavior: |
+ Retrieval in S3 surfaces operational adjacent facts (HubSpot,
+ fiscal year, invoicing rule). NorthStar and Marcus Lee facts are
+ absent. The cascade should not have removed adjacent unrelated
+ memory.
+ expected_outcome: resolved
+ failure_modes:
+ - cascade_too_wide: Adjacent facts (HubSpot, fiscal year, invoicing) missing -- cascade swept too much
+ - cascade_too_narrow: Marcus Lee or NorthStar still appears -- cascade didn't reach attached facts
+ - retain_after_retract: NorthStar entity still listed as active
+
+ retrieval:
+ golden:
+ - "HubSpot is the CRM"
+ - "Fiscal year ends in March"
+ - "invoices on the 1st"
+ forbidden:
+ - "Marcus Lee"
+ - "NorthStar"
+ - "marcus@northstar.io"
+ k: 5
+ pass_threshold: 0.6
+ match: substring
+ weight:
+ precision_at_k: 1.0
+ recall_at_k: 1.0
+ mrr: 1.0
+ ndcg_at_k: 1.0
+ source:
+ fixture: fixtures/retrieval/cascade-after-northstar-invalidated.json
diff --git a/docs/QUALITY_SCORE.md b/docs/QUALITY_SCORE.md
index 6f85cd8..952c3c5 100644
--- a/docs/QUALITY_SCORE.md
+++ b/docs/QUALITY_SCORE.md
@@ -1,6 +1,6 @@
# Quality Score
-Last updated: 2026-05-08
+Last updated: 2026-05-13
## Health summary
diff --git a/docs/generated/workspace-inventory.md b/docs/generated/workspace-inventory.md
index 865161b..2cc904b 100644
--- a/docs/generated/workspace-inventory.md
+++ b/docs/generated/workspace-inventory.md
@@ -1,6 +1,6 @@
# Workspace Inventory
-Generated: 2026-05-08T14:16:42.633Z
+Generated: 2026-05-13T19:37:36.614Z
```text
AGENTS.md
@@ -66,6 +66,7 @@ data/
data/adversarial-scenarios.yaml
data/autogpt-endpoint.yaml
data/baseline-scenarios.yaml
+ data/dream-validation.yaml
data/fixture-manifest.json
data/fixtures/
data/fixtures/ad_spend_3mo.csv
@@ -127,6 +128,16 @@ data/fixtures/
data/fixtures/dm_corpus.json
data/fixtures/document_types.yaml
data/fixtures/domain_to_industry_mapping.json
+data/fixtures/dream/
+ data/fixtures/dream/dedup-false-positive.json
+ data/fixtures/dream/dedup-near-duplicates.json
+ data/fixtures/dream/demotion-cascade-bounded.json
+ data/fixtures/dream/demotion-cascade-runaway.json
+ data/fixtures/dream/demotion-snodgrass-retract.json
+ data/fixtures/dream/demotion-snodgrass-violation.json
+ data/fixtures/dream/demotion-stale-fact.json
+ data/fixtures/dream/procedure-onboarding.json
+ data/fixtures/dream/procedure-weekly-report.json
data/fixtures/editor_brief_template.md
data/fixtures/employee_details.json
data/fixtures/erp_ledger_2026-03.csv
@@ -243,6 +254,12 @@ data/fixtures/
data/fixtures/research_questions.md
data/fixtures/resume.pdf
data/fixtures/retention_by_product.csv
+data/fixtures/retrieval/
+ data/fixtures/retrieval/cascade-after-northstar-invalidated.json
+ data/fixtures/retrieval/forget-budget-after.json
+ data/fixtures/retrieval/scope-project-atlas.json
+ data/fixtures/retrieval/stale-fact-supersession.json
+ data/fixtures/retrieval/warm-context-sarah.json
data/fixtures/reviews_android.json
data/fixtures/reviews_google_yelp.json
data/fixtures/reviews_ios.json
@@ -310,6 +327,7 @@ data/fixtures/snapshots/
data/openclaw-endpoints.yaml
data/opencode-endpoints.yaml
data/personas.yaml
+ data/retrieval-memory.yaml
data/rubric.yaml
data/scenarios.yaml
docker-compose.yml
@@ -395,8 +413,24 @@ src/cli/
src/cli/main.ts
src/domains/
src/domains/evaluation/
+ src/domains/evaluation/clustering.test.ts
+ src/domains/evaluation/clustering.ts
+ src/domains/evaluation/dedup-scorer.test.ts
+ src/domains/evaluation/dedup-scorer.ts
+ src/domains/evaluation/demotion-match.test.ts
+ src/domains/evaluation/demotion-match.ts
+ src/domains/evaluation/demotion-scorer.test.ts
+ src/domains/evaluation/demotion-scorer.ts
src/domains/evaluation/judge.ts
src/domains/evaluation/ports.ts
+ src/domains/evaluation/procedure-match.test.ts
+ src/domains/evaluation/procedure-match.ts
+ src/domains/evaluation/procedure-scorer.test.ts
+ src/domains/evaluation/procedure-scorer.ts
+ src/domains/evaluation/ranking.test.ts
+ src/domains/evaluation/ranking.ts
+ src/domains/evaluation/retrieval-scorer.test.ts
+ src/domains/evaluation/retrieval-scorer.ts
src/domains/evaluation/run-suite.ts
src/domains/evaluation/simulator.ts
src/domains/reporting/
@@ -476,6 +510,7 @@ tests/unit/architecture/
tests/unit/dashboard/
tests/unit/dashboard/compare-view.test.tsx
tests/unit/db.test.ts
+ tests/unit/dream-validation.test.ts
tests/unit/endpoint-config.test.ts
tests/unit/endpoint-overrides-controller.test.ts
tests/unit/judge.test.ts
@@ -493,6 +528,7 @@ tests/unit/persistence/
tests/unit/persistence/repository-contract.test.ts
tests/unit/persistence/url.test.ts
tests/unit/report.test.ts
+ tests/unit/retrieval-memory.test.ts
tests/unit/runner.test.ts
tests/unit/server/
tests/unit/server/comparison.test.ts
diff --git a/docs/product-specs/current-state.md b/docs/product-specs/current-state.md
index 71ee1a4..dcf3f6e 100644
--- a/docs/product-specs/current-state.md
+++ b/docs/product-specs/current-state.md
@@ -35,6 +35,8 @@ Last validated against `platform.md`: 2026-04-17
- [x] Database URL credentials stay redacted in operator-visible output
- [x] Docker Compose readiness waits for server readiness
- [x] Human scoring drains an unscored backlog one chat at a time
+- [x] Ranking-scored scenarios grade retrieval relevance against a curated golden set
+- [x] Dream-system scenarios validate demotion, procedure, and dedup behavior
## Notes
diff --git a/docs/product-specs/e2e-checklist.md b/docs/product-specs/e2e-checklist.md
index b09881d..5540498 100644
--- a/docs/product-specs/e2e-checklist.md
+++ b/docs/product-specs/e2e-checklist.md
@@ -32,3 +32,5 @@ Derived from `platform.md`. Every scenario should have a coverage owner.
| Database URL credentials stay redacted in operator-visible output | `tests/unit/persistence/url.test.ts` + `tests/unit/server/config.test.ts` | ✅ covered |
| Docker Compose readiness waits for server readiness | `docker-compose.yml` + `docs/playbooks/agent-probe-server.md` + `docker compose config` | ✅ covered |
| Human scoring drains an unscored backlog one chat at a time | `tests/integration/server/human-scoring.test.ts` + `tests/unit/persistence/human-scoring.test.ts` | ✅ covered |
+| Ranking-scored scenarios grade retrieval relevance against a curated golden set | `src/domains/evaluation/ranking.test.ts` + `src/domains/evaluation/retrieval-scorer.test.ts` + `tests/unit/retrieval-memory.test.ts` + `tests/unit/runner.test.ts` | ✅ covered |
+| Dream-system scenarios validate demotion, procedure, and dedup behavior | `src/domains/evaluation/clustering.test.ts` + `src/domains/evaluation/demotion-match.test.ts` + `src/domains/evaluation/procedure-match.test.ts` + `src/domains/evaluation/{demotion,procedure,dedup}-scorer.test.ts` + `tests/unit/dream-validation.test.ts` | ✅ covered |
diff --git a/docs/product-specs/platform.md b/docs/product-specs/platform.md
index 85711c1..aec8c4d 100644
--- a/docs/product-specs/platform.md
+++ b/docs/product-specs/platform.md
@@ -298,6 +298,41 @@ The queue ignores scenario_runs whose status is not `completed`, and rerunning
the click on a scenario already scored for the dimension is an upsert (no new
row).
+### Ranking-scored scenarios grade retrieval relevance against a curated golden set
+
+**Given** a scenario YAML that declares a `retrieval:` block with `golden`
+(required), optional `forbidden`, `k`, `match`, `pass_threshold`, per-metric
+`weight`, and a `source` (either `raw_exchange_key` or `fixture`)
+**When** AgentProbe runs the scenario and the adapter returns a retrieval
+payload (either inline on the last reply's `rawExchange[raw_exchange_key]` or via a
+JSON fixture relative to the scenario YAML)
+**Then** the runner computes precision@k, recall@k, MRR, and NDCG@k on the
+returned list against the golden set, aggregates them under a weighted
+average with `pass_threshold`, and forces a scenario fail when any
+`forbidden` item appears in the top-k. Per-metric and aggregate scores are
+persisted to `retrieval_scores` keyed by `scenario_run_id` for replay, and
+the rendered run report surfaces them alongside the LLM-judge dimensions.
+
+### Dream-system scenarios validate demotion, procedure, and dedup behavior
+
+**Given** a scenario YAML that declares one of `demotion:`, `procedure:`, or
+`dedup:` (each a sibling of the existing `retrieval:` block) with its own
+golden expectation, weight, threshold, and `source` (fixture or
+`raw_exchange_key`)
+**When** AgentProbe runs the scenario and the adapter returns the
+corresponding payload (observed demotions / extracted procedure / predicted
+clusters) inline on the last reply's `rawExchange` or via a JSON fixture
+**Then** the runner computes the appropriate metric set — set
+precision/recall/F1 + Snodgrass timestamp discipline + single-hop cascade
+bound for demotion; step-coverage F1 + LCS-normalized order similarity +
+parameter Jaccard for procedure; pairwise P/R/F1 + Adjusted Rand Index for
+dedup — aggregates under per-metric weights, and forces a fail on hard
+violations (Snodgrass conflict, runaway cascade, over- or under-merge below
+threshold). Per-metric and aggregate scores are persisted to
+`demotion_scores`, `procedure_scores`, and `dedup_scores` tables keyed by
+`scenario_run_id`, and the rendered run report surfaces them alongside the
+LLM-judge and retrieval dimensions.
+
### Database URL credentials stay redacted in operator-visible output
**Given** an operator configures persistence with a database URL that contains
diff --git a/scripts/seed-eval-scores.ts b/scripts/seed-eval-scores.ts
new file mode 100644
index 0000000..0f72610
--- /dev/null
+++ b/scripts/seed-eval-scores.ts
@@ -0,0 +1,186 @@
+/**
+ * One-off seeder that writes a run + scenario_runs + retrieval/demotion/
+ * procedure/dedup scores into the SQLite DB so the dashboard EvalScoresView
+ * has real data to render. Intended for local-dev demo / smoke testing.
+ *
+ * Usage:
+ * AGENTPROBE_DB_URL="sqlite:///$(pwd)/data/.agentprobe/runs.sqlite3" \
+ * bun run scripts/seed-eval-scores.ts
+ */
+
+import { randomUUID } from "node:crypto";
+
+import { scoreScenarioDedup } from "../src/domains/evaluation/dedup-scorer.ts";
+import { scoreScenarioDemotion } from "../src/domains/evaluation/demotion-scorer.ts";
+import { scoreScenarioProcedure } from "../src/domains/evaluation/procedure-scorer.ts";
+import { scoreRetrieval } from "../src/domains/evaluation/retrieval-scorer.ts";
+import {
+ parseRubricsYaml,
+ parseScenarioYaml,
+} from "../src/domains/validation/load-suite.ts";
+import { SqliteRunRecorder } from "../src/providers/persistence/sqlite-run-history.ts";
+
+const dbUrl =
+  Bun.env.AGENTPROBE_DB_URL ??
+  `sqlite:///${process.cwd()}/data/.agentprobe/runs.sqlite3`;
+
+console.log(`Seeding eval scores into ${dbUrl}`);
+
+const rubrics = parseRubricsYaml(`${process.cwd()}/data/rubric.yaml`).rubrics;
+
+const dreamScenarios = parseScenarioYaml(
+  `${process.cwd()}/data/dream-validation.yaml`,
+);
+const retrievalScenarios = parseScenarioYaml(
+  `${process.cwd()}/data/retrieval-memory.yaml`,
+);
+
+const recorder = new SqliteRunRecorder(dbUrl);
+
+const runId = await recorder.recordRunStarted({
+  endpoint: "data/autogpt-endpoint.yaml",
+  scenarios: "data/dream-validation.yaml + data/retrieval-memory.yaml",
+  personas: "data/personas.yaml",
+  rubric: "data/rubric.yaml",
+  label: "eval-scores demo seed",
+  notes: "seeded by scripts/seed-eval-scores.ts to populate the dashboard",
+  trigger: "manual",
+});
+console.log(`Run id: ${runId}`);
+
+const allScenarios = [
+  ...dreamScenarios.scenarios,
+  ...retrievalScenarios.scenarios,
+];
+const personaSnapshot = { id: "smb-founder", name: "SMB Founder" };
+let ordinal = 1;
+// Run-level verdict; cleared as soon as any seeded scenario fails its scorers.
+let runPassed = true;
+
+for (const scenario of allScenarios) {
+  const rubric = rubrics.find((r) => r.id === scenario.rubric);
+  if (!rubric) {
+    console.warn(`Skipping ${scenario.id}: no rubric resolved`);
+    continue;
+  }
+
+  const scenarioRunId = await recorder.recordScenarioStarted({
+    scenario,
+    persona: {
+      id: personaSnapshot.id,
+      name: personaSnapshot.name,
+      demographics: {
+        role: "founder",
+        techLiteracy: "high",
+        domainExpertise: "intermediate",
+        languageStyle: "terse",
+      },
+      personality: {
+        patience: 3,
+        assertiveness: 4,
+        detailOrientation: 4,
+        cooperativeness: 4,
+        emotionalIntensity: 2,
+      },
+      behavior: {
+        openingStyle: "direct",
+        followUpStyle: "concise",
+        escalationTriggers: [],
+        topicDrift: "low",
+        clarificationCompliance: "high",
+      },
+      systemPrompt: "You are an SMB founder.",
+    },
+    rubric,
+    ordinal,
+    userId: randomUUID(),
+  });
+
+  await recorder.recordJudgeResult(scenarioRunId, {
+    rubric,
+    score: {
+      dimensions: Object.fromEntries(
+        rubric.dimensions.map((dim) => [
+          dim.id,
+          {
+            reasoning: "Synthetic seed data.",
+            evidence: ["seed"],
+            score: dim.scale.points ?? 1,
+          },
+        ]),
+      ),
+      overallNotes: "Synthetic seed",
+      passed: true,
+    },
+    overallScore: 1.0,
+  });
+
+  const evalContext = {
+    scenariosPath:
+      `${process.cwd()}/data/` +
+      (scenario.dedup || scenario.demotion || scenario.procedure
+        ? "dream-validation.yaml"
+        : "retrieval-memory.yaml"),
+  };
+
+  let allPassed = true;
+  const retrieval = scoreRetrieval(scenario, evalContext);
+  if (retrieval) {
+    await recorder.recordRetrievalResult(scenarioRunId, {
+      scenario,
+      score: retrieval,
+    });
+    allPassed = allPassed && retrieval.passed;
+  }
+  const demotion = scoreScenarioDemotion(scenario, evalContext);
+  if (demotion) {
+    await recorder.recordDemotionResult(scenarioRunId, {
+      scenario,
+      score: demotion,
+    });
+    allPassed = allPassed && demotion.passed;
+  }
+  const procedure = scoreScenarioProcedure(scenario, evalContext);
+  if (procedure) {
+    await recorder.recordProcedureResult(scenarioRunId, {
+      scenario,
+      score: procedure,
+    });
+    allPassed = allPassed && procedure.passed;
+  }
+  const dedup = scoreScenarioDedup(scenario, evalContext);
+  if (dedup) {
+    await recorder.recordDedupResult(scenarioRunId, {
+      scenario,
+      score: dedup,
+    });
+    allPassed = allPassed && dedup.passed;
+  }
+
+  await recorder.recordScenarioFinished(scenarioRunId, {
+    result: {
+      scenarioId: scenario.id,
+      scenarioName: scenario.name,
+      personaId: personaSnapshot.id,
+      rubricId: rubric.id,
+      passed: allPassed,
+      overallScore: 1.0,
+      transcript: [],
+      checkpoints: [],
+    },
+  });
+  runPassed = runPassed && allPassed;
+  ordinal += 1;
+  console.log(
+    ` [${ordinal - 1}/${allScenarios.length}] ${scenario.id}: passed=${allPassed}`,
+  );
+}
+
+await recorder.recordRunFinished({
+  runId,
+  passed: runPassed,
+  exitCode: runPassed ? 0 : 1,
+  results: [],
+});
+
+console.log(`Seeded ${ordinal - 1} scenarios in run ${runId}`);
diff --git a/src/domains/evaluation/clustering.test.ts b/src/domains/evaluation/clustering.test.ts
new file mode 100644
index 0000000..a2353ff
--- /dev/null
+++ b/src/domains/evaluation/clustering.test.ts
@@ -0,0 +1,158 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+ adjustedRandIndex,
+ pairwiseAgreement,
+ pairwiseScores,
+ scoreClustering,
+} from "./clustering.ts";
+
+describe("pairwiseAgreement", () => {
+ test("perfect agreement counts every same-cluster pair as TP", () => {
+ const result = pairwiseAgreement([["a", "b", "c"]], [["a", "b", "c"]]);
+ expect(result.truePositives).toBe(3);
+ expect(result.falsePositives).toBe(0);
+ expect(result.falseNegatives).toBe(0);
+ expect(result.trueNegatives).toBe(0);
+ });
+
+ test("over-merging counts as false positives", () => {
+ // Predicted merges {a, b}; golden keeps them separate.
+ const result = pairwiseAgreement([["a", "b"]], [["a"], ["b"]]);
+ expect(result.truePositives).toBe(0);
+ expect(result.falsePositives).toBe(1);
+ expect(result.falseNegatives).toBe(0);
+ expect(result.trueNegatives).toBe(0);
+ });
+
+ test("under-merging counts as false negatives", () => {
+ // Predicted keeps them separate; golden merges them.
+ const result = pairwiseAgreement([["a"], ["b"]], [["a", "b"]]);
+ expect(result.truePositives).toBe(0);
+ expect(result.falsePositives).toBe(0);
+ expect(result.falseNegatives).toBe(1);
+ expect(result.trueNegatives).toBe(0);
+ });
+});
+
+describe("pairwiseScores", () => {
+ test("perfect partition yields F1 of 1.0", () => {
+ const result = pairwiseScores(
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ );
+ expect(result.precision).toBeCloseTo(1.0, 6);
+ expect(result.recall).toBeCloseTo(1.0, 6);
+ expect(result.f1).toBeCloseTo(1.0, 6);
+ });
+
+ test("over-merging drops precision more than recall", () => {
+ // Golden: {a, b}, {c, d}. Predicted: {a, b, c, d} (over-merge).
+ // Pairs: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d) = 6 same-cluster predicted
+ // Of those, (a,b) and (c,d) are also same in golden -> TP=2, FP=4, FN=0
+ const result = pairwiseScores(
+ [["a", "b", "c", "d"]],
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ );
+ expect(result.precision).toBeCloseTo(2 / 6, 6);
+ expect(result.recall).toBeCloseTo(1.0, 6);
+ });
+
+ test("under-merging drops recall more than precision", () => {
+ // Golden: {a, b, c, d}. Predicted: {a, b}, {c, d}.
+ const result = pairwiseScores(
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ [["a", "b", "c", "d"]],
+ );
+ expect(result.precision).toBeCloseTo(1.0, 6);
+ expect(result.recall).toBeCloseTo(2 / 6, 6);
+ });
+});
+
+describe("adjustedRandIndex", () => {
+ test("perfect agreement yields ARI = 1", () => {
+ expect(
+ adjustedRandIndex(
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ ),
+ ).toBeCloseTo(1.0, 6);
+ });
+
+ test("complete disagreement on two pairs", () => {
+ // Golden: {a, b}, {c, d}. Predicted: {a, c}, {b, d}.
+ // Hubert-Arabie: index = 0 (no shared same-cluster pairs),
+ // sumPredChoose = C(2,2)+C(2,2) = 2, sumGoldChoose = 2,
+ // expected = 2*2/C(4,2) = 2/3, maxIndex = 2,
+ // ARI = (0 - 2/3) / (2 - 2/3) = -0.5.
+ expect(
+ adjustedRandIndex(
+ [
+ ["a", "c"],
+ ["b", "d"],
+ ],
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ ),
+ ).toBeCloseTo(-0.5, 6);
+ });
+
+ test("ARI is symmetric in its inputs", () => {
+ const left = [
+ ["a", "b", "c"],
+ ["d", "e"],
+ ];
+ const right = [
+ ["a", "b"],
+ ["c", "d", "e"],
+ ];
+ expect(adjustedRandIndex(left, right)).toBeCloseTo(
+ adjustedRandIndex(right, left),
+ 6,
+ );
+ });
+
+ test("single-item input yields ARI = 1", () => {
+ expect(adjustedRandIndex([["a"]], [["a"]])).toBe(1);
+ });
+});
+
+describe("scoreClustering", () => {
+ test("aggregates precision/recall/F1/ARI in one call", () => {
+ const result = scoreClustering(
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ );
+ expect(result.precision).toBeCloseTo(1.0, 6);
+ expect(result.recall).toBeCloseTo(1.0, 6);
+ expect(result.f1).toBeCloseTo(1.0, 6);
+ expect(result.ari).toBeCloseTo(1.0, 6);
+ expect(result.itemCount).toBe(4);
+ });
+});
diff --git a/src/domains/evaluation/clustering.ts b/src/domains/evaluation/clustering.ts
new file mode 100644
index 0000000..8859f93
--- /dev/null
+++ b/src/domains/evaluation/clustering.ts
@@ -0,0 +1,239 @@
+/**
+ * Pure clustering / partition metrics for the dedup scorer.
+ *
+ * Given a `predicted` partition (list of clusters of item IDs) and a `golden`
+ * partition (the ground-truth clusters), score how well they agree. Used to
+ * grade memory-dedup passes: did the dedup pass cluster near-duplicates
+ * correctly?
+ *
+ * Metrics:
+ * - pairwise precision/recall/F1 over the same-cluster relation
+ * - Adjusted Rand Index (Hubert & Arabie 1985) — chance-corrected agreement
+ *
+ * All functions operate on item IDs (strings). Items present in the predicted
+ * partition but absent from the golden one (or vice versa) are treated as
+ * singletons in the missing side, so the math degrades gracefully under
+ * partial coverage. No I/O.
+ */
+
+export type Cluster = readonly string[];
+export type Partition = readonly Cluster[];
+
+/**
+ * Gather the distinct item IDs named by either partition, in sorted order.
+ * IDs repeated within or across clusters are emitted only once.
+ */
+function collectItems(left: Partition, right: Partition): string[] {
+  const seen = new Set<string>();
+  for (const cluster of [...left, ...right]) {
+    for (const item of cluster) {
+      seen.add(item);
+    }
+  }
+  return [...seen].sort();
+}
+
+/**
+ * Map each item to a numeric cluster id under the partition. An item listed
+ * in several clusters keeps the id of the first cluster that names it. Items
+ * in `items` that `partition` never mentions become singleton clusters, each
+ * with its own fresh id, so the math degrades gracefully under partial coverage.
+ */
+function assignClusterIds(
+  partition: Partition,
+  items: readonly string[],
+): Map<string, number> {
+  const assignment = new Map<string, number>();
+  partition.forEach((cluster, index) => {
+    for (const item of cluster) {
+      if (!assignment.has(item)) {
+        assignment.set(item, index);
+      }
+    }
+  });
+  let nextSingletonId = partition.length;
+  for (const item of items) {
+    if (!assignment.has(item)) {
+      assignment.set(item, nextSingletonId);
+      nextSingletonId += 1;
+    }
+  }
+  return assignment;
+}
+
+export type PairwiseAgreement = {
+ truePositives: number;
+ falsePositives: number;
+ falseNegatives: number;
+ trueNegatives: number;
+};
+
+/**
+ * Build the 2x2 contingency over unordered item pairs:
+ * TP = same cluster in both
+ * FP = same in predicted, different in golden
+ * FN = different in predicted, same in golden
+ * TN = different in both
+ */
+export function pairwiseAgreement(
+ predicted: Partition,
+ golden: Partition,
+): PairwiseAgreement {
+ const items = collectItems(predicted, golden);
+ const pred = assignClusterIds(predicted, items);
+ const gold = assignClusterIds(golden, items);
+
+ let tp = 0;
+ let fp = 0;
+ let fn = 0;
+ let tn = 0;
+ for (let i = 0; i < items.length; i += 1) {
+ for (let j = i + 1; j < items.length; j += 1) {
+ const left = items[i] ?? "";
+ const right = items[j] ?? "";
+ const sameInPred = pred.get(left) === pred.get(right);
+ const sameInGold = gold.get(left) === gold.get(right);
+ if (sameInPred && sameInGold) {
+ tp += 1;
+ } else if (sameInPred && !sameInGold) {
+ fp += 1;
+ } else if (!sameInPred && sameInGold) {
+ fn += 1;
+ } else {
+ tn += 1;
+ }
+ }
+ }
+ return {
+ truePositives: tp,
+ falsePositives: fp,
+ falseNegatives: fn,
+ trueNegatives: tn,
+ };
+}
+
+export type PairwiseScores = {
+ precision: number;
+ recall: number;
+ f1: number;
+};
+
+/**
+ * Pairwise precision/recall/F1 over the same-cluster relation. Returns 1 for
+ * a metric when its denominator is 0 (the partition has no positive
+ * judgments to score). This matches the convention used by `pytrec_eval` and
+ * `scikit-learn.metrics.cluster.pair_confusion_matrix`.
+ */
+export function pairwiseScores(
+ predicted: Partition,
+ golden: Partition,
+): PairwiseScores {
+ const { truePositives, falsePositives, falseNegatives } = pairwiseAgreement(
+ predicted,
+ golden,
+ );
+ const precision =
+ truePositives + falsePositives === 0
+ ? 1
+ : truePositives / (truePositives + falsePositives);
+ const recall =
+ truePositives + falseNegatives === 0
+ ? 1
+ : truePositives / (truePositives + falseNegatives);
+ const f1 =
+ precision + recall === 0
+ ? 0
+ : (2 * precision * recall) / (precision + recall);
+ return { precision, recall, f1 };
+}
+
+function choose2(n: number): number {
+ return n < 2 ? 0 : (n * (n - 1)) / 2;
+}
+
+/**
+ * Adjusted Rand Index (Hubert & Arabie 1985). 1 is perfect agreement, 0 is
+ * chance-level agreement, negative is worse than chance. Returns 1 when the
+ * score is undefined: fewer than two items, or max index equals expected index
+ * (e.g. both partitions are one cluster, or both are all singletons).
+ */
+export function adjustedRandIndex(
+  predicted: Partition,
+  golden: Partition,
+): number {
+  const items = collectItems(predicted, golden);
+  if (items.length < 2) {
+    return 1;
+  }
+  const pred = assignClusterIds(predicted, items);
+  const gold = assignClusterIds(golden, items);
+
+  const predIds = [...new Set(pred.values())];
+  const goldIds = [...new Set(gold.values())];
+
+  // Sparse contingency table: counts.get(p).get(g) = items in pred cluster p and gold cluster g.
+  const counts = new Map<number, Map<number, number>>();
+  for (const item of items) {
+    const p = pred.get(item) ?? -1;
+    const g = gold.get(item) ?? -1;
+    let row = counts.get(p);
+    if (!row) {
+      row = new Map<number, number>();
+      counts.set(p, row);
+    }
+    row.set(g, (row.get(g) ?? 0) + 1);
+  }
+
+  const predSizes = predIds.map((id) =>
+    items.reduce((sum, item) => (pred.get(item) === id ? sum + 1 : sum), 0),
+  );
+  const goldSizes = goldIds.map((id) =>
+    items.reduce((sum, item) => (gold.get(item) === id ? sum + 1 : sum), 0),
+  );
+
+  let index = 0;
+  for (const row of counts.values()) {
+    for (const value of row.values()) {
+      index += choose2(value);
+    }
+  }
+
+  const sumPredChoose = predSizes.reduce((sum, size) => sum + choose2(size), 0);
+  const sumGoldChoose = goldSizes.reduce((sum, size) => sum + choose2(size), 0);
+  const total = choose2(items.length);
+  if (total === 0) {
+    return 1;
+  }
+  const expected = (sumPredChoose * sumGoldChoose) / total;
+  const maxIndex = (sumPredChoose + sumGoldChoose) / 2;
+  if (maxIndex === expected) {
+    return 1;
+  }
+  return (index - expected) / (maxIndex - expected);
+}
+
+export type ClusterScore = {
+ precision: number;
+ recall: number;
+ f1: number;
+ ari: number;
+ pairCounts: PairwiseAgreement;
+ itemCount: number;
+};
+
+export function scoreClustering(
+ predicted: Partition,
+ golden: Partition,
+): ClusterScore {
+ const items = collectItems(predicted, golden);
+ const { precision, recall, f1 } = pairwiseScores(predicted, golden);
+ const ari = adjustedRandIndex(predicted, golden);
+ return {
+ precision,
+ recall,
+ f1,
+ ari,
+ pairCounts: pairwiseAgreement(predicted, golden),
+ itemCount: items.length,
+ };
+}
diff --git a/src/domains/evaluation/dedup-scorer.test.ts b/src/domains/evaluation/dedup-scorer.test.ts
new file mode 100644
index 0000000..33f7427
--- /dev/null
+++ b/src/domains/evaluation/dedup-scorer.test.ts
@@ -0,0 +1,129 @@
+import { describe, expect, test } from "bun:test";
+import { mkdtempSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import type {
+ AdapterReply,
+ DedupConfig,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import {
+ coerceDedupPayload,
+ resolveDedupPayload,
+ scoreScenarioDedup,
+} from "./dedup-scorer.ts";
+
+function makeTempDir(prefix: string): string {
+ return mkdtempSync(join(tmpdir(), `agentprobe-${prefix}-`));
+}
+
+function buildConfig(overrides: Partial<DedupConfig> = {}): DedupConfig {
+  return {
+    goldenClusters: overrides.goldenClusters ?? [
+      ["a", "b"],
+      ["c", "d"],
+    ],
+    weights: overrides.weights ?? { precision: 1, recall: 1, f1: 1, ari: 1 },
+    passThreshold: overrides.passThreshold ?? 0.6,
+    source: overrides.source,
+  };
+}
+
+function buildScenario(config: DedupConfig | undefined): Scenario {
+ return {
+ id: "dedup-test",
+ name: "Dedup test",
+ tags: [],
+ turns: [],
+ sessions: [],
+ expectations: {
+ mustInclude: [],
+ mustNotInclude: [],
+ expectedTools: [],
+ failureModes: [],
+ },
+ dedup: config,
+ };
+}
+
+function buildReply(payload: unknown): AdapterReply {
+ return {
+ assistantText: "...",
+ toolCalls: [],
+ rawExchange: { dedup: payload } as unknown as AdapterReply["rawExchange"],
+ latencyMs: 0,
+ usage: {},
+ };
+}
+
+describe("coerceDedupPayload", () => {
+ test("accepts a bare list of clusters", () => {
+ expect(coerceDedupPayload([["a", "b"], ["c"]]).clusters).toEqual([
+ ["a", "b"],
+ ["c"],
+ ]);
+ });
+
+ test("accepts {clusters: [[...]]}", () => {
+ expect(coerceDedupPayload({ clusters: [["a"], ["b"]] }).clusters).toEqual([
+ ["a"],
+ ["b"],
+ ]);
+ });
+
+ test("ignores non-string members and empty clusters", () => {
+ expect(
+ coerceDedupPayload([["a", 1, null, "b"], [], ["c"]]).clusters,
+ ).toEqual([["a", "b"], ["c"]]);
+ });
+});
+
+describe("resolveDedupPayload", () => {
+ test("loads fixture", () => {
+ const dir = makeTempDir("dedup-fixture");
+ writeFileSync(
+ join(dir, "dd.json"),
+ JSON.stringify({ clusters: [["a", "b"], ["c"]] }),
+ "utf8",
+ );
+ const config = buildConfig({ source: { fixture: "dd.json" } });
+ const result = resolveDedupPayload(config, {
+ scenariosPath: join(dir, "scenarios.yaml"),
+ });
+ expect(result.source).toBe("fixture");
+ expect(result.payload.clusters).toEqual([["a", "b"], ["c"]]);
+ });
+});
+
+describe("scoreScenarioDedup", () => {
+ test("returns undefined when no dedup block on scenario", () => {
+ expect(scoreScenarioDedup(buildScenario(undefined), {})).toBeUndefined();
+ });
+
+ test("perfect match passes with all metrics 1.0 (ARI mapped to 1)", () => {
+ const scenario = buildScenario(buildConfig());
+ const reply = buildReply({
+ clusters: [
+ ["a", "b"],
+ ["c", "d"],
+ ],
+ });
+ const result = scoreScenarioDedup(scenario, { lastAdapterReply: reply });
+ expect(result?.passed).toBe(true);
+ expect(result?.weightedScore).toBeCloseTo(1.0, 6);
+ });
+
+ test("complete disagreement drops the weighted score below threshold", () => {
+ const scenario = buildScenario(buildConfig());
+ // Golden: {a, b}, {c, d}. Predicted: {a, c}, {b, d} — pairwise F1=0, ARI=-0.5
+ const reply = buildReply({
+ clusters: [
+ ["a", "c"],
+ ["b", "d"],
+ ],
+ });
+ const result = scoreScenarioDedup(scenario, { lastAdapterReply: reply });
+ expect(result?.passed).toBe(false);
+ });
+});
diff --git a/src/domains/evaluation/dedup-scorer.ts b/src/domains/evaluation/dedup-scorer.ts
new file mode 100644
index 0000000..7720a22
--- /dev/null
+++ b/src/domains/evaluation/dedup-scorer.ts
@@ -0,0 +1,174 @@
+import { existsSync, readFileSync, statSync } from "node:fs";
+import { dirname, isAbsolute, resolve } from "node:path";
+
+import type {
+ AdapterReply,
+ DedupConfig,
+ DedupMetricScore,
+ DedupScore,
+ EvalSource,
+ JsonValue,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts";
+import { logWarn } from "../../shared/utils/logging.ts";
+import { scoreClustering } from "./clustering.ts";
+
+const DEFAULT_RAW_EXCHANGE_KEY = "dedup"; // rawExchange key read when config.source.rawExchangeKey is not set
+
+export type DedupPayload = { // Normalized predicted clustering produced by coerceDedupPayload.
+ clusters?: string[][]; // one inner array per cluster; absent when the input carried no cluster array
+};
+
+function resolveFixturePath(
+  scenariosPath: string | undefined,
+  fixture: string,
+): string {
+  // Absolute fixtures are used as-is; with no scenarios path, resolve() decides.
+  if (isAbsolute(fixture)) return fixture;
+  if (!scenariosPath) return resolve(fixture);
+  // A scenarios path that is an existing directory anchors the fixture
+  // itself; anything else (a file, or a missing path) anchors at its parent.
+  let anchorIsDirectory = false;
+  try {
+    anchorIsDirectory =
+      existsSync(scenariosPath) && statSync(scenariosPath).isDirectory();
+  } catch {
+    // If the filesystem probe throws, treat the path as a file and use
+    // its parent directory.
+    anchorIsDirectory = false;
+  }
+  const anchor = anchorIsDirectory ? scenariosPath : dirname(scenariosPath);
+  return resolve(anchor, fixture);
+}
+
+export function coerceDedupPayload(payload: unknown): DedupPayload { // Normalize untyped JSON into a DedupPayload; unusable input yields {}.
+  if (!payload) {
+    return {}; // null, undefined and other falsy inputs carry no clusters
+  }
+  // Accept `{clusters: [[...], [...]]}` or a bare `[[...], [...]]`.
+  if (Array.isArray(payload)) {
+    return { clusters: coerceClusters(payload) };
+  }
+  if (typeof payload === "object") {
+    const record = payload as Record<string, unknown>; // Record needs both key and value type arguments
+    if (Array.isArray(record.clusters)) {
+      return { clusters: coerceClusters(record.clusters) };
+    }
+  }
+  return {}; // truthy scalars, and objects without an array under `clusters`
+}
+
+function coerceClusters(values: unknown[]): string[][] {
+  // Keep array entries only, strip non-string members, drop emptied clusters.
+  const kept: string[][] = [];
+  values.forEach((candidate) => {
+    if (Array.isArray(candidate)) {
+      const members = candidate.filter(
+        (member): member is string => typeof member === "string",
+      );
+      if (members.length !== 0) {
+        kept.push(members);
+      }
+    }
+  });
+  return kept;
+}
+
+export type DedupSourceContext = { // Where resolveDedupPayload may look for predicted clusters.
+ scenariosPath?: string; // anchor for relative fixture paths (a file or a directory)
+ lastAdapterReply?: AdapterReply; // its rawExchange is consulted when no fixture is configured
+};
+
+export type ResolvedDedup = { // Result of resolveDedupPayload.
+ payload: DedupPayload;
+ source: EvalSource; // this module sets "fixture", "raw_exchange" or "missing"
+};
+
+export function resolveDedupPayload( // Find the predicted clusters: a configured fixture first, then the last reply's rawExchange.
+  config: DedupConfig,
+  context: DedupSourceContext,
+): ResolvedDedup {
+  const fixture = config.source?.fixture;
+  if (fixture) {
+    const resolved = resolveFixturePath(context.scenariosPath, fixture);
+    if (!existsSync(resolved)) {
+      throw new AgentProbeRuntimeError(`Dedup fixture not found: ${resolved}`); // a configured fixture never falls back to rawExchange
+    }
+    return {
+      payload: coerceDedupPayload(JSON.parse(readFileSync(resolved, "utf8"))), // JSON.parse throws on a malformed fixture
+      source: "fixture",
+    };
+  }
+  const key = config.source?.rawExchangeKey ?? DEFAULT_RAW_EXCHANGE_KEY;
+  const rawExchange = context.lastAdapterReply?.rawExchange;
+  if (rawExchange && typeof rawExchange === "object") {
+    const candidate = (rawExchange as Record<string, JsonValue>)[key]; // index the exchange as a JSON object
+    if (candidate !== undefined) {
+      return {
+        payload: coerceDedupPayload(candidate),
+        source: "raw_exchange",
+      };
+    }
+  }
+  return { payload: {}, source: "missing" }; // no fixture configured and nothing under the key
+}
+
+export function scoreScenarioDedup( // Score predicted clusters against scenario.dedup.goldenClusters.
+ scenario: Scenario,
+ context: DedupSourceContext,
+): DedupScore | undefined {
+ const config = scenario.dedup;
+ if (!config) {
+ return undefined; // the scenario has no dedup block, so there is nothing to score
+ }
+
+ let resolution: ResolvedDedup;
+ try {
+ resolution = resolveDedupPayload(config, context);
+ } catch (error) {
+ logWarn( // resolution failures are logged and scored as a missing payload instead of being rethrown
+ `Dedup scorer failed to resolve payload for ${scenario.id}: ${
+ error instanceof Error ? error.message : String(error)
+ }`,
+ );
+ resolution = { payload: {}, source: "missing" };
+ }
+
+ const predicted = resolution.payload.clusters ?? [];
+ const result = scoreClustering(predicted, config.goldenClusters);
+
+ const metrics: DedupMetricScore[] = [
+ {
+ metric: "precision",
+ value: result.precision,
+ weight: config.weights.precision,
+ },
+ { metric: "recall", value: result.recall, weight: config.weights.recall },
+ { metric: "f1", value: result.f1, weight: config.weights.f1 },
+ // Map ARI from [-1, 1] to [0, 1] so it composes with the others.
+ { metric: "ari", value: (result.ari + 1) / 2, weight: config.weights.ari },
+ ];
+ const totalWeight = metrics.reduce(
+ (sum, m) => sum + Math.max(0, m.weight), // negative weights count as 0
+ 0,
+ );
+ const weightedScore =
+ totalWeight === 0
+ ? 0
+ : metrics.reduce(
+ (sum, m) => (m.weight > 0 ? sum + m.value * m.weight : sum), // only positive-weight metrics contribute
+ 0,
+ ) / totalWeight;
+
+ return {
+ metrics,
+ weightedScore,
+ passThreshold: config.passThreshold,
+ passed: weightedScore >= config.passThreshold,
+ predictedClusters: predicted.map((c) => [...c]), // per-cluster copies, so the score does not alias the inputs
+ goldenClusters: config.goldenClusters.map((c) => [...c]),
+ itemCount: result.itemCount,
+ source: resolution.source,
+ };
+}
diff --git a/src/domains/evaluation/demotion-match.test.ts b/src/domains/evaluation/demotion-match.test.ts
new file mode 100644
index 0000000..dea55d1
--- /dev/null
+++ b/src/domains/evaluation/demotion-match.test.ts
@@ -0,0 +1,154 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+ assertCascadeBounded,
+ assertExpectedSet,
+ assertTimestampDiscipline,
+ scoreDemotion,
+} from "./demotion-match.ts";
+
+describe("assertExpectedSet", () => { // Precision / recall / F1 over observed-vs-expected id lists.
+ test("perfect match yields F1 1.0", () => {
+ const result = assertExpectedSet(["a", "b"], ["a", "b"]);
+ expect(result.f1).toBeCloseTo(1.0, 6);
+ expect(result.falsePositives).toEqual([]);
+ expect(result.falseNegatives).toEqual([]);
+ });
+
+ test("touched the wrong edge yields FP and precision drop", () => {
+ const result = assertExpectedSet(["a", "wrong"], ["a", "b"]); // one hit, one stray, one miss
+ expect(result.falsePositives).toEqual(["wrong"]);
+ expect(result.falseNegatives).toEqual(["b"]);
+ expect(result.precision).toBeCloseTo(0.5, 6);
+ expect(result.recall).toBeCloseTo(0.5, 6);
+ });
+
+ test("missed an expected edge yields FN and recall drop", () => {
+ const result = assertExpectedSet(["a"], ["a", "b"]);
+ expect(result.recall).toBeCloseTo(0.5, 6);
+ expect(result.precision).toBeCloseTo(1.0, 6);
+ });
+
+ test("nothing expected and nothing touched is perfect", () => {
+ const result = assertExpectedSet([], []); // both lists empty: precision and recall default to 1
+ expect(result.f1).toBeCloseTo(1.0, 6);
+ });
+});
+
+describe("assertTimestampDiscipline", () => { // First argument: retract actions; second: soft-delete actions.
+ test("a clean retract (expired_at only) has no violation", () => {
+ const violations = assertTimestampDiscipline(
+ [
+ {
+ uuid: "edge1",
+ expiredAtSet: true,
+ invalidAtSet: false,
+ },
+ ],
+ [],
+ );
+ expect(violations).toEqual([]);
+ });
+
+ test("a retract that also set invalid_at is flagged", () => {
+ const violations = assertTimestampDiscipline(
+ [
+ {
+ uuid: "edge1",
+ expiredAtSet: true,
+ invalidAtSet: true,
+ },
+ ],
+ [],
+ );
+ expect(violations).toHaveLength(1);
+ expect(violations[0]?.expectation).toBe("retract_only_expired");
+ });
+
+ test("a soft_delete that set only one timestamp is flagged", () => {
+ const violations = assertTimestampDiscipline(
+ [],
+ [
+ {
+ uuid: "edge2",
+ expiredAtSet: true,
+ invalidAtSet: false, // a soft delete must set this too
+ },
+ ],
+ );
+ expect(violations).toHaveLength(1);
+ expect(violations[0]?.expectation).toBe("soft_delete_both");
+ });
+});
+
+describe("assertCascadeBounded", () => { // Argument order: touched, expectedDirectNeighbors, tangentialEdges.
+ test("touching only direct neighbors is bounded", () => {
+ const result = assertCascadeBounded(
+ ["e_ab", "e_bc"],
+ ["e_ab", "e_bc"],
+ ["e_cd"],
+ );
+ expect(result.bounded).toBe(true);
+ expect(result.touchedTangentialNeighbors).toEqual([]);
+ expect(result.directNeighborF1).toBeCloseTo(1.0, 6);
+ });
+
+ test("touching a 2-hop edge is a runaway-demotion failure", () => {
+ // Graph A -> B -> C -> D. Invalidate B. Direct: (A,B), (B,C). Tangential: (C,D).
+ const result = assertCascadeBounded(
+ ["e_ab", "e_bc", "e_cd"],
+ ["e_ab", "e_bc"],
+ ["e_cd"],
+ );
+ expect(result.bounded).toBe(false);
+ expect(result.touchedTangentialNeighbors).toEqual(["e_cd"]);
+ });
+
+ test("missing a direct neighbor lowers directNeighborF1 but stays bounded", () => {
+ const result = assertCascadeBounded(["e_ab"], ["e_ab", "e_bc"], ["e_cd"]); // e_bc is expected but untouched
+ expect(result.bounded).toBe(true);
+ expect(result.missedDirectNeighbors).toEqual(["e_bc"]);
+ expect(result.directNeighborF1).toBeLessThan(1.0);
+ });
+});
+
+describe("scoreDemotion", () => { // Aggregate score plus the two hard-fail conditions.
+ test("perfect demotion of the expected set passes", () => {
+ const result = scoreDemotion({
+ observedDemotions: ["e1", "e2"],
+ expectedDemotions: ["e1", "e2"],
+ retractActions: [
+ { uuid: "e1", expiredAtSet: true, invalidAtSet: false },
+ { uuid: "e2", expiredAtSet: true, invalidAtSet: false },
+ ],
+ });
+ expect(result.weightedScore).toBeCloseTo(1.0, 6);
+ expect(result.passed).toBe(true);
+ });
+
+ test("a timestamp violation is a hard fail regardless of set match", () => {
+ const result = scoreDemotion({
+ observedDemotions: ["e1"],
+ expectedDemotions: ["e1"],
+ retractActions: [
+ { uuid: "e1", expiredAtSet: true, invalidAtSet: true }, // wrong: a retract must leave invalid_at unset
+ ],
+ });
+ expect(result.timestampViolations).toHaveLength(1);
+ expect(result.passed).toBe(false);
+ });
+
+ test("a runaway cascade is a hard fail", () => {
+ const result = scoreDemotion({
+ observedDemotions: ["e_ab", "e_bc", "e_cd"],
+ expectedDemotions: ["e_ab", "e_bc"],
+ cascade: {
+ touched: ["e_ab", "e_bc", "e_cd"],
+ expectedDirectNeighbors: ["e_ab", "e_bc"],
+ tangentialEdges: ["e_cd"], // touched above, so the cascade is unbounded
+ },
+ });
+ expect(result.cascade?.bounded).toBe(false);
+ expect(result.passed).toBe(false);
+ });
+});
diff --git a/src/domains/evaluation/demotion-match.ts b/src/domains/evaluation/demotion-match.ts
new file mode 100644
index 0000000..26b4ab3
--- /dev/null
+++ b/src/domains/evaluation/demotion-match.ts
@@ -0,0 +1,323 @@
+/**
+ * Demotion-correctness primitives for the demotion-precision scorer.
+ *
+ * These score the *structural* half of demotion correctness — which edges
+ * the dream pass actually touched vs which it was supposed to touch. They
+ * cover:
+ *
+ * - **P-1.3 retract-vs-soft-delete:** did `_retract_edges` set only
+ * `expired_at`, and did `_soft_delete_edges` set both? Scored by
+ * `assertTimestampDiscipline`.
+ * - **P0.3a stale-fact deprecation:** did the pass demote items that were
+ * genuinely stale and leave fresh items alone? Scored by
+ * `assertExpectedSet`.
+ * - **P0.3b scoped cascading expiry:** did the cascade touch the entity's
+ * direct neighbors and nothing 2+ hops away? Scored by
+ * `assertCascadeBounded`.
+ *
+ * The LLM-judged half (was the demotion semantically warranted?) goes
+ * through the existing `judgeResponse` pipeline; this module is only the
+ * deterministic structural check.
+ *
+ * All functions are pure.
+ */
+
+export type DemotionAction = { // One recorded demotion operation and the timestamp fields it set.
+ /** UUID of the edge or memory that was demoted. */
+ uuid: string;
+ /** Optional human label for reports. */
+ label?: string;
+ /** `expired_at` / `invalid_at` flags set by the operation. */
+ expiredAtSet: boolean;
+ invalidAtSet: boolean;
+ /** New status property, if any. */
+ status?: string;
+};
+
+export type SetCheckResult = { // Outcome of assertExpectedSet; ids are trimmed and de-duplicated first.
+ /** Items the dream pass correctly touched. */
+ truePositives: string[];
+ /** Items it touched but shouldn't have. */
+ falsePositives: string[];
+ /** Items it missed. */
+ falseNegatives: string[];
+ precision: number; // with nothing observed: 1 if nothing was expected either, otherwise 0
+ recall: number; // 1 when nothing was expected
+ f1: number; // 0 when precision and recall are both 0
+};
+
+function normalize(value: string): string { // Canonical id form: surrounding whitespace removed, case untouched.
+ return value.trim();
+}
+
+function dedup(values: readonly string[]): string[] { // Normalized ids with repeats removed, in first-seen order.
+  return Array.from(new Set(values.map((raw) => normalize(raw))));
+}
+
+/**
+ * Set-level precision/recall over which UUIDs were touched vs the
+ * `expected` set. The denominators degrade gracefully:
+ * - empty expected + empty observed = perfect score
+ * - empty expected + nonempty observed = precision 0, recall 1
+ * - nonempty expected + empty observed = precision 0, recall 0
+ */
+export function assertExpectedSet(
+ observed: readonly string[],
+ expected: readonly string[],
+): SetCheckResult {
+ const observed_ = dedup(observed); // trimmed, unique, first-seen order
+ const expected_ = dedup(expected);
+ const expectedSet = new Set(expected_);
+ const observedSet = new Set(observed_);
+ const tp = observed_.filter((id) => expectedSet.has(id));
+ const fp = observed_.filter((id) => !expectedSet.has(id));
+ const fn = expected_.filter((id) => !observedSet.has(id));
+ const precision =
+ observed_.length === 0
+ ? expected_.length === 0
+ ? 1
+ : 0
+ : tp.length / observed_.length;
+ const recall = expected_.length === 0 ? 1 : tp.length / expected_.length;
+ const f1 =
+ precision + recall === 0
+ ? 0
+ : (2 * precision * recall) / (precision + recall); // harmonic mean; the guard above avoids 0/0
+ return {
+ truePositives: tp,
+ falsePositives: fp,
+ falseNegatives: fn,
+ precision,
+ recall,
+ f1,
+ };
+}
+
+export type TimestampViolation = { // One action that broke the timestamp rule for its kind.
+ uuid: string;
+ expectation: "retract_only_expired" | "soft_delete_both"; // the rule that was broken
+ observed: { expiredAtSet: boolean; invalidAtSet: boolean }; // the flags the action actually carried
+ message: string;
+};
+
+/**
+ * Check each demotion against the bi-temporal timestamp rule for its kind.
+ *
+ * A retract must set `expired_at` and leave `invalid_at` unset; a soft
+ * delete must set both. One violation is reported per offending action,
+ * retracts first and soft deletes after, each group in input order. An
+ * empty result means every action followed its rule.
+ */
+export function assertTimestampDiscipline(
+  retractActions: readonly DemotionAction[],
+  softDeleteActions: readonly DemotionAction[],
+): TimestampViolation[] {
+  // Retracts: expired_at set and invalid_at clear is the only valid shape.
+  const retractViolations = retractActions
+    .filter(
+      ({ expiredAtSet, invalidAtSet }) => !(expiredAtSet && !invalidAtSet),
+    )
+    .map(
+      ({ uuid, expiredAtSet, invalidAtSet }): TimestampViolation => ({
+        uuid,
+        expectation: "retract_only_expired",
+        observed: { expiredAtSet, invalidAtSet },
+        message: `retract must set expired_at only; got expired_at=${expiredAtSet}, invalid_at=${invalidAtSet}`,
+      }),
+    );
+  // Soft deletes: both flags must be set.
+  const softDeleteViolations = softDeleteActions
+    .filter(
+      ({ expiredAtSet, invalidAtSet }) => !(expiredAtSet && invalidAtSet),
+    )
+    .map(
+      ({ uuid, expiredAtSet, invalidAtSet }): TimestampViolation => ({
+        uuid,
+        expectation: "soft_delete_both",
+        observed: { expiredAtSet, invalidAtSet },
+        message: `soft_delete must set both expired_at and invalid_at; got expired_at=${expiredAtSet}, invalid_at=${invalidAtSet}`,
+      }),
+    );
+
+  return [...retractViolations, ...softDeleteViolations];
+}
+
+export type CascadeCheckResult = { // Outcome of assertCascadeBounded; all ids are trimmed and de-duplicated.
+ /** Edges the cascade touched that should have been touched (1-hop). */
+ touchedDirectNeighbors: string[];
+ /** Edges 1-hop away that the cascade was supposed to touch but didn't. */
+ missedDirectNeighbors: string[];
+ /** Edges 2+ hops away that the cascade touched (RUNAWAY DEMOTION — failure). */
+ touchedTangentialNeighbors: string[];
+ /** True when no tangential edges were touched. The single-hop discipline rule. */
+ bounded: boolean;
+ /** Set-level F1 over the expected-direct set. */
+ directNeighborF1: number;
+};
+
+/**
+ * Check that a cascade stayed within one hop of the invalidated entity.
+ *
+ * `touched` lists the edges the cascade actually demoted,
+ * `expectedDirectNeighbors` the edges it should have demoted, and
+ * `tangentialEdges` the edges it must leave alone. All three lists are
+ * trimmed and de-duplicated before comparison.
+ */
+export function assertCascadeBounded(
+  touched: readonly string[],
+  expectedDirectNeighbors: readonly string[],
+  tangentialEdges: readonly string[],
+): CascadeCheckResult {
+  const touchedIds = dedup(touched);
+  const directIds = dedup(expectedDirectNeighbors);
+  const directLookup = new Set(directIds);
+  const tangentialLookup = new Set(dedup(tangentialEdges));
+  const touchedLookup = new Set(touchedIds);
+
+  const runaway = touchedIds.filter((id) => tangentialLookup.has(id));
+  // Only touched edges that appear in one of the two reference lists feed
+  // the F1. A touched tangential edge therefore counts as a false
+  // positive, while a touched edge listed in neither set is left out of
+  // the calculation entirely.
+  const classified = touchedIds.filter(
+    (id) => directLookup.has(id) || tangentialLookup.has(id),
+  );
+  const { f1 } = assertExpectedSet(classified, directIds);
+
+  return {
+    touchedDirectNeighbors: touchedIds.filter((id) => directLookup.has(id)),
+    missedDirectNeighbors: directIds.filter((id) => !touchedLookup.has(id)),
+    touchedTangentialNeighbors: runaway,
+    bounded: runaway.length === 0,
+    directNeighborF1: f1,
+  };
+}
+
+export type DemotionMetricKey = // Metric names scoreDemotion can emit; the cascade_* pair only when a cascade is supplied.
+ | "set_precision"
+ | "set_recall"
+ | "set_f1"
+ | "timestamp_discipline"
+ | "cascade_bounded"
+ | "cascade_direct_f1";
+
+export type DemotionMetricScore = { // One weighted component of the aggregate demotion score.
+ metric: DemotionMetricKey;
+ value: number; // scoreDemotion fills this with set ratios or a 0/1 flag
+ weight: number; // weights that are not positive contribute nothing to the aggregate
+};
+
+export type DemotionMatchInput = { // Everything scoreDemotion consumes; only the two id lists are required.
+  observedDemotions: readonly string[];
+  expectedDemotions: readonly string[];
+  retractActions?: readonly DemotionAction[]; // checked for "expired_at only"
+  softDeleteActions?: readonly DemotionAction[]; // checked for "both timestamps"
+  cascade?: { // when present, enables the single-hop cascade check
+    touched: readonly string[];
+    expectedDirectNeighbors: readonly string[];
+    tangentialEdges: readonly string[];
+  };
+  weights?: Partial<Record<DemotionMetricKey, number>>; // per-metric overrides spread over the defaults
+  passThreshold?: number; // falls back to the module default when omitted
+};
+
+export type DemotionMatchResult = { // Aggregate returned by scoreDemotion.
+ metrics: DemotionMetricScore[];
+ weightedScore: number; // weight-normalized mean of the metric values; 0 when no weight is positive
+ passThreshold: number;
+ passed: boolean; // false on any timestamp violation or unbounded cascade, whatever the score
+ set: SetCheckResult;
+ timestampViolations: TimestampViolation[];
+ cascade?: CascadeCheckResult; // undefined when the input had no cascade block
+};
+
+const DEFAULT_DEMOTION_WEIGHTS: Required<Record<DemotionMetricKey, number>> = { // every metric weighted equally unless the caller overrides it
+  set_precision: 1,
+  set_recall: 1,
+  set_f1: 1,
+  timestamp_discipline: 1,
+  cascade_bounded: 1,
+  cascade_direct_f1: 1,
+};
+
+const DEFAULT_DEMOTION_THRESHOLD = 0.6; // used by scoreDemotion when input.passThreshold is null or undefined
+
+/**
+ * Aggregate the structural side of demotion correctness. The LLM-judged
+ * "was this demotion warranted?" half is scored separately via the
+ * existing `judgeResponse` path; this returns deterministic metrics that
+ * can be asserted in CI without an LLM call.
+ */
+export function scoreDemotion(input: DemotionMatchInput): DemotionMatchResult {
+ const set = assertExpectedSet(
+ input.observedDemotions,
+ input.expectedDemotions,
+ );
+ const violations = assertTimestampDiscipline(
+ input.retractActions ?? [],
+ input.softDeleteActions ?? [],
+ );
+ const cascade = input.cascade
+ ? assertCascadeBounded(
+ input.cascade.touched,
+ input.cascade.expectedDirectNeighbors,
+ input.cascade.tangentialEdges,
+ )
+ : undefined;
+
+ const timestampScore = violations.length === 0 ? 1 : 0; // all-or-nothing: a single violation zeroes the metric
+
+ const weights = { ...DEFAULT_DEMOTION_WEIGHTS, ...(input.weights ?? {}) }; // NOTE(review): an explicit `undefined` entry in input.weights would replace its default here — confirm callers never pass one
+ const metrics: DemotionMetricScore[] = [
+ {
+ metric: "set_precision",
+ value: set.precision,
+ weight: weights.set_precision,
+ },
+ { metric: "set_recall", value: set.recall, weight: weights.set_recall },
+ { metric: "set_f1", value: set.f1, weight: weights.set_f1 },
+ {
+ metric: "timestamp_discipline",
+ value: timestampScore,
+ weight: weights.timestamp_discipline,
+ },
+ ];
+ if (cascade) { // cascade metrics join the average only when a cascade check ran
+ metrics.push(
+ {
+ metric: "cascade_bounded",
+ value: cascade.bounded ? 1 : 0,
+ weight: weights.cascade_bounded,
+ },
+ {
+ metric: "cascade_direct_f1",
+ value: cascade.directNeighborF1,
+ weight: weights.cascade_direct_f1,
+ },
+ );
+ }
+
+ const totalWeight = metrics.reduce(
+ (sum, m) => sum + Math.max(0, m.weight), // negative weights count as 0
+ 0,
+ );
+ const weightedScore =
+ totalWeight === 0
+ ? 0
+ : metrics.reduce(
+ (sum, m) => (m.weight > 0 ? sum + m.value * m.weight : sum),
+ 0,
+ ) / totalWeight;
+ const passThreshold = input.passThreshold ?? DEFAULT_DEMOTION_THRESHOLD;
+ const hardFail =
+ violations.length > 0 || (cascade !== undefined && !cascade.bounded); // either condition fails the run regardless of the weighted score
+ return {
+ metrics,
+ weightedScore,
+ passThreshold,
+ passed: !hardFail && weightedScore >= passThreshold,
+ set,
+ timestampViolations: violations,
+ cascade,
+ };
+}
diff --git a/src/domains/evaluation/demotion-scorer.test.ts b/src/domains/evaluation/demotion-scorer.test.ts
new file mode 100644
index 0000000..d16cf46
--- /dev/null
+++ b/src/domains/evaluation/demotion-scorer.test.ts
@@ -0,0 +1,159 @@
+import { describe, expect, test } from "bun:test";
+import { mkdtempSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import type {
+ AdapterReply,
+ DemotionConfig,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import {
+ coerceDemotionPayload,
+ resolveDemotionPayload,
+ scoreScenarioDemotion,
+} from "./demotion-scorer.ts";
+
+function makeTempDir(prefix: string): string { // Create a fresh directory under the OS temp dir and return its path.
+ return mkdtempSync(join(tmpdir(), `agentprobe-${prefix}-`)); // mkdtempSync appends random characters to the given prefix
+}
+
+function buildConfig(overrides: Partial<DemotionConfig> = {}): DemotionConfig { // Test fixture: a DemotionConfig with defaults for anything not overridden.
+  return {
+    expectedDemotions: overrides.expectedDemotions ?? ["e1", "e2"],
+    expectedRetracts: overrides.expectedRetracts, // optional fields pass through, undefined included
+    cascade: overrides.cascade,
+    weights: overrides.weights ?? {
+      set_precision: 1,
+      set_recall: 1,
+      set_f1: 1,
+      timestamp_discipline: 1,
+      cascade_bounded: 1,
+      cascade_direct_f1: 1,
+    },
+    passThreshold: overrides.passThreshold ?? 0.6, // `??` keeps an explicit 0 threshold
+    source: overrides.source,
+  };
+}
+
+function buildScenario(config: DemotionConfig | undefined): Scenario { // Test fixture: an otherwise empty scenario carrying `config` as its demotion block.
+ return {
+ id: "demotion-test",
+ name: "Demotion test",
+ tags: [],
+ turns: [],
+ sessions: [],
+ expectations: {
+ mustInclude: [],
+ mustNotInclude: [],
+ expectedTools: [],
+ failureModes: [],
+ },
+ demotion: config, // undefined makes scoreScenarioDemotion return undefined
+ };
+}
+
+function buildReply(payload: unknown): AdapterReply { // Test fixture: an adapter reply whose rawExchange carries `payload`.
+ return {
+ assistantText: "...",
+ toolCalls: [],
+ rawExchange: {
+ demotions: payload, // "demotions" is the scorer's default rawExchange key
+ } as unknown as AdapterReply["rawExchange"],
+ latencyMs: 0,
+ usage: {},
+ };
+}
+
+describe("coerceDemotionPayload", () => { // snake_case wire keys map onto the camelCase payload fields.
+ test("extracts observed UUIDs from `observed` and `demotions` keys", () => {
+ expect(coerceDemotionPayload({ observed: ["a", "b"] }).observed).toEqual([
+ "a",
+ "b",
+ ]);
+ expect(coerceDemotionPayload({ demotions: ["c"] }).observed).toEqual(["c"]); // `demotions` is accepted as an alias
+ });
+
+ test("extracts cascade and action records", () => {
+ const payload = coerceDemotionPayload({
+ observed: ["a"],
+ cascade_touched: ["e1", "e2"],
+ retract_actions: [
+ { uuid: "e1", expired_at_set: true, invalid_at_set: false },
+ ],
+ });
+ expect(payload.cascadeTouched).toEqual(["e1", "e2"]);
+ expect(payload.retractActions?.[0]?.uuid).toBe("e1");
+ });
+});
+
+describe("resolveDemotionPayload", () => { // Source precedence: fixture, then rawExchange, then "missing".
+ test("loads fixture relative to scenarios path", () => {
+ const dir = makeTempDir("demotion-fixture");
+ const fp = join(dir, "demo.json");
+ writeFileSync(
+ fp,
+ JSON.stringify({ observed: ["e1", "e2"], cascade_touched: ["e1"] }),
+ "utf8",
+ );
+ const config = buildConfig({ source: { fixture: "demo.json" } });
+ const result = resolveDemotionPayload(config, {
+ scenariosPath: join(dir, "scenarios.yaml"), // this test never writes the file; the path only supplies a directory
+ });
+ expect(result.source).toBe("fixture");
+ expect(result.payload.observed).toEqual(["e1", "e2"]);
+ expect(result.payload.cascadeTouched).toEqual(["e1"]);
+ });
+
+ test("reads from rawExchange when no fixture configured", () => {
+ const result = resolveDemotionPayload(buildConfig(), {
+ lastAdapterReply: buildReply({ observed: ["e1"] }),
+ });
+ expect(result.source).toBe("raw_exchange");
+ expect(result.payload.observed).toEqual(["e1"]);
+ });
+
+ test("missing source returns empty payload", () => {
+ expect(resolveDemotionPayload(buildConfig(), {}).source).toBe("missing"); // neither a fixture nor an adapter reply is supplied
+ });
+});
+
+describe("scoreScenarioDemotion", () => { // Scenario-level scoring built on resolveDemotionPayload and scoreDemotion.
+ test("returns undefined when no demotion block on the scenario", () => {
+ expect(scoreScenarioDemotion(buildScenario(undefined), {})).toBeUndefined();
+ });
+
+ test("perfect demotion passes", () => {
+ const scenario = buildScenario(buildConfig());
+ const reply = buildReply({ observed: ["e1", "e2"] }); // matches buildConfig's default expectedDemotions
+ const result = scoreScenarioDemotion(scenario, { lastAdapterReply: reply });
+ expect(result?.passed).toBe(true);
+ expect(result?.weightedScore).toBeGreaterThan(0.6);
+ });
+
+ test("runaway cascade flips passed to false", () => {
+ const scenario = buildScenario(
+ buildConfig({
+ expectedDemotions: ["e_ab", "e_bc"],
+ cascade: {
+ expectedDirectNeighbors: ["e_ab", "e_bc"],
+ tangentialEdges: ["e_cd"],
+ },
+ }),
+ );
+ const reply = buildReply({
+ observed: ["e_ab", "e_bc", "e_cd"],
+ cascade_touched: ["e_ab", "e_bc", "e_cd"], // includes the tangential edge e_cd
+ });
+ const result = scoreScenarioDemotion(scenario, { lastAdapterReply: reply });
+ expect(result?.cascadeBounded).toBe(false);
+ expect(result?.passed).toBe(false);
+ });
+
+ test("missing source produces a failing score with source=missing", () => {
+ const scenario = buildScenario(buildConfig());
+ const result = scoreScenarioDemotion(scenario, {}); // no reply and no fixture: observed is empty
+ expect(result?.source).toBe("missing");
+ expect(result?.passed).toBe(false);
+ });
+});
diff --git a/src/domains/evaluation/demotion-scorer.ts b/src/domains/evaluation/demotion-scorer.ts
new file mode 100644
index 0000000..e1ffb69
--- /dev/null
+++ b/src/domains/evaluation/demotion-scorer.ts
@@ -0,0 +1,191 @@
+import { existsSync, readFileSync, statSync } from "node:fs";
+import { dirname, isAbsolute, resolve } from "node:path";
+
+import type {
+ AdapterReply,
+ DemotionAction,
+ DemotionConfig,
+ DemotionScore,
+ EvalSource,
+ JsonValue,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts";
+import { logWarn } from "../../shared/utils/logging.ts";
+import { scoreDemotion } from "./demotion-match.ts";
+
+const DEFAULT_RAW_EXCHANGE_KEY = "demotions"; // rawExchange key read when config.source.rawExchangeKey is not set
+
+export type DemotionPayload = { // Normalized demotion report; each field is absent when its source key was not an array.
+ /** Observed demotion UUIDs. */
+ observed?: string[];
+ /** Optional raw retract / soft-delete action records for Snodgrass check. */
+ retractActions?: DemotionAction[];
+ softDeleteActions?: DemotionAction[];
+ /** Observed cascade edge UUIDs. */
+ cascadeTouched?: string[];
+};
+
+function resolveFixturePath(
+  scenariosPath: string | undefined,
+  fixture: string,
+): string {
+  // Absolute fixtures are used as-is; with no scenarios path, resolve() decides.
+  if (isAbsolute(fixture)) return fixture;
+  if (!scenariosPath) return resolve(fixture);
+  // A scenarios path that is an existing directory anchors the fixture
+  // itself; anything else (a file, or a missing path) anchors at its parent.
+  let anchorIsDirectory = false;
+  try {
+    anchorIsDirectory =
+      existsSync(scenariosPath) && statSync(scenariosPath).isDirectory();
+  } catch {
+    // If the filesystem probe throws, treat the path as a file and use
+    // its parent directory.
+    anchorIsDirectory = false;
+  }
+  const anchor = anchorIsDirectory ? scenariosPath : dirname(scenariosPath);
+  return resolve(anchor, fixture);
+}
+
+export function coerceDemotionPayload(payload: unknown): DemotionPayload { // Normalize untyped JSON into a DemotionPayload; non-object input yields {}.
+  if (!payload || typeof payload !== "object" || Array.isArray(payload)) {
+    return {}; // unlike the keyed object form, a bare array is not accepted here
+  }
+  const record = payload as Record<string, unknown>; // Record needs both key and value type arguments
+  const observed = Array.isArray(record.observed) // `observed` wins; `demotions` is read only when it is not an array
+    ? record.observed.filter((id): id is string => typeof id === "string")
+    : Array.isArray(record.demotions)
+      ? record.demotions.filter((id): id is string => typeof id === "string")
+      : undefined;
+  const cascadeTouched = Array.isArray(record.cascade_touched)
+    ? record.cascade_touched.filter(
+        (id): id is string => typeof id === "string",
+      )
+    : undefined;
+  const retractActions = Array.isArray(record.retract_actions)
+    ? record.retract_actions.flatMap(coerceAction) // malformed records coerce to [] and vanish
+    : undefined;
+  const softDeleteActions = Array.isArray(record.soft_delete_actions)
+    ? record.soft_delete_actions.flatMap(coerceAction)
+    : undefined;
+  return { observed, cascadeTouched, retractActions, softDeleteActions };
+}
+
+function coerceAction(value: unknown): DemotionAction[] { // Zero-or-one result so callers can flatMap: [] for unusable records, [action] otherwise.
+  if (!value || typeof value !== "object" || Array.isArray(value)) {
+    return [];
+  }
+  const obj = value as Record<string, unknown>; // Record needs both key and value type arguments
+  const uuid = typeof obj.uuid === "string" ? obj.uuid : undefined;
+  if (!uuid) {
+    return []; // a missing, non-string or empty uuid makes the record unusable
+  }
+  return [
+    {
+      uuid,
+      label: typeof obj.label === "string" ? obj.label : undefined,
+      expiredAtSet: obj.expired_at_set === true || obj.expiredAtSet === true, // snake_case or camelCase; only a literal `true` counts
+      invalidAtSet: obj.invalid_at_set === true || obj.invalidAtSet === true,
+      status: typeof obj.status === "string" ? obj.status : undefined,
+    },
+  ];
+}
+
+export type DemotionSourceContext = { // Where resolveDemotionPayload may look for the demotion report.
+ scenariosPath?: string; // anchor for relative fixture paths (a file or a directory)
+ lastAdapterReply?: AdapterReply; // its rawExchange is consulted when no fixture is configured
+};
+
+export type ResolvedDemotion = { // Result of resolveDemotionPayload.
+ payload: DemotionPayload;
+ source: EvalSource; // this module sets "fixture", "raw_exchange" or "missing"
+};
+
+export function resolveDemotionPayload( // Find the demotion report: a configured fixture first, then the last reply's rawExchange.
+  config: DemotionConfig,
+  context: DemotionSourceContext,
+): ResolvedDemotion {
+  const fixture = config.source?.fixture;
+  if (fixture) {
+    const resolved = resolveFixturePath(context.scenariosPath, fixture);
+    if (!existsSync(resolved)) {
+      throw new AgentProbeRuntimeError( // a configured fixture never falls back to rawExchange
+        `Demotion fixture not found: ${resolved}`,
+      );
+    }
+    return {
+      payload: coerceDemotionPayload(
+        JSON.parse(readFileSync(resolved, "utf8")), // JSON.parse throws on a malformed fixture
+      ),
+      source: "fixture",
+    };
+  }
+  const key = config.source?.rawExchangeKey ?? DEFAULT_RAW_EXCHANGE_KEY;
+  const rawExchange = context.lastAdapterReply?.rawExchange;
+  if (rawExchange && typeof rawExchange === "object") {
+    const candidate = (rawExchange as Record<string, JsonValue>)[key]; // index the exchange as a JSON object
+    if (candidate !== undefined) {
+      return {
+        payload: coerceDemotionPayload(candidate),
+        source: "raw_exchange",
+      };
+    }
+  }
+  return { payload: {}, source: "missing" }; // no fixture configured and nothing under the key
+}
+
+export function scoreScenarioDemotion( // Score observed demotions against scenario.demotion.
+ scenario: Scenario,
+ context: DemotionSourceContext,
+): DemotionScore | undefined {
+ const config = scenario.demotion;
+ if (!config) {
+ return undefined; // the scenario has no demotion block, so there is nothing to score
+ }
+
+ let resolution: ResolvedDemotion;
+ try {
+ resolution = resolveDemotionPayload(config, context);
+ } catch (error) {
+ logWarn( // resolution failures are logged and scored as a missing payload instead of being rethrown
+ `Demotion scorer failed to resolve payload for ${scenario.id}: ${
+ error instanceof Error ? error.message : String(error)
+ }`,
+ );
+ resolution = { payload: {}, source: "missing" };
+ }
+
+ const observed = resolution.payload.observed ?? [];
+ const cascadeTouched = resolution.payload.cascadeTouched ?? [];
+ const cascadeConfig = config.cascade;
+ const cascade = cascadeConfig // the cascade check runs only when the scenario configures one
+ ? {
+ touched: cascadeTouched.length > 0 ? cascadeTouched : observed, // NOTE(review): an explicitly empty cascade_touched also falls back to `observed` — confirm that is intended
+ expectedDirectNeighbors: cascadeConfig.expectedDirectNeighbors,
+ tangentialEdges: cascadeConfig.tangentialEdges,
+ }
+ : undefined;
+
+ const match = scoreDemotion({
+ observedDemotions: observed,
+ expectedDemotions: config.expectedDemotions,
+ retractActions: resolution.payload.retractActions,
+ softDeleteActions: resolution.payload.softDeleteActions,
+ cascade,
+ weights: config.weights,
+ passThreshold: config.passThreshold,
+ });
+
+ return {
+ metrics: match.metrics,
+ weightedScore: match.weightedScore,
+ passThreshold: config.passThreshold,
+ passed: match.passed,
+ observed,
+ expected: [...config.expectedDemotions], // copied so the score does not alias the scenario config
+ cascadeBounded: match.cascade?.bounded, // undefined when no cascade check ran
+ timestampViolationCount: match.timestampViolations.length,
+ source: resolution.source,
+ };
+}
diff --git a/src/domains/evaluation/procedure-match.test.ts b/src/domains/evaluation/procedure-match.test.ts
new file mode 100644
index 0000000..fcb5e7c
--- /dev/null
+++ b/src/domains/evaluation/procedure-match.test.ts
@@ -0,0 +1,160 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+ longestCommonSubsequenceLength,
+ orderSimilarity,
+ parameterCoverage,
+ scoreProcedure,
+ stepCoverage,
+} from "./procedure-match.ts";
+
+ // Known-answer checks for the set-level precision / recall / F1 of step labels.
+ describe("stepCoverage", () => {
+   test("perfect coverage yields F1 of 1.0", () => {
+     const { f1, matchedSteps, missingSteps, extraSteps } = stepCoverage(
+       ["a", "b", "c"],
+       ["a", "b", "c"],
+     );
+     expect(f1).toBeCloseTo(1.0, 6);
+     expect(matchedSteps).toEqual(["a", "b", "c"]);
+     expect(missingSteps).toEqual([]);
+     expect(extraSteps).toEqual([]);
+   });
+
+   test("missing one step drops recall", () => {
+     const { precision, recall, missingSteps } = stepCoverage(
+       ["a", "b"],
+       ["a", "b", "c"],
+     );
+     expect(precision).toBeCloseTo(1.0, 6);
+     expect(recall).toBeCloseTo(2 / 3, 6);
+     expect(missingSteps).toEqual(["c"]);
+   });
+
+   test("extra step drops precision", () => {
+     const { precision, recall, extraSteps } = stepCoverage(
+       ["a", "b", "junk"],
+       ["a", "b"],
+     );
+     expect(precision).toBeCloseTo(2 / 3, 6);
+     expect(recall).toBeCloseTo(1.0, 6);
+     expect(extraSteps).toEqual(["junk"]);
+   });
+
+   test("normalization is case-insensitive and whitespace-trimmed", () => {
+     const predicted = [" Open Ticket ", "ASSIGN"];
+     const golden = ["open ticket", "assign"];
+     expect(stepCoverage(predicted, golden).f1).toBeCloseTo(1.0, 6);
+   });
+ });
+
+ // Known-answer checks for the LCS dynamic program.
+ describe("longestCommonSubsequenceLength", () => {
+   test("identical sequences yield length |seq|", () => {
+     const length = longestCommonSubsequenceLength(
+       ["a", "b", "c"],
+       ["a", "b", "c"],
+     );
+     expect(length).toBe(3);
+   });
+
+   test("disjoint sequences yield 0", () => {
+     const length = longestCommonSubsequenceLength(["a", "b"], ["c", "d"]);
+     expect(length).toBe(0);
+   });
+
+   test("classic ABCBDAB / BDCAB example yields 4", () => {
+     // Canonical CLRS case: "bcab" is a longest common subsequence of
+     // "abcbdab" and "bdcab", so the expected length is 4.
+     const left = [..."abcbdab"];
+     const right = [..."bdcab"];
+     expect(longestCommonSubsequenceLength(left, right)).toBe(4);
+   });
+ });
+
+ // Known-answer checks for LCS / max(length) order similarity.
+ describe("orderSimilarity", () => {
+   test("identical order is 1.0", () => {
+     const similarity = orderSimilarity(["a", "b", "c"], ["a", "b", "c"]);
+     expect(similarity).toBeCloseTo(1.0, 6);
+   });
+
+   test("reversed order with shared elements drops below 1.0", () => {
+     // Any single shared label is a longest common subsequence of [a,b,c] and
+     // [c,b,a], so LCS = 1 and the score is 1 / max(3, 3) = 1/3.
+     const forward = ["a", "b", "c"];
+     const backward = ["c", "b", "a"];
+     expect(orderSimilarity(forward, backward)).toBeCloseTo(1 / 3, 6);
+   });
+
+   test("empty inputs degrade to 1", () => {
+     const similarity = orderSimilarity([], []);
+     expect(similarity).toBe(1);
+   });
+ });
+
+ // Known-answer checks for Jaccard overlap of parameter names.
+ describe("parameterCoverage", () => {
+   test("identical sets yield Jaccard 1.0", () => {
+     const names = ["ticket_id", "assignee"];
+     const { jaccard, missing, extra } = parameterCoverage(names, [...names]);
+     expect(jaccard).toBeCloseTo(1.0, 6);
+     expect(missing).toEqual([]);
+     expect(extra).toEqual([]);
+   });
+
+   test("missing one and extra one penalize symmetrically", () => {
+     const predicted = ["ticket_id", "junk"];
+     const golden = ["ticket_id", "assignee"];
+     // One shared name over a union of {ticket_id, junk, assignee} -> 1/3.
+     const { jaccard } = parameterCoverage(predicted, golden);
+     expect(jaccard).toBeCloseTo(1 / 3, 6);
+   });
+ });
+
+ // Known-answer checks for the weighted procedure score. Expected values are
+ // derived by hand in the comments so a change in the aggregation is caught
+ // exactly rather than by a loose bound.
+ describe("scoreProcedure", () => {
+   test("perfect match passes with weightedScore 1.0", () => {
+     const result = scoreProcedure({
+       predictedSteps: ["open ticket", "assign", "close"],
+       goldenSteps: ["open ticket", "assign", "close"],
+       predictedParameters: ["ticket_id"],
+       goldenParameters: ["ticket_id"],
+     });
+     expect(result.weightedScore).toBeCloseTo(1.0, 6);
+     expect(result.passed).toBe(true);
+   });
+
+   test("missing one of three steps still passes at default threshold", () => {
+     const result = scoreProcedure({
+       predictedSteps: ["open ticket", "assign"],
+       goldenSteps: ["open ticket", "assign", "close"],
+     });
+     // step_coverage F1 = 2 * (1 * 2/3) / (1 + 2/3) = 4/5
+     // step_order = LCS 2 / max(2, 3) = 2/3
+     // parameter_coverage (both empty) = 1
+     // weighted average = (4/5 + 2/3 + 1) / 3 = 37/45 ~= 0.822
+     expect(result.weightedScore).toBeCloseTo(37 / 45, 6);
+     expect(result.passed).toBe(true);
+   });
+
+   test("reversed order only fails once step_order is weighted heavily", () => {
+     const reversed = {
+       predictedSteps: ["close", "assign", "open ticket"],
+       goldenSteps: ["open ticket", "assign", "close"],
+     };
+
+     // Equal weights: F1 = 1, order = 1/3, parameters = 1
+     // -> (1 + 1/3 + 1) / 3 = 7/9 ~= 0.778, which still clears 0.6.
+     const balanced = scoreProcedure(reversed);
+     expect(balanced.weightedScore).toBeCloseTo(7 / 9, 6);
+     expect(balanced.passed).toBe(true);
+
+     // Order-heavy weights: (1 * 1 + 1/3 * 5 + 0) / 6 = 4/9 ~= 0.444.
+     const heavy = scoreProcedure({
+       ...reversed,
+       weights: { step_coverage: 1, step_order: 5, parameter_coverage: 0 },
+     });
+     expect(heavy.weightedScore).toBeCloseTo(4 / 9, 6);
+     expect(heavy.passed).toBe(false);
+   });
+
+   test("zero weights collapse cleanly", () => {
+     const result = scoreProcedure({
+       predictedSteps: ["a"],
+       goldenSteps: ["a"],
+       weights: { step_coverage: 0, step_order: 0, parameter_coverage: 0 },
+     });
+     expect(result.weightedScore).toBe(0);
+     expect(result.passed).toBe(false);
+   });
+ });
diff --git a/src/domains/evaluation/procedure-match.ts b/src/domains/evaluation/procedure-match.ts
new file mode 100644
index 0000000..c8869c3
--- /dev/null
+++ b/src/domains/evaluation/procedure-match.ts
@@ -0,0 +1,247 @@
+/**
+ * Procedure-structure matching primitives for the procedure-extraction scorer.
+ *
+ * Given a `golden` procedure (ordered list of step IDs / labels, optional
+ * parameter set) and a `predicted` procedure produced by the dream-pass
+ * procedure-synthesis pipeline (`ProcedureMemory`), score how well they
+ * match on three axes:
+ *
+ * 1. Step coverage — precision / recall / F1 over the set of step labels
+ * 2. Step order — longest-common-subsequence similarity over the two step
+ * sequences (LCS length / longer sequence length; not Levenshtein)
+ * 3. Parameter coverage — Jaccard over named parameters
+ *
+ * No I/O. All math is pure and pinned by known-answer tests.
+ */
+
+import { precisionAtK, recallAtK } from "./ranking.ts";
+
+ /** Canonical form used for label comparison: surrounding whitespace removed, then lower-cased. */
+ function normalize(value: string): string {
+   const trimmed = value.trim();
+   return trimmed.toLowerCase();
+ }
+
+ /** Normalize every label and drop duplicates, keeping first-occurrence order. */
+ function unique(values: readonly string[]): string[] {
+   const seen = new Set<string>();
+   for (const value of values) {
+     seen.add(normalize(value));
+   }
+   return Array.from(seen);
+ }
+
+ export type StepCoverage = { // set-level comparison returned by stepCoverage
+ precision: number; // matched / unique predicted labels; 1 when both lists are empty, 0 when only predicted is empty
+ recall: number; // matched / unique golden labels; 1 when golden is empty
+ f1: number; // harmonic mean of precision and recall; 0 when both are 0
+ matchedSteps: string[]; // normalized predicted labels that also appear in golden
+ missingSteps: string[]; // normalized golden labels absent from predicted
+ extraSteps: string[]; // normalized predicted labels absent from golden
+ };
+
+/**
+ * Set-level coverage of predicted steps vs golden steps. Uses normalized
+ * exact equality (case-insensitive, whitespace-trimmed) — substring matching
+ * would be too lax for procedure step labels.
+ */
+export function stepCoverage(
+ predicted: readonly string[],
+ golden: readonly string[],
+): StepCoverage {
+ const predicted_ = unique(predicted);
+ const golden_ = unique(golden);
+ const goldenSet = new Set(golden_);
+ const predictedSet = new Set(predicted_);
+
+ const matched = predicted_.filter((step) => goldenSet.has(step));
+ const missing = golden_.filter((step) => !predictedSet.has(step));
+ const extra = predicted_.filter((step) => !goldenSet.has(step));
+
+ const precision =
+ predicted_.length === 0
+ ? golden_.length === 0
+ ? 1
+ : 0
+ : matched.length / predicted_.length;
+ const recall = golden_.length === 0 ? 1 : matched.length / golden_.length;
+ const f1 =
+ precision + recall === 0
+ ? 0
+ : (2 * precision * recall) / (precision + recall);
+ return {
+ precision,
+ recall,
+ f1,
+ matchedSteps: matched,
+ missingSteps: missing,
+ extraSteps: extra,
+ };
+}
+
+/**
+ * Length of the longest common subsequence between `a` and `b`. O(|a| * |b|)
+ * time and space — fine for procedures of <100 steps; we don't need the
+ * Hirschberg refinement.
+ */
+export function longestCommonSubsequenceLength(
+ a: readonly string[],
+ b: readonly string[],
+): number {
+ const an = a.map(normalize);
+ const bn = b.map(normalize);
+ const rows = an.length + 1;
+ const cols = bn.length + 1;
+ const dp = new Array(rows * cols).fill(0);
+ const at = (i: number, j: number): number => dp[i * cols + j] ?? 0;
+ const set = (i: number, j: number, value: number): void => {
+ dp[i * cols + j] = value;
+ };
+ for (let i = 1; i < rows; i += 1) {
+ for (let j = 1; j < cols; j += 1) {
+ if (an[i - 1] === bn[j - 1]) {
+ set(i, j, at(i - 1, j - 1) + 1);
+ } else {
+ set(i, j, Math.max(at(i - 1, j), at(i, j - 1)));
+ }
+ }
+ }
+ return at(an.length, bn.length);
+}
+
+/**
+ * Order similarity in [0, 1]. Computed as `LCS / max(|a|, |b|)`. Two
+ * identical sequences yield 1; two with no shared elements yield 0.
+ *
+ * This is intentionally different from raw Levenshtein. Procedures are
+ * order-sensitive but tolerate insertions/deletions; LCS-normalized
+ * similarity matches what the dream-pass extractor is trying to recover.
+ */
+export function orderSimilarity(
+ predicted: readonly string[],
+ golden: readonly string[],
+): number {
+ if (predicted.length === 0 && golden.length === 0) {
+ return 1;
+ }
+ const denom = Math.max(predicted.length, golden.length);
+ if (denom === 0) {
+ return 1;
+ }
+ const lcs = longestCommonSubsequenceLength(predicted, golden);
+ return lcs / denom;
+}
+
+ export type ParameterCoverage = { // set-level comparison returned by parameterCoverage
+ jaccard: number; // |matched| / |predicted ∪ golden| after normalization; 1 when both lists are empty
+ matched: string[]; // normalized predicted names that also appear in golden
+ missing: string[]; // normalized golden names absent from predicted
+ extra: string[]; // normalized predicted names absent from golden
+ };
+
+ /**
+  * Jaccard overlap between predicted and golden parameter names.
+  *
+  * Names are normalized (trimmed, lower-cased) and de-duplicated before
+  * comparison. Two empty lists count as a perfect match (Jaccard 1).
+  *
+  * @param predicted Parameter names produced by the extractor.
+  * @param golden Expected parameter names.
+  */
+ export function parameterCoverage(
+   predicted: readonly string[],
+   golden: readonly string[],
+ ): ParameterCoverage {
+   const predictedNames = unique(predicted);
+   const goldenNames = unique(golden);
+   const goldenLookup = new Set(goldenNames);
+   const predictedLookup = new Set(predictedNames);
+
+   const matched: string[] = [];
+   const extra: string[] = [];
+   for (const name of predictedNames) {
+     if (goldenLookup.has(name)) {
+       matched.push(name);
+     } else {
+       extra.push(name);
+     }
+   }
+   const missing = goldenNames.filter((name) => !predictedLookup.has(name));
+
+   const union = new Set(predictedNames);
+   for (const name of goldenNames) {
+     union.add(name);
+   }
+   const jaccard = union.size === 0 ? 1 : matched.length / union.size;
+
+   return { jaccard, matched, missing, extra };
+ }
+
+ export type ProcedureMatchInput = { // input accepted by scoreProcedure
+ predictedSteps: readonly string[]; // extractor output, in order; feeds both step coverage and order similarity
+ goldenSteps: readonly string[]; // expected steps, in order
+ predictedParameters?: readonly string[]; // treated as an empty list when omitted
+ goldenParameters?: readonly string[]; // treated as an empty list when omitted
+ weights?: { // overrides layered on top of DEFAULT_WEIGHTS
+ step_coverage?: number;
+ step_order?: number;
+ parameter_coverage?: number;
+ };
+ passThreshold?: number; // falls back to DEFAULT_PASS_THRESHOLD when omitted
+ };
+
+ export type ProcedureMetricKey = // names of the three metrics scoreProcedure reports
+ | "step_coverage"
+ | "step_order"
+ | "parameter_coverage";
+
+ export type ProcedureMetricScore = {
+ metric: ProcedureMetricKey;
+ value: number; // the metric's score (F1, order similarity, or Jaccard)
+ weight: number; // weight as resolved for this metric; non-positive weights are excluded from weightedScore
+ };
+
+ export type ProcedureScoreResult = {
+ metrics: ProcedureMetricScore[]; // one entry per metric, reported even when its weight is 0
+ weightedScore: number; // weighted average over positively weighted metrics; 0 when the total weight is 0
+ passThreshold: number; // threshold that `passed` was evaluated against
+ passed: boolean; // weightedScore >= passThreshold
+ step: StepCoverage; // full step-coverage breakdown
+ order: number; // order similarity as returned by orderSimilarity
+ parameters: ParameterCoverage; // full parameter-coverage breakdown
+ };
+
+ const DEFAULT_WEIGHTS = { // baseline weights (equal); keys supplied in input.weights override these
+ step_coverage: 1,
+ step_order: 1,
+ parameter_coverage: 1,
+ };
+
+ const DEFAULT_PASS_THRESHOLD = 0.6; // used by scoreProcedure when input.passThreshold is not provided
+
+/**
+ * Score a single (predicted, golden) procedure pair. Use this when the
+ * extractor produces one procedure per query.
+ */
+export function scoreProcedure(
+ input: ProcedureMatchInput,
+): ProcedureScoreResult {
+ const step = stepCoverage(input.predictedSteps, input.goldenSteps);
+ const order = orderSimilarity(input.predictedSteps, input.goldenSteps);
+ const parameters = parameterCoverage(
+ input.predictedParameters ?? [],
+ input.goldenParameters ?? [],
+ );
+ const weights = { ...DEFAULT_WEIGHTS, ...(input.weights ?? {}) };
+
+ const metrics: ProcedureMetricScore[] = [
+ { metric: "step_coverage", value: step.f1, weight: weights.step_coverage },
+ { metric: "step_order", value: order, weight: weights.step_order },
+ {
+ metric: "parameter_coverage",
+ value: parameters.jaccard,
+ weight: weights.parameter_coverage,
+ },
+ ];
+
+ const totalWeight = metrics.reduce(
+ (sum, item) => sum + Math.max(0, item.weight),
+ 0,
+ );
+ const weightedScore =
+ totalWeight === 0
+ ? 0
+ : metrics.reduce(
+ (sum, item) =>
+ item.weight > 0 ? sum + item.value * item.weight : sum,
+ 0,
+ ) / totalWeight;
+ const passThreshold = input.passThreshold ?? DEFAULT_PASS_THRESHOLD;
+ return {
+ metrics,
+ weightedScore,
+ passThreshold,
+ passed: weightedScore >= passThreshold,
+ step,
+ order,
+ parameters,
+ };
+}
+
+/**
+ * Re-export the ranking primitives so callers that want to compose a
+ * procedure score with retrieval-style metrics have one entry point.
+ */
+export { precisionAtK, recallAtK };
diff --git a/src/domains/evaluation/procedure-scorer.test.ts b/src/domains/evaluation/procedure-scorer.test.ts
new file mode 100644
index 0000000..0298a73
--- /dev/null
+++ b/src/domains/evaluation/procedure-scorer.test.ts
@@ -0,0 +1,136 @@
+import { describe, expect, test } from "bun:test";
+import { mkdtempSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import type {
+ AdapterReply,
+ ProcedureConfig,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import {
+ coerceProcedurePayload,
+ resolveProcedurePayload,
+ scoreScenarioProcedure,
+} from "./procedure-scorer.ts";
+
+ /** Create a fresh temporary directory whose name starts with `agentprobe-<prefix>-`. */
+ function makeTempDir(prefix: string): string {
+   const template = join(tmpdir(), `agentprobe-${prefix}-`);
+   return mkdtempSync(template);
+ }
+
+ /**
+  * Build a `ProcedureConfig` for tests. Any field left out of `overrides`
+  * gets a default: three golden steps, equal weights and a 0.6 threshold;
+  * `goldenParameters` and `source` stay undefined unless supplied.
+  */
+ function buildConfig(
+   overrides: Partial<ProcedureConfig> = {},
+ ): ProcedureConfig {
+   return {
+     goldenSteps: overrides.goldenSteps ?? ["open ticket", "assign", "close"],
+     goldenParameters: overrides.goldenParameters,
+     weights: overrides.weights ?? {
+       step_coverage: 1,
+       step_order: 1,
+       parameter_coverage: 1,
+     },
+     passThreshold: overrides.passThreshold ?? 0.6,
+     source: overrides.source,
+   };
+ }
+
+ /** Build a minimal scenario (no turns, empty expectations) carrying `config` as its `procedure` block. */
+ function buildScenario(config: ProcedureConfig | undefined): Scenario {
+   const scenario: Scenario = {
+     id: "procedure-test",
+     name: "Procedure test",
+     tags: [],
+     turns: [],
+     sessions: [],
+     expectations: {
+       mustInclude: [],
+       mustNotInclude: [],
+       expectedTools: [],
+       failureModes: [],
+     },
+     procedure: config,
+   };
+   return scenario;
+ }
+
+ /** Build an adapter reply whose `rawExchange.procedure` holds `payload`. */
+ function buildReply(payload: unknown): AdapterReply {
+   // The double cast lets tests inject arbitrary payload shapes.
+   const rawExchange = {
+     procedure: payload,
+   } as unknown as AdapterReply["rawExchange"];
+   return {
+     assistantText: "...",
+     toolCalls: [],
+     rawExchange,
+     latencyMs: 0,
+     usage: {},
+   };
+ }
+
+ // Payload coercion: bare arrays and `{ steps, parameters }` objects.
+ describe("coerceProcedurePayload", () => {
+   test("accepts a bare list of steps", () => {
+     const { steps } = coerceProcedurePayload(["a", "b"]);
+     expect(steps).toEqual(["a", "b"]);
+   });
+
+   test("extracts steps and parameters from object payloads", () => {
+     const { steps, parameters } = coerceProcedurePayload({
+       steps: ["open", "close"],
+       parameters: ["ticket_id"],
+     });
+     expect(steps).toEqual(["open", "close"]);
+     expect(parameters).toEqual(["ticket_id"]);
+   });
+ });
+
+ // Payload resolution: fixture file first, otherwise the adapter's rawExchange.
+ describe("resolveProcedurePayload", () => {
+   test("loads from fixture", () => {
+     const dir = makeTempDir("procedure-fixture");
+     const fixtureBody = JSON.stringify({ steps: ["open", "close"] });
+     writeFileSync(join(dir, "proc.json"), fixtureBody, "utf8");
+
+     // The fixture path is relative, so it resolves next to the scenarios file.
+     const { source, payload } = resolveProcedurePayload(
+       buildConfig({ source: { fixture: "proc.json" } }),
+       { scenariosPath: join(dir, "scenarios.yaml") },
+     );
+     expect(source).toBe("fixture");
+     expect(payload.steps).toEqual(["open", "close"]);
+   });
+
+   test("loads from rawExchange", () => {
+     const lastAdapterReply = buildReply({ steps: ["a", "b"] });
+     const { source } = resolveProcedurePayload(buildConfig(), {
+       lastAdapterReply,
+     });
+     expect(source).toBe("raw_exchange");
+   });
+ });
+
+ // End-to-end scoring of a scenario's `procedure` block.
+ describe("scoreScenarioProcedure", () => {
+   test("returns undefined when no procedure block on scenario", () => {
+     const score = scoreScenarioProcedure(buildScenario(undefined), {});
+     expect(score).toBeUndefined();
+   });
+
+   test("perfect match passes", () => {
+     const lastAdapterReply = buildReply({
+       steps: ["open ticket", "assign", "close"],
+     });
+     const result = scoreScenarioProcedure(buildScenario(buildConfig()), {
+       lastAdapterReply,
+     });
+     expect(result?.weightedScore).toBeCloseTo(1.0, 6);
+     expect(result?.passed).toBe(true);
+   });
+
+   test("reordered steps drop weighted score", () => {
+     // Order dominates the weighting, so a fully reversed procedure must fail.
+     const orderHeavy = buildConfig({
+       weights: { step_coverage: 1, step_order: 5, parameter_coverage: 0 },
+     });
+     const lastAdapterReply = buildReply({
+       steps: ["close", "assign", "open ticket"],
+     });
+     const result = scoreScenarioProcedure(buildScenario(orderHeavy), {
+       lastAdapterReply,
+     });
+     expect(result?.passed).toBe(false);
+   });
+ });
diff --git a/src/domains/evaluation/procedure-scorer.ts b/src/domains/evaluation/procedure-scorer.ts
new file mode 100644
index 0000000..0f98ccc
--- /dev/null
+++ b/src/domains/evaluation/procedure-scorer.ts
@@ -0,0 +1,146 @@
+import { existsSync, readFileSync, statSync } from "node:fs";
+import { dirname, isAbsolute, resolve } from "node:path";
+
+import type {
+ AdapterReply,
+ EvalSource,
+ JsonValue,
+ ProcedureConfig,
+ ProcedureScore,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts";
+import { logWarn } from "../../shared/utils/logging.ts";
+import { scoreProcedure } from "./procedure-match.ts";
+
+ const DEFAULT_RAW_EXCHANGE_KEY = "procedure"; // rawExchange key read when config.source.rawExchangeKey is unset
+
+ export type ProcedurePayload = { // normalized shape produced by coerceProcedurePayload
+ steps?: string[]; // string entries of the payload's step list, when one was present
+ parameters?: string[]; // string entries of `parameters`, when the payload carried such an array
+ };
+
+ /**
+  * Turn a fixture reference into an absolute path.
+  *
+  * Absolute fixtures are returned untouched. Relative fixtures resolve against
+  * `scenariosPath` itself when it is an existing directory, otherwise against
+  * its parent directory; with no `scenariosPath` they resolve against the
+  * current working directory.
+  */
+ function resolveFixturePath(
+   scenariosPath: string | undefined,
+   fixture: string,
+ ): string {
+   if (isAbsolute(fixture)) {
+     return fixture;
+   }
+   if (!scenariosPath) {
+     return resolve(fixture);
+   }
+
+   let pathIsDirectory = false;
+   try {
+     pathIsDirectory =
+       existsSync(scenariosPath) && statSync(scenariosPath).isDirectory();
+   } catch {
+     // A failed stat is treated the same as "not a directory".
+     pathIsDirectory = false;
+   }
+
+   const base = pathIsDirectory ? scenariosPath : dirname(scenariosPath);
+   return resolve(base, fixture);
+ }
+
+ /**
+  * Coerce an arbitrary value into a `ProcedurePayload`.
+  *
+  * - Non-objects (including `null`) yield an empty payload.
+  * - A bare array is read as the step list.
+  * - Any other object contributes its `steps` and `parameters` arrays; a
+  *   field that is not an array is left `undefined`.
+  *
+  * Non-string entries are dropped from every list.
+  */
+ export function coerceProcedurePayload(payload: unknown): ProcedurePayload {
+   if (!payload || typeof payload !== "object") {
+     return {};
+   }
+
+   const stringsOnly = (items: unknown[]): string[] =>
+     items.filter((item): item is string => typeof item === "string");
+
+   if (Array.isArray(payload)) {
+     return { steps: stringsOnly(payload) };
+   }
+
+   const record = payload as Record<string, unknown>;
+   const steps = Array.isArray(record.steps)
+     ? stringsOnly(record.steps)
+     : undefined;
+   const parameters = Array.isArray(record.parameters)
+     ? stringsOnly(record.parameters)
+     : undefined;
+   return { steps, parameters };
+ }
+
+ export type ProcedureSourceContext = {
+ scenariosPath?: string; // file or directory that relative fixture paths are resolved against
+ lastAdapterReply?: AdapterReply; // reply whose rawExchange is searched when no fixture is configured
+ };
+
+ export type ResolvedProcedure = {
+ payload: ProcedurePayload;
+ source: EvalSource; // resolveProcedurePayload sets "fixture", "raw_exchange" or "missing"
+ };
+
+ /**
+  * Locate the predicted procedure for a scenario.
+  *
+  * Resolution order:
+  *   1. `config.source.fixture` — a JSON file, resolved relative to the
+  *      scenarios path;
+  *   2. the adapter reply's `rawExchange`, under `config.source.rawExchangeKey`
+  *      (default "procedure");
+  *   3. otherwise an empty payload with source "missing".
+  *
+  * @throws AgentProbeRuntimeError when a configured fixture does not exist.
+  *   Invalid JSON in the fixture surfaces as the error thrown by `JSON.parse`.
+  */
+ export function resolveProcedurePayload(
+   config: ProcedureConfig,
+   context: ProcedureSourceContext,
+ ): ResolvedProcedure {
+   const fixture = config.source?.fixture;
+   if (fixture) {
+     const fixturePath = resolveFixturePath(context.scenariosPath, fixture);
+     if (!existsSync(fixturePath)) {
+       throw new AgentProbeRuntimeError(
+         `Procedure fixture not found: ${fixturePath}`,
+       );
+     }
+     const parsed: unknown = JSON.parse(readFileSync(fixturePath, "utf8"));
+     return { payload: coerceProcedurePayload(parsed), source: "fixture" };
+   }
+
+   const key = config.source?.rawExchangeKey ?? DEFAULT_RAW_EXCHANGE_KEY;
+   const rawExchange = context.lastAdapterReply?.rawExchange;
+   if (rawExchange && typeof rawExchange === "object") {
+     const candidate = (rawExchange as Record<string, JsonValue>)[key];
+     // A key that is present (even with a null value) counts as a raw_exchange
+     // hit; only a missing key falls through to "missing".
+     if (candidate !== undefined) {
+       return {
+         payload: coerceProcedurePayload(candidate),
+         source: "raw_exchange",
+       };
+     }
+   }
+   return { payload: {}, source: "missing" };
+ }
+
+ /**
+  * Score a scenario's `procedure` expectations against the resolved payload.
+  *
+  * Returns `undefined` when the scenario carries no `procedure` block. When
+  * the payload cannot be resolved, a warning is logged and scoring proceeds
+  * with an empty payload whose source is reported as "missing".
+  */
+ export function scoreScenarioProcedure(
+   scenario: Scenario,
+   context: ProcedureSourceContext,
+ ): ProcedureScore | undefined {
+   const { procedure: config } = scenario;
+   if (!config) {
+     return undefined;
+   }
+
+   const resolveOrFallback = (): ResolvedProcedure => {
+     try {
+       return resolveProcedurePayload(config, context);
+     } catch (error) {
+       const reason = error instanceof Error ? error.message : String(error);
+       logWarn(
+         `Procedure scorer failed to resolve payload for ${scenario.id}: ${reason}`,
+       );
+       return { payload: {}, source: "missing" };
+     }
+   };
+   const { payload, source } = resolveOrFallback();
+   const predictedSteps = payload.steps ?? [];
+
+   const match = scoreProcedure({
+     predictedSteps,
+     goldenSteps: config.goldenSteps,
+     predictedParameters: payload.parameters,
+     goldenParameters: config.goldenParameters,
+     weights: config.weights,
+     passThreshold: config.passThreshold,
+   });
+
+   // Step lists are copied so the returned score does not alias the payload
+   // or the scenario config.
+   return {
+     metrics: match.metrics,
+     weightedScore: match.weightedScore,
+     passThreshold: config.passThreshold,
+     passed: match.passed,
+     predictedSteps: [...predictedSteps],
+     goldenSteps: [...config.goldenSteps],
+     source,
+   };
+ }
diff --git a/src/domains/evaluation/ranking.test.ts b/src/domains/evaluation/ranking.test.ts
new file mode 100644
index 0000000..96c4f26
--- /dev/null
+++ b/src/domains/evaluation/ranking.test.ts
@@ -0,0 +1,217 @@
+import { describe, expect, test } from "bun:test";
+
+import {
+ buildRelevanceVector,
+ countUniqueGoldHits,
+ mrr,
+ ndcgAtK,
+ precisionAtK,
+ recallAtK,
+ scoreRanking,
+} from "./ranking.ts";
+
+ // Known-answer checks for precision@k.
+ describe("precisionAtK", () => {
+   test("counts hits in the top-k window", () => {
+     const topTwo = precisionAtK([1, 1, 0], 2);
+     const topThree = precisionAtK([1, 0, 1], 3);
+     const noHits = precisionAtK([0, 0, 0], 3);
+     expect(topTwo).toBeCloseTo(1.0, 6);
+     expect(topThree).toBeCloseTo(2 / 3, 6);
+     expect(noHits).toBe(0);
+   });
+
+   test("uses k as the denominator even when fewer items were returned", () => {
+     // Both returned items are relevant, but with k=5 the short list is
+     // still penalized: 2 hits / 5.
+     const shortList = [1, 1];
+     expect(precisionAtK(shortList, 5)).toBeCloseTo(2 / 5, 6);
+   });
+
+   test("returns 0 for non-positive k", () => {
+     const precision = precisionAtK([1, 1, 1], 0);
+     expect(precision).toBe(0);
+   });
+ });
+
+ // Known-answer checks for recall@k.
+ describe("recallAtK", () => {
+   test("returns 1 when there are no expected items", () => {
+     const recall = recallAtK([0, 0, 0], 5, 0);
+     expect(recall).toBe(1);
+   });
+
+   test("scales by total relevant", () => {
+     const twoOfTwo = recallAtK([1, 0, 1], 3, 2);
+     const twoOfFour = recallAtK([1, 0, 1], 3, 4);
+     const oneOfTwo = recallAtK([1, 0, 0], 3, 2);
+     expect(twoOfTwo).toBeCloseTo(1.0, 6);
+     expect(twoOfFour).toBeCloseTo(0.5, 6);
+     expect(oneOfTwo).toBeCloseTo(0.5, 6);
+   });
+
+   test("only counts hits within the cutoff", () => {
+     // The single hit sits at rank 3, so it only counts once k reaches 3.
+     const hitAtRankThree = [0, 0, 1];
+     expect(recallAtK(hitAtRankThree, 2, 1)).toBe(0);
+     expect(recallAtK(hitAtRankThree, 3, 1)).toBe(1);
+   });
+ });
+
+ // Known-answer checks for (single-query) reciprocal rank.
+ describe("mrr", () => {
+   test("returns the reciprocal of the first hit rank", () => {
+     const firstHitAtThree = mrr([0, 0, 1]);
+     const firstHitAtOne = mrr([1, 0, 0]);
+     const firstHitAtTwo = mrr([0, 1, 1]);
+     expect(firstHitAtThree).toBeCloseTo(1 / 3, 6);
+     expect(firstHitAtOne).toBeCloseTo(1.0, 6);
+     expect(firstHitAtTwo).toBeCloseTo(0.5, 6);
+   });
+
+   test("returns 0 when no hits", () => {
+     const reciprocalRank = mrr([0, 0, 0]);
+     expect(reciprocalRank).toBe(0);
+   });
+
+   test("respects the k cutoff", () => {
+     // The hit is at rank 3, so k=2 must treat it as no hit.
+     const hitAtRankThree = [0, 0, 1];
+     expect(mrr(hitAtRankThree, 2)).toBe(0);
+     expect(mrr(hitAtRankThree, 3)).toBeCloseTo(1 / 3, 6);
+   });
+ });
+
+ // Known-answer checks for NDCG@k with the log2(rank + 1) discount.
+ describe("ndcgAtK", () => {
+   test("perfect ranking yields 1", () => {
+     const allRelevant = ndcgAtK([1, 1, 1], 3);
+     const relevantFirst = ndcgAtK([1, 1, 0], 3);
+     expect(allRelevant).toBeCloseTo(1.0, 6);
+     expect(relevantFirst).toBeCloseTo(1.0, 6);
+   });
+
+   test("NDCG of [1, 0, 1] with log2 discount", () => {
+     // DCG   = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1 + 0 + 0.5 = 1.5
+     // ideal = DCG of [1, 1, 0] = 1/log2(2) + 1/log2(3) = 1 + ~0.6309 = ~1.6309
+     // NDCG  = 1.5 / 1.6309 = ~0.9197
+     const ndcg = ndcgAtK([1, 0, 1], 3);
+     expect(ndcg).toBeCloseTo(0.91972, 4);
+   });
+
+   test("returns 0 when no relevant items exist", () => {
+     const ndcg = ndcgAtK([0, 0, 0], 3);
+     expect(ndcg).toBe(0);
+   });
+
+   test("respects the k cutoff", () => {
+     const hitAtRankThree = [0, 0, 1];
+     // With k=2 the only relevant item is outside the window, so DCG is 0.
+     expect(ndcgAtK(hitAtRankThree, 2)).toBe(0);
+     // With k=3: DCG = 1/log2(4) = 0.5 and ideal = 1, so NDCG = 0.5.
+     expect(ndcgAtK(hitAtRankThree, 3)).toBeCloseTo(0.5, 6);
+   });
+ });
+
+ // Match-policy checks for the binary relevance vector.
+ describe("buildRelevanceVector", () => {
+   test("substring policy is case-insensitive and bidirectional", () => {
+     // Golden item contained in the returned item.
+     const returned = ["Sarah's email address", "Random other note"];
+     const golden = ["sarah"];
+     expect(buildRelevanceVector(returned, golden, "substring")).toEqual([1, 0]);
+
+     // Reverse direction: returned item contained in the golden item.
+     expect(
+       buildRelevanceVector(["sarah"], ["Sarah's email address"], "substring"),
+     ).toEqual([1]);
+   });
+
+   test("exact policy requires full normalized equality", () => {
+     expect(
+       buildRelevanceVector(
+         ["Atlas Project Status"],
+         ["atlas project status"],
+         "exact",
+       ),
+     ).toEqual([1]);
+     expect(
+       buildRelevanceVector(
+         ["Atlas Project"],
+         ["Atlas Project Status"],
+         "exact",
+       ),
+     ).toEqual([0]);
+   });
+
+   test("regex policy interprets the golden item as a pattern", () => {
+     expect(buildRelevanceVector(["budget: $50K"], ["\\$50k"], "regex")).toEqual(
+       [1],
+     );
+   });
+
+   test("returns 0 for empty golden item to avoid false matches", () => {
+     expect(buildRelevanceVector(["anything"], [""], "substring")).toEqual([0]);
+   });
+ });
+
+ // Unique gold coverage within the top-k window.
+ describe("countUniqueGoldHits", () => {
+   test("dedupes duplicate returns against the same gold item", () => {
+     const hits = countUniqueGoldHits(
+       ["Sarah", "Sarah", "Atlas"],
+       ["Sarah", "Atlas"],
+       5,
+     );
+     expect(hits).toBe(2);
+   });
+
+   test("respects k cutoff", () => {
+     const returned = ["Atlas", "Sarah"];
+     const golden = ["Sarah", "Atlas"];
+     const withinOne = countUniqueGoldHits(returned, golden, 1);
+     const withinTwo = countUniqueGoldHits(returned, golden, 2);
+     expect(withinOne).toBe(1);
+     expect(withinTwo).toBe(2);
+   });
+ });
+
+ // End-to-end checks for the combined ranking score.
+ describe("scoreRanking", () => {
+   test("perfect top-k returns weightedScore 1 and passes", () => {
+     const { k, hitCount, forbiddenHits, weightedScore, passed } = scoreRanking({
+       returned: ["sarahs email", "atlas project status"],
+       golden: ["sarah", "atlas project"],
+       k: 2,
+     });
+
+     expect(k).toBe(2);
+     expect(hitCount).toBe(2);
+     expect(forbiddenHits).toBe(0);
+     expect(weightedScore).toBeCloseTo(1.0, 6);
+     expect(passed).toBe(true);
+   });
+
+   test("missing gold items lower recall and weighted score", () => {
+     const { hitCount, weightedScore, passed } = scoreRanking({
+       returned: ["unrelated note"],
+       golden: ["sarah", "atlas project"],
+       k: 5,
+       passThreshold: 0.5,
+     });
+
+     expect(hitCount).toBe(0);
+     expect(weightedScore).toBe(0);
+     expect(passed).toBe(false);
+   });
+
+   test("forbidden hits force a fail even when score is high", () => {
+     const { forbiddenHits, passed } = scoreRanking({
+       returned: ["sarah", "old budget figure"],
+       golden: ["sarah"],
+       forbidden: ["old budget"],
+       k: 2,
+       passThreshold: 0.3,
+     });
+
+     expect(forbiddenHits).toBe(1);
+     expect(passed).toBe(false);
+   });
+
+   test("weight=0 excludes a metric from weightedScore without dropping the report", () => {
+     const weights = {
+       precision_at_k: 1,
+       recall_at_k: 0,
+       mrr: 1,
+       ndcg_at_k: 1,
+     };
+     const { metrics, weightedScore } = scoreRanking({
+       returned: ["sarah"],
+       golden: ["sarah", "atlas"],
+       weights,
+       k: 1,
+     });
+
+     const recall = metrics.find((item) => item.metric === "recall_at_k");
+     expect(recall?.weight).toBe(0);
+     // Recall at k=1 with 2 gold items is 0.5, but its weight is 0, so the
+     // average covers precision=1, mrr=1, ndcg=1 only -> 1.0.
+     expect(weightedScore).toBeCloseTo(1.0, 6);
+   });
+
+   test("defaults k to max(|returned|, |golden|, 1)", () => {
+     const { k } = scoreRanking({
+       returned: ["a", "b", "c"],
+       golden: ["a"],
+     });
+     expect(k).toBe(3);
+   });
+
+   test("zero golden items yields trivial recall=1 and a score driven by precision-style metrics", () => {
+     const { metrics } = scoreRanking({
+       returned: [],
+       golden: [],
+     });
+     const recall = metrics.find((item) => item.metric === "recall_at_k");
+     expect(recall?.value).toBe(1);
+   });
+ });
diff --git a/src/domains/evaluation/ranking.ts b/src/domains/evaluation/ranking.ts
new file mode 100644
index 0000000..a449b43
--- /dev/null
+++ b/src/domains/evaluation/ranking.ts
@@ -0,0 +1,377 @@
+/**
+ * Pure information-retrieval ranking metrics.
+ *
+ * All functions take a `relevance` vector — the binary relevance (0 or 1)
+ * of the returned list at each rank position. They return values in [0, 1].
+ *
+ * No I/O, no LLM calls; this module is intended to be the load-bearing math
+ * behind the YAML `retrieval:` scorer. Tests pin the algebra against
+ * known-answer cases.
+ */
+
+ /**
+  * Resolve the effective cutoff for a list of `length` items: `floor(k)` capped
+  * at `length`, or the full length when `k` is undefined, non-finite or <= 0.
+  */
+ function clampK(length: number, k: number | undefined): number {
+   if (k !== undefined && Number.isFinite(k) && k > 0) {
+     return Math.min(length, Math.floor(k));
+   }
+   return length;
+ }
+
+/**
+ * Precision@k — fraction of the top-k returned items that are relevant.
+ *
+ * When `k` exceeds the returned list, the denominator stays at `k` so that
+ * a short list still gets penalized for not surfacing enough items. This
+ * matches the `pytrec_eval` convention.
+ */
+export function precisionAtK(relevance: number[], k: number): number {
+ if (k <= 0) {
+ return 0;
+ }
+ const limit = Math.min(relevance.length, Math.floor(k));
+ let hits = 0;
+ for (let index = 0; index < limit; index += 1) {
+ if ((relevance[index] ?? 0) > 0) {
+ hits += 1;
+ }
+ }
+ return hits / Math.floor(k);
+}
+
+/**
+ * Recall@k — fraction of all relevant items that appear in the top-k.
+ *
+ * `totalRelevant` is the total number of items the suite expected to be
+ * relevant (the size of the golden set), not the count of relevant items
+ * actually returned. When `totalRelevant` is 0, recall is defined as 1
+ * (no expectations means nothing to miss).
+ */
+export function recallAtK(
+ relevance: number[],
+ k: number,
+ totalRelevant: number,
+): number {
+ if (totalRelevant <= 0) {
+ return 1;
+ }
+ if (k <= 0) {
+ return 0;
+ }
+ const limit = clampK(relevance.length, k);
+ let hits = 0;
+ for (let index = 0; index < limit; index += 1) {
+ if ((relevance[index] ?? 0) > 0) {
+ hits += 1;
+ }
+ }
+ return hits / totalRelevant;
+}
+
+/**
+ * Mean reciprocal rank — `1 / rankOfFirstHit`, or 0 when no relevant item is
+ * returned. Computed for a single query (the "mean" is implicit when the
+ * caller averages across multiple queries).
+ *
+ * When `k` is provided, only the first `k` positions are considered, so a
+ * hit at rank `k + 1` is treated as no hit.
+ */
+export function mrr(relevance: number[], k?: number): number {
+ const limit = clampK(relevance.length, k);
+ for (let index = 0; index < limit; index += 1) {
+ if ((relevance[index] ?? 0) > 0) {
+ return 1 / (index + 1);
+ }
+ }
+ return 0;
+}
+
+ /** Discounted cumulative gain over the first `k` positions, using a log2(rank + 1) discount. */
+ function dcgAtK(relevance: number[], k: number): number {
+   const cutoff = clampK(relevance.length, k);
+   return relevance.slice(0, cutoff).reduce((total, gain, position) => {
+     const rel = gain ?? 0;
+     // Ranks start at 1, so position 0 is discounted by log2(2) = 1.
+     return rel <= 0 ? total : total + rel / Math.log2(position + 2);
+   }, 0);
+ }
+
+/**
+ * Normalized discounted cumulative gain at k.
+ *
+ * Uses the classic `log2(rank + 1)` discount and idealizes DCG against the
+ * relevance vector sorted descending. With binary relevance this collapses to
+ * the standard NDCG@k.
+ *
+ * When the ideal DCG is 0 (no relevant items expected), NDCG is defined as 0.
+ */
+export function ndcgAtK(relevance: number[], k: number): number {
+ if (k <= 0) {
+ return 0;
+ }
+ const idealRelevance = [...relevance].sort((left, right) => right - left);
+ const ideal = dcgAtK(idealRelevance, k);
+ if (ideal <= 0) {
+ return 0;
+ }
+ return dcgAtK(relevance, k) / ideal;
+}
+
+ export type RankingMetricKey = // names of the four ranking metrics
+ | "precision_at_k"
+ | "recall_at_k"
+ | "mrr"
+ | "ndcg_at_k";
+
+ export type RankingMetricResult = {
+ metric: RankingMetricKey;
+ value: number; // the metric's score
+ weight: number; // weight attached to the metric; the tests show a weight of 0 is still reported
+ };
+
+ export type RankingScoreResult = {
+ k: number; // rank cutoff that was applied
+ totalRelevant: number; // presumably the size of the golden set — confirm in scoreRanking
+ totalReturned: number; // presumably the number of returned items — confirm in scoreRanking
+ hitCount: number; // the tests expect 2 for two covered gold items and 0 for none; unique vs. total counting — confirm in scoreRanking
+ forbiddenHits: number; // count of forbidden matches; the tests expect any forbidden hit to force passed=false
+ metrics: RankingMetricResult[];
+ /** Weighted average across the metrics that carry positive weight. */
+ weightedScore: number;
+ /** True when score >= `passThreshold` AND no forbidden items appeared in top-k. */
+ passed: boolean;
+ };
+
+ /** Optional per-metric weight overrides, keyed by metric name. */
+ export type RankingWeights = Partial<Record<RankingMetricKey, number>>;
+
+ export type RankingScoreInput = {
+ /** The list of returned items, in rank order. */
+ returned: string[];
+ /** The golden set of relevant items. */
+ golden: string[];
+ /**
+ * Optional forbidden items. Any forbidden item that appears in the top-k
+ * forces `passed: false` and is reported via `forbiddenHits`.
+ */
+ forbidden?: string[];
+ /** Rank cutoff. Defaults to `Math.max(returned.length, golden.length)`. NOTE(review): the ranking tests describe the default as max(|returned|, |golden|, 1) — confirm against scoreRanking. */
+ k?: number;
+ /** Per-metric weights. Metrics with weight 0 (or absent) are still reported but excluded from `weightedScore`. */
+ weights?: RankingWeights;
+ /** Match policy applied to each `returned` vs `golden` comparison. */
+ match?: MatchPolicy;
+ /** Pass threshold on the `weightedScore`. Defaults to 0.5. */
+ passThreshold?: number;
+ };
+
+ export type MatchPolicy = "exact" | "substring" | "regex"; // exact: normalized equality; substring: containment in either direction; regex: golden item used as a case-insensitive pattern
+
+/** Weight applied to each metric when the caller does not override it: all metrics count equally. */
+const DEFAULT_WEIGHTS: Required<RankingWeights> = {
+  precision_at_k: 1,
+  recall_at_k: 1,
+  mrr: 1,
+  ndcg_at_k: 1,
+};
+
+/** Minimum `weightedScore` required to pass when the caller supplies no threshold. */
+const DEFAULT_PASS_THRESHOLD = 0.5;
+
+/** Canonical comparison form: surrounding whitespace removed, then lower-cased. */
+function normalizeString(value: string): string {
+  const trimmed = value.trim();
+  return trimmed.toLowerCase();
+}
+
+/**
+ * Decide whether a returned item matches an expected item under `policy`.
+ *
+ * - "exact": both sides are equal after `normalizeString`.
+ * - "substring": after `normalizeString`, either side contains the other.
+ *   A blank value on either side never matches.
+ * - "regex": `expected` is compiled as a case-insensitive pattern and tested
+ *   against the raw `returned` string; an invalid pattern yields `false`
+ *   instead of throwing.
+ *
+ * @param returned Item produced by the system under test.
+ * @param expected Golden (or forbidden) item to compare against.
+ * @param policy Comparison strategy.
+ * @returns `true` when the pair matches.
+ */
+function matchesItem(
+  returned: string,
+  expected: string,
+  policy: MatchPolicy,
+): boolean {
+  switch (policy) {
+    case "exact":
+      return normalizeString(returned) === normalizeString(expected);
+    case "substring": {
+      const candidate = normalizeString(returned);
+      const needle = normalizeString(expected);
+      // Every string "includes" the empty string, so a blank value on either
+      // side would otherwise match everything it is compared with.
+      if (!needle || !candidate) {
+        return false;
+      }
+      return candidate.includes(needle) || needle.includes(candidate);
+    }
+    case "regex":
+      try {
+        return new RegExp(expected, "i").test(returned);
+      } catch {
+        // An uncompilable pattern is treated as "no match".
+        return false;
+      }
+  }
+}
+
+/**
+ * Produce a 0/1 relevance vector aligned with `returned`.
+ *
+ * Position `i` is 1 when `returned[i]` matches at least one golden item under
+ * `policy`, and 0 otherwise. Each position is judged on its own, so a
+ * duplicated returned item that matches is marked 1 at every occurrence;
+ * unique gold coverage is tracked separately by `countUniqueGoldHits`.
+ *
+ * @param returned Returned items in rank order.
+ * @param golden Golden set of relevant items.
+ * @param policy Match policy; defaults to "substring".
+ * @returns Vector of 0/1 values, one per returned item.
+ */
+export function buildRelevanceVector(
+  returned: string[],
+  golden: string[],
+  policy: MatchPolicy = "substring",
+): number[] {
+  const matchesAnyGolden = (candidate: string): boolean =>
+    golden.some((expected) => matchesItem(candidate, expected, policy));
+
+  return returned.map((candidate) => {
+    if (matchesAnyGolden(candidate)) {
+      return 1;
+    }
+    return 0;
+  });
+}
+
+/**
+ * Count how many distinct golden items are matched by the top-k of `returned`.
+ *
+ * A golden item is counted at most once no matter how many returned items
+ * match it, which makes the result usable as a recall numerator even when
+ * `returned` contains duplicates.
+ *
+ * @param returned Returned items in rank order.
+ * @param golden Golden set of relevant items.
+ * @param k Rank cutoff; a value <= 0 yields 0.
+ * @param policy Match policy; defaults to "substring".
+ * @returns Number of distinct golden items covered.
+ */
+export function countUniqueGoldHits(
+  returned: string[],
+  golden: string[],
+  k: number,
+  policy: MatchPolicy = "substring",
+): number {
+  if (k <= 0) {
+    return 0;
+  }
+  const cutoff = clampK(returned.length, k);
+  // Indices into `golden` that have already been matched.
+  const coveredGold = new Set<number>();
+  for (let rank = 0; rank < cutoff; rank += 1) {
+    const candidate = returned[rank] ?? "";
+    for (const [goldIndex, expected] of golden.entries()) {
+      if (
+        !coveredGold.has(goldIndex) &&
+        matchesItem(candidate, expected ?? "", policy)
+      ) {
+        coveredGold.add(goldIndex);
+      }
+    }
+  }
+  return coveredGold.size;
+}
+
+/**
+ * Count the top-k returned items that match at least one forbidden item.
+ * Each returned position contributes at most 1, however many forbidden
+ * entries it matches.
+ */
+function countForbiddenHits(
+  returned: string[],
+  forbidden: string[],
+  k: number,
+  policy: MatchPolicy,
+): number {
+  if (k <= 0 || forbidden.length === 0) {
+    return 0;
+  }
+  const isForbidden = (candidate: string): boolean =>
+    forbidden.some((forbiddenItem) =>
+      matchesItem(candidate, forbiddenItem, policy),
+    );
+
+  const cutoff = clampK(returned.length, k);
+  let total = 0;
+  for (let rank = 0; rank < cutoff; rank += 1) {
+    if (isForbidden(returned[rank] ?? "")) {
+      total += 1;
+    }
+  }
+  return total;
+}
+
+/**
+ * Score a ranked result list against a golden set.
+ *
+ * Computes precision@k, recall@k, MRR and NDCG@k, combines the metrics that
+ * carry a positive weight into a weighted average, and decides pass/fail.
+ * A forbidden item in the top-k fails the run regardless of the score.
+ *
+ * Notes:
+ * - Recall is based on distinct golden items covered, and is 1 when the
+ *   golden set is empty.
+ * - `hitCount` counts relevant entries over the whole returned list.
+ * - `weightedScore` is 0 when no metric has a positive weight.
+ *
+ * @param input Returned/golden lists plus optional cutoff, weights, match
+ *   policy, forbidden items and pass threshold.
+ * @returns Counts, per-metric values, the weighted score and the verdict.
+ */
+export function scoreRanking(input: RankingScoreInput): RankingScoreResult {
+  const { returned, golden } = input;
+  const policy = input.match ?? "substring";
+  const k = clampK(Math.max(returned.length, golden.length, 1), input.k);
+
+  const relevance = buildRelevanceVector(returned, golden, policy);
+  const uniqueHits = countUniqueGoldHits(returned, golden, k, policy);
+
+  // Caller-supplied weights override the defaults key by key.
+  const weights: Required<RankingWeights> = {
+    ...DEFAULT_WEIGHTS,
+    ...(input.weights ?? {}),
+  };
+
+  // Unique gold coverage keeps recall meaningful when `returned` has duplicates.
+  const recall = golden.length === 0 ? 1 : uniqueHits / golden.length;
+
+  const values: Record<RankingMetricKey, number> = {
+    precision_at_k: precisionAtK(relevance, k),
+    recall_at_k: recall,
+    mrr: mrr(relevance, k),
+    ndcg_at_k: ndcgAtK(relevance, k),
+  };
+  const metrics: RankingMetricResult[] = (
+    Object.keys(values) as RankingMetricKey[]
+  ).map((metric) => ({
+    metric,
+    value: values[metric],
+    weight: weights[metric],
+  }));
+
+  // Only metrics with a strictly positive weight take part in the average.
+  let totalWeight = 0;
+  let weightedSum = 0;
+  for (const { value, weight } of metrics) {
+    if (weight > 0) {
+      totalWeight += weight;
+      weightedSum += value * weight;
+    }
+  }
+  const weightedScore = totalWeight === 0 ? 0 : weightedSum / totalWeight;
+
+  const forbiddenHits = countForbiddenHits(
+    returned,
+    input.forbidden ?? [],
+    k,
+    policy,
+  );
+  const passThreshold = input.passThreshold ?? DEFAULT_PASS_THRESHOLD;
+  const passed = forbiddenHits === 0 && weightedScore >= passThreshold;
+
+  const hitCount = relevance.filter((value) => value > 0).length;
+
+  return {
+    k,
+    totalRelevant: golden.length,
+    totalReturned: returned.length,
+    hitCount,
+    forbiddenHits,
+    metrics,
+    weightedScore,
+    passed,
+  };
+}
diff --git a/src/domains/evaluation/retrieval-scorer.test.ts b/src/domains/evaluation/retrieval-scorer.test.ts
new file mode 100644
index 0000000..44a1455
--- /dev/null
+++ b/src/domains/evaluation/retrieval-scorer.test.ts
@@ -0,0 +1,241 @@
+import { describe, expect, test } from "bun:test";
+import { mkdtempSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import type {
+ AdapterReply,
+ RetrievalConfig,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import {
+ coerceRetrievedItems,
+ resolveRetrievedItems,
+ scoreRetrieval,
+} from "./retrieval-scorer.ts";
+
+/**
+ * Create a uniquely named directory under the OS temp directory and return
+ * its path. The name starts with `agentprobe-<prefix>-`.
+ */
+function makeTempDir(prefix: string): string {
+  const template = join(tmpdir(), `agentprobe-${prefix}-`);
+  return mkdtempSync(template);
+}
+
+/**
+ * Build a `RetrievalConfig` for tests.
+ *
+ * Any field present in `overrides` wins; otherwise a two-item golden set,
+ * no forbidden items, equal metric weights, a 0.5 pass threshold and the
+ * "substring" match policy are used. `k` and `source` stay undefined unless
+ * overridden.
+ */
+function buildConfig(
+  overrides: Partial<RetrievalConfig> = {},
+): RetrievalConfig {
+  return {
+    golden: overrides.golden ?? ["Sarah's email", "Atlas project status"],
+    forbidden: overrides.forbidden ?? [],
+    k: overrides.k,
+    weights: overrides.weights ?? {
+      precision_at_k: 1,
+      recall_at_k: 1,
+      mrr: 1,
+      ndcg_at_k: 1,
+    },
+    passThreshold: overrides.passThreshold ?? 0.5,
+    match: overrides.match ?? "substring",
+    source: overrides.source,
+  };
+}
+
+/**
+ * Build a minimal `Scenario` whose only interesting part is the supplied
+ * `retrieval` block; every list-valued field is empty.
+ */
+function buildScenario(retrieval: RetrievalConfig | undefined): Scenario {
+  const expectations = {
+    mustInclude: [],
+    mustNotInclude: [],
+    expectedTools: [],
+    failureModes: [],
+  };
+  return {
+    id: "retrieval-test",
+    name: "Retrieval Test",
+    tags: [],
+    turns: [],
+    sessions: [],
+    expectations,
+    retrieval,
+  };
+}
+
+/**
+ * Build an `AdapterReply` whose raw exchange carries `retrieved` under the
+ * default `retrieved` key. Passing `undefined` produces an empty raw exchange.
+ */
+function buildReply(retrieved: unknown): AdapterReply {
+  const exchange = retrieved === undefined ? {} : { retrieved };
+  return {
+    assistantText: "...",
+    toolCalls: [],
+    rawExchange: exchange as unknown as AdapterReply["rawExchange"],
+    latencyMs: 0,
+    usage: {},
+  };
+}
+
+// Payload normalization: every supported payload shape must flatten to a
+// plain list of strings, and unsupported shapes must yield an empty list.
+describe("coerceRetrievedItems", () => {
+ test("returns the string itself for a single-string payload", () => {
+ expect(coerceRetrievedItems("only one")).toEqual(["only one"]);
+ });
+
+ test("flattens arrays of strings", () => {
+ expect(coerceRetrievedItems(["a", "b"])).toEqual(["a", "b"]);
+ });
+
+ test("extracts label/text/name/summary/id from object payloads", () => {
+ // Each object exposes exactly one of the recognized label fields.
+ const payload = [
+ { label: "Sarah" },
+ { text: "Atlas" },
+ { name: "Marcus" },
+ { summary: "Northstar" },
+ { id: "fact-123" },
+ ];
+ expect(coerceRetrievedItems(payload)).toEqual([
+ "Sarah",
+ "Atlas",
+ "Marcus",
+ "Northstar",
+ "fact-123",
+ ]);
+ });
+
+ test("ignores entries with no recognizable label", () => {
+ // Unknown keys and nullish entries are dropped rather than stringified.
+ expect(coerceRetrievedItems([{ irrelevant: 42 }, null, undefined])).toEqual(
+ [],
+ );
+ });
+
+ test("returns [] for non-array, non-string payloads", () => {
+ expect(coerceRetrievedItems(42)).toEqual([]);
+ expect(coerceRetrievedItems({ foo: "bar" })).toEqual([]);
+ });
+});
+
+// Source resolution: a configured fixture is used when present, otherwise
+// the raw exchange of the last adapter reply, otherwise "missing".
+describe("resolveRetrievedItems", () => {
+ test("reads a JSON fixture relative to the scenarios path", () => {
+ const dir = makeTempDir("retrieval-fixture");
+ const fixturePath = join(dir, "memories.json");
+ writeFileSync(
+ fixturePath,
+ JSON.stringify(["Sarah's email", "Atlas project status"]),
+ "utf8",
+ );
+
+ const config = buildConfig({
+ source: { fixture: "memories.json" },
+ });
+
+ // scenarios.yaml is never created; only its directory is used to locate
+ // the fixture.
+ const result = resolveRetrievedItems(config, {
+ scenariosPath: join(dir, "scenarios.yaml"),
+ });
+
+ expect(result.source).toBe("fixture");
+ expect(result.items).toEqual(["Sarah's email", "Atlas project status"]);
+ });
+
+ test("falls back to the default `retrieved` raw exchange key", () => {
+ // buildReply stores its argument under the `retrieved` key.
+ const result = resolveRetrievedItems(buildConfig(), {
+ lastAdapterReply: buildReply([{ label: "Sarah" }]),
+ });
+ expect(result.source).toBe("raw_exchange");
+ expect(result.items).toEqual(["Sarah"]);
+ });
+
+ test("honors a custom rawExchangeKey", () => {
+ const config = buildConfig({
+ source: { rawExchangeKey: "memories" },
+ });
+ // Built by hand because buildReply always uses the `retrieved` key.
+ const reply: AdapterReply = {
+ assistantText: "...",
+ toolCalls: [],
+ rawExchange: {
+ memories: ["A", "B"],
+ } as unknown as AdapterReply["rawExchange"],
+ latencyMs: 0,
+ usage: {},
+ };
+
+ const result = resolveRetrievedItems(config, { lastAdapterReply: reply });
+ expect(result.items).toEqual(["A", "B"]);
+ expect(result.source).toBe("raw_exchange");
+ });
+
+ test("returns `missing` source when no fixture or raw exchange field is available", () => {
+ const result = resolveRetrievedItems(buildConfig(), {});
+ expect(result.source).toBe("missing");
+ expect(result.items).toEqual([]);
+ });
+
+ test("throws for a missing fixture file", () => {
+ // Absolute fixture paths are used as-is, so this path is looked up verbatim.
+ const config = buildConfig({
+ source: { fixture: "/nonexistent/path/to/file.json" },
+ });
+ expect(() => resolveRetrievedItems(config, {})).toThrow(
+ /Retrieval fixture not found/,
+ );
+ });
+});
+
+// End-to-end scoring of a scenario's retrieval block: opt-out, perfect
+// retrieval, forbidden items, missing data and fixture-backed data.
+describe("scoreRetrieval", () => {
+ test("returns undefined when the scenario has no retrieval block", () => {
+ expect(scoreRetrieval(buildScenario(undefined), {})).toBeUndefined();
+ });
+
+ test("scores a perfect retrieval as passed and weightedScore 1", () => {
+ // The reply returns exactly the default golden items, in order.
+ const scenario = buildScenario(buildConfig({ k: 2 }));
+ const reply = buildReply(["Sarah's email", "Atlas project status"]);
+
+ const result = scoreRetrieval(scenario, { lastAdapterReply: reply });
+
+ expect(result).toBeDefined();
+ expect(result?.source).toBe("raw_exchange");
+ expect(result?.hitCount).toBe(2);
+ expect(result?.weightedScore).toBeCloseTo(1.0, 6);
+ expect(result?.passed).toBe(true);
+ });
+
+ test("flags a forbidden hit and forces a fail", () => {
+ // NOTE(review): the low pass threshold is presumably there so that the
+ // forbidden hit, not the metric score, is what causes the failure.
+ const scenario = buildScenario(
+ buildConfig({
+ golden: ["I do not have that"],
+ forbidden: ["$50K"],
+ k: 3,
+ passThreshold: 0.2,
+ }),
+ );
+ const reply = buildReply([
+ "I do not have that information",
+ "The Q2 marketing budget was $50K",
+ ]);
+
+ const result = scoreRetrieval(scenario, { lastAdapterReply: reply });
+
+ expect(result?.hitCount).toBeGreaterThan(0);
+ expect(result?.forbiddenHits).toBe(1);
+ expect(result?.passed).toBe(false);
+ });
+
+ test("missing source records a 0-hit score with source=missing", () => {
+ // No fixture and no adapter reply: a score is still produced.
+ const scenario = buildScenario(buildConfig({ k: 5, passThreshold: 0.5 }));
+ const result = scoreRetrieval(scenario, {});
+
+ expect(result?.source).toBe("missing");
+ expect(result?.hitCount).toBe(0);
+ expect(result?.passed).toBe(false);
+ });
+
+ test("loads retrieved items from a fixture relative to the scenarios path", () => {
+ const dir = makeTempDir("retrieval-fixture-scored");
+ writeFileSync(
+ join(dir, "memories.json"),
+ JSON.stringify(["Sarah's email", "Atlas project status"]),
+ "utf8",
+ );
+ const scenario = buildScenario(
+ buildConfig({
+ k: 2,
+ source: { fixture: "memories.json" },
+ }),
+ );
+
+ const result = scoreRetrieval(scenario, {
+ scenariosPath: join(dir, "scenarios.yaml"),
+ });
+
+ expect(result?.source).toBe("fixture");
+ expect(result?.passed).toBe(true);
+ });
+});
diff --git a/src/domains/evaluation/retrieval-scorer.ts b/src/domains/evaluation/retrieval-scorer.ts
new file mode 100644
index 0000000..4a774fa
--- /dev/null
+++ b/src/domains/evaluation/retrieval-scorer.ts
@@ -0,0 +1,205 @@
+import { existsSync, readFileSync, statSync } from "node:fs";
+import { dirname, isAbsolute, resolve } from "node:path";
+
+import type {
+ AdapterReply,
+ JsonValue,
+ RetrievalConfig,
+ RetrievalScore,
+ Scenario,
+} from "../../shared/types/contracts.ts";
+import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts";
+import { logWarn } from "../../shared/utils/logging.ts";
+import { scoreRanking } from "./ranking.ts";
+
+/** Raw-exchange field read for retrieved items when the config names no `rawExchangeKey`. */
+const DEFAULT_RAW_EXCHANGE_KEY = "retrieved";
+
+/**
+ * Object fields that may carry an item's label, in priority order.
+ */
+const RETRIEVED_LABEL_FIELDS = [
+  "label",
+  "text",
+  "title",
+  "name",
+  "summary",
+  "id",
+  "uuid",
+] as const;
+
+/**
+ * Convert a raw exchange/fixture payload into a flat list of strings.
+ *
+ * Accepts:
+ * - `["item one", "item two"]`
+ * - `[{ label: "foo" }, { id: "bar", label: "bar" }]`
+ * - `[{ name: "foo" }]` (fields are tried in the order label, text, title,
+ *   name, summary, id, uuid)
+ * - A single string (treated as a one-element list)
+ *
+ * Within an array, numbers and booleans are stringified. For objects, the
+ * first label field holding a string or number is used; a field holding any
+ * other kind of value is skipped so that a usable later field is not lost.
+ * Entries with no usable label (including `null`, `undefined` and nested
+ * arrays) are dropped.
+ *
+ * Any payload that is neither a string nor an array yields an empty list
+ * without throwing, so the scorer records a miss for the scenario instead of
+ * crashing the whole suite.
+ *
+ * @param payload Untrusted value read from a fixture or a raw exchange.
+ * @returns The extracted labels, in payload order.
+ */
+export function coerceRetrievedItems(payload: unknown): string[] {
+  if (typeof payload === "string") {
+    return [payload];
+  }
+  if (!Array.isArray(payload)) {
+    return [];
+  }
+  const out: string[] = [];
+  for (const item of payload) {
+    if (typeof item === "string") {
+      out.push(item);
+      continue;
+    }
+    if (typeof item === "number" || typeof item === "boolean") {
+      out.push(String(item));
+      continue;
+    }
+    if (!item || typeof item !== "object" || Array.isArray(item)) {
+      continue;
+    }
+    const record = item as Record<string, JsonValue>;
+    for (const field of RETRIEVED_LABEL_FIELDS) {
+      const label = record[field];
+      if (typeof label === "string") {
+        out.push(label);
+        break;
+      }
+      if (typeof label === "number") {
+        out.push(String(label));
+        break;
+      }
+    }
+  }
+  return out;
+}
+
+/**
+ * Turn a retrieval `source.fixture` value into a filesystem path.
+ *
+ * - An absolute `fixture` is returned unchanged.
+ * - Without a `scenariosPath`, a relative `fixture` resolves against the
+ *   current working directory.
+ * - Otherwise it resolves against `scenariosPath` when that is an existing
+ *   directory, or against its parent directory in every other case.
+ */
+function resolveFixturePath(
+  scenariosPath: string | undefined,
+  fixture: string,
+): string {
+  if (isAbsolute(fixture)) {
+    return fixture;
+  }
+  if (!scenariosPath) {
+    return resolve(fixture);
+  }
+  // A failing stat is treated the same as "not a directory".
+  const isExistingDirectory = (candidate: string): boolean => {
+    try {
+      return existsSync(candidate) && statSync(candidate).isDirectory();
+    } catch {
+      return false;
+    }
+  };
+  const baseDir = isExistingDirectory(scenariosPath)
+    ? scenariosPath
+    : dirname(scenariosPath);
+  return resolve(baseDir, fixture);
+}
+
+/**
+ * Read `fixturePath` as UTF-8 text and parse it as JSON. Read errors and
+ * JSON syntax errors propagate to the caller.
+ */
+function readFixture(fixturePath: string): unknown {
+  const parsed: unknown = JSON.parse(readFileSync(fixturePath, "utf8"));
+  return parsed;
+}
+
+/** Run-time inputs from which the retrieved items can be located. */
+export type RetrievalSourceContext = {
+ /** Path of the scenarios file or directory; relative fixture paths are resolved against it. */
+ scenariosPath?: string;
+ /** Reply whose `rawExchange` is inspected when no fixture is configured. */
+ lastAdapterReply?: AdapterReply;
+};
+
+/** Retrieved items as plain strings, plus where they were obtained from. */
+export type RetrievedItemsResult = {
+ /** Retrieved items, in the order they appeared in the source. */
+ items: string[];
+ /** Origin of `items`; this module produces "fixture", "raw_exchange" or "missing". */
+ source: RetrievalScore["source"];
+};
+
+/**
+ * Locate the retrieved items for a scenario at scoring time.
+ *
+ * Sources are tried in this order:
+ * 1. `retrieval.source.fixture`: the JSON file is read and coerced to
+ *    strings.
+ * 2. The field named by `retrieval.source.rawExchangeKey` (default
+ *    `retrieved`) on the last adapter reply's `rawExchange`.
+ *
+ * When neither yields a value the result is `{ items: [], source: "missing" }`
+ * so the scorer can record an honest miss rather than guessing.
+ *
+ * @throws AgentProbeRuntimeError when a configured fixture file does not exist.
+ */
+export function resolveRetrievedItems(
+  config: RetrievalConfig,
+  context: RetrievalSourceContext,
+): RetrievedItemsResult {
+  const configuredFixture = config.source?.fixture;
+  if (configuredFixture) {
+    const fixturePath = resolveFixturePath(
+      context.scenariosPath,
+      configuredFixture,
+    );
+    if (!existsSync(fixturePath)) {
+      throw new AgentProbeRuntimeError(
+        `Retrieval fixture not found: ${fixturePath}`,
+      );
+    }
+    return {
+      items: coerceRetrievedItems(readFixture(fixturePath)),
+      source: "fixture",
+    };
+  }
+
+  const exchangeKey = config.source?.rawExchangeKey ?? DEFAULT_RAW_EXCHANGE_KEY;
+  const exchange = context.lastAdapterReply?.rawExchange;
+  if (!exchange || typeof exchange !== "object") {
+    return { items: [], source: "missing" };
+  }
+
+  const payload = (exchange as Record<string, JsonValue>)[exchangeKey];
+  if (payload === undefined) {
+    return { items: [], source: "missing" };
+  }
+  return { items: coerceRetrievedItems(payload), source: "raw_exchange" };
+}
+
+/**
+ * Score the retrieval block of a scenario.
+ *
+ * Returns `undefined` when the scenario defines no retrieval block. In every
+ * other case a `RetrievalScore` is returned: if the retrieved items cannot be
+ * resolved (for example a missing fixture), a warning is logged and the
+ * scenario is scored against an empty list with `source: "missing"`.
+ *
+ * @param scenario Scenario whose `retrieval` config drives the scoring.
+ * @param context Where to look for the retrieved items.
+ */
+export function scoreRetrieval(
+  scenario: Scenario,
+  context: RetrievalSourceContext,
+): RetrievalScore | undefined {
+  const config = scenario.retrieval;
+  if (!config) {
+    return undefined;
+  }
+
+  // Resolution problems must not abort the suite; they degrade to a miss.
+  const resolveOrMissing = (): RetrievedItemsResult => {
+    try {
+      return resolveRetrievedItems(config, context);
+    } catch (error) {
+      const reason = error instanceof Error ? error.message : String(error);
+      logWarn(
+        `Retrieval scoring failed to resolve items for scenario ${scenario.id}: ${reason}`,
+      );
+      return { items: [], source: "missing" };
+    }
+  };
+  const { items, source } = resolveOrMissing();
+
+  const { golden, forbidden, k, weights, match, passThreshold } = config;
+  const ranking = scoreRanking({
+    returned: items,
+    golden,
+    forbidden,
+    k,
+    weights,
+    match,
+    passThreshold,
+  });
+
+  return {
+    k: ranking.k,
+    totalRelevant: ranking.totalRelevant,
+    totalReturned: ranking.totalReturned,
+    hitCount: ranking.hitCount,
+    forbiddenHits: ranking.forbiddenHits,
+    metrics: ranking.metrics,
+    weightedScore: ranking.weightedScore,
+    passThreshold,
+    passed: ranking.passed,
+    returned: items,
+    source,
+  };
+}
diff --git a/src/domains/evaluation/run-suite.ts b/src/domains/evaluation/run-suite.ts
index b09049e..47c6637 100644
--- a/src/domains/evaluation/run-suite.ts
+++ b/src/domains/evaluation/run-suite.ts
@@ -6,11 +6,15 @@ import type {
CheckpointAssertion,
CheckpointResult,
ConversationTurn,
+ DedupScore,
+ DemotionScore,
Endpoints,
JsonValue,
JudgeDimensionScore,
Persona,
PresetSnapshot,
+ ProcedureScore,
+ RetrievalScore,
Rubric,
RubricScore,
RunProgressEvent,
@@ -38,12 +42,16 @@ import {
parseScenariosInput,
parseTimeOffset,
} from "../validation/load-suite.ts";
+import { scoreScenarioDedup } from "./dedup-scorer.ts";
+import { scoreScenarioDemotion } from "./demotion-scorer.ts";
import { judgeResponse } from "./judge.ts";
import type {
EndpointAdapter,
EndpointAdapterFactory,
LlmResponsesClient,
} from "./ports.ts";
+import { scoreScenarioProcedure } from "./procedure-scorer.ts";
+import { scoreRetrieval } from "./retrieval-scorer.ts";
import { generatePersonaStep, resolvePersonaModel } from "./simulator.ts";
const resetsRequiringReinit = new Set(["new", "fresh_agent"]);
@@ -159,6 +167,25 @@ export type RunRecorder = {
overallScore: number;
},
) => Promise;
+ recordRetrievalResult?: (
+ scenarioRunId: number,
+ options: {
+ scenario: Scenario;
+ score: RetrievalScore;
+ },
+ ) => Promise;
+ recordDemotionResult?: (
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: DemotionScore },
+ ) => Promise;
+ recordProcedureResult?: (
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: ProcedureScore },
+ ) => Promise;
+ recordDedupResult?: (
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: DedupScore },
+ ) => Promise;
};
export type PreparedScenarioSelection = {
@@ -1037,19 +1064,67 @@ export async function runScenario(
});
}
+ const evalContext = {
+ scenariosPath: options.scenariosPath,
+ lastAdapterReply: lastReply,
+ };
+
+ const retrievalScore = scoreRetrieval(scenario, evalContext);
+ if (retrievalScore && scenarioRunId !== undefined) {
+ await options.recorder?.recordRetrievalResult?.(scenarioRunId, {
+ scenario,
+ score: retrievalScore,
+ });
+ }
+
+ const demotionScore = scoreScenarioDemotion(scenario, evalContext);
+ if (demotionScore && scenarioRunId !== undefined) {
+ await options.recorder?.recordDemotionResult?.(scenarioRunId, {
+ scenario,
+ score: demotionScore,
+ });
+ }
+
+ const procedureScore = scoreScenarioProcedure(scenario, evalContext);
+ if (procedureScore && scenarioRunId !== undefined) {
+ await options.recorder?.recordProcedureResult?.(scenarioRunId, {
+ scenario,
+ score: procedureScore,
+ });
+ }
+
+ const dedupScore = scoreScenarioDedup(scenario, evalContext);
+ if (dedupScore && scenarioRunId !== undefined) {
+ await options.recorder?.recordDedupResult?.(scenarioRunId, {
+ scenario,
+ score: dedupScore,
+ });
+ }
+
+ const overallPassed =
+ score.passed &&
+ (retrievalScore?.passed ?? true) &&
+ (demotionScore?.passed ?? true) &&
+ (procedureScore?.passed ?? true) &&
+ (dedupScore?.passed ?? true);
+
const result: ScenarioRunResult = {
scenarioId: scenario.id,
scenarioName: scenario.name,
personaId: persona.id,
rubricId: rubric.id,
userId: options.userId,
- passed: score.passed,
- failureKind: score.failureKind,
+ passed: overallPassed,
+ failureKind: overallPassed ? undefined : (score.failureKind ?? "agent"),
overallScore: finalScore,
transcript: fullTranscript,
checkpoints,
toolCallsByTurn,
judgeScore: score,
+ retrievalScore,
+ demotionScore,
+ procedureScore,
+ dedupScore,
renderedTurns,
};
if (scenarioRunId !== undefined) {
diff --git a/src/domains/reporting/render-report.ts b/src/domains/reporting/render-report.ts
index 16f98cb..a6f9058 100644
--- a/src/domains/reporting/render-report.ts
+++ b/src/domains/reporting/render-report.ts
@@ -374,6 +374,38 @@ function buildDimensionRows(scenario: ScenarioRecord): TemplateObject[] {
});
}
+/**
+ * Build the template rows for a scenario's retrieval scores.
+ *
+ * Each entry of `scenario.retrievalScores` becomes one row with snake_case
+ * keys. A missing or non-array `retrievalScores` yields no rows.
+ * NOTE(review): `numberValue` presumably returns undefined for non-numeric
+ * input, in which case the numeric fields below fall back to 0 — confirm.
+ */
+function buildRetrievalRows(scenario: ScenarioRecord): TemplateObject[] {
+ const scores = Array.isArray(scenario.retrievalScores)
+ ? scenario.retrievalScores
+ : [];
+ return scores.map((score) => ({
+ metric: String((score as Record).metric ?? ""),
+ value: numberValue((score as Record).value) ?? 0,
+ weight: numberValue((score as Record).weight) ?? 0,
+ k: numberValue((score as Record).k) ?? 0,
+ weighted_score:
+ numberValue((score as Record).weighted_score) ?? 0,
+ pass_threshold:
+ numberValue((score as Record).pass_threshold) ?? 0,
+ // Only a literal `true` counts as passed.
+ passed: (score as Record).passed === true,
+ total_relevant:
+ numberValue((score as Record).total_relevant) ?? 0,
+ total_returned:
+ numberValue((score as Record).total_returned) ?? 0,
+ hit_count: numberValue((score as Record).hit_count) ?? 0,
+ forbidden_hits:
+ numberValue((score as Record).forbidden_hits) ?? 0,
+ source: String((score as Record).source ?? ""),
+ returned: (score as Record).returned ?? [],
+ // Display variants of the two scores, formatted by `scorePercent`.
+ value_percent: scorePercent(
+ numberValue((score as Record).value),
+ ),
+ weighted_score_percent: scorePercent(
+ numberValue((score as Record).weighted_score),
+ ),
+ }));
+}
+
function prepareScenarioView(
scenario: ScenarioRecord,
index: number,
@@ -397,6 +429,8 @@ function prepareScenarioView(
threshold_percent: scorePercent(scenario.passThreshold),
turn_rows: buildTurnRows(scenario),
dimension_rows: buildDimensionRows(scenario),
+ retrieval_rows: buildRetrievalRows(scenario),
+ retrieval_scores_pretty: prettyJson(scenario.retrievalScores),
overall_notes: scenario.judge.overallNotes ?? "",
judge_output_pretty: prettyJson(scenario.judge.output),
error_pretty: prettyJson(scenario.error),
diff --git a/src/domains/validation/load-suite.ts b/src/domains/validation/load-suite.ts
index 0e69aef..735fb68 100644
--- a/src/domains/validation/load-suite.ts
+++ b/src/domains/validation/load-suite.ts
@@ -7,6 +7,11 @@ import type {
CheckpointAssertion,
CheckpointTurn,
CliHarness,
+ DedupConfig,
+ DemotionCascade,
+ DemotionConfig,
+ DemotionMetricKey,
+ DreamSource,
EndpointAuth,
EndpointLogging,
EndpointRequest,
@@ -24,7 +29,13 @@ import type {
PersonaDemographics,
PersonaPersonality,
Personas,
+ ProcedureConfig,
+ ProcedureMetricKey,
ProcessedYamlFile,
+ RetrievalConfig,
+ RetrievalMatchPolicy,
+ RetrievalMetricKey,
+ RetrievalSource,
Rubric,
RubricDimension,
RubricScale,
@@ -951,6 +962,353 @@ function parseScenarioExpectations(value: unknown): ScenarioExpectations {
return result;
}
+/** Metric names accepted as keys of a scenario's retrieval weight map. */
+const RETRIEVAL_METRIC_KEYS: RetrievalMetricKey[] = [
+ "precision_at_k",
+ "recall_at_k",
+ "mrr",
+ "ndcg_at_k",
+];
+
+/** Values accepted for `scenario.retrieval.match`. */
+const VALID_MATCH_POLICIES: RetrievalMatchPolicy[] = [
+ "exact",
+ "substring",
+ "regex",
+];
+
+/**
+ * Parse the retrieval weight map.
+ *
+ * Every metric starts at weight 1. A value that is missing or not a plain
+ * object returns those defaults unchanged. Otherwise each supplied weight
+ * overrides its default.
+ *
+ * Throws AgentProbeConfigError for an unknown metric key or a negative weight.
+ */
+function parseRetrievalWeights(
+ value: unknown,
+): Required {
+ const defaults: Required = {
+ precision_at_k: 1,
+ recall_at_k: 1,
+ mrr: 1,
+ ndcg_at_k: 1,
+ };
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
+ return defaults;
+ }
+ const raw = value as YamlObject;
+ // Reject unknown keys first so a misspelled metric is reported.
+ for (const key of Object.keys(raw)) {
+ if (!RETRIEVAL_METRIC_KEYS.includes(key as RetrievalMetricKey)) {
+ throw new AgentProbeConfigError(
+ `Unknown retrieval metric key: ${key}. Allowed: ${RETRIEVAL_METRIC_KEYS.join(", ")}.`,
+ );
+ }
+ }
+ for (const key of RETRIEVAL_METRIC_KEYS) {
+ // NOTE(review): optionalNumber presumably yields undefined for absent or
+ // non-numeric values, leaving the default in place — confirm.
+ const candidate = optionalNumber(raw[key]);
+ if (candidate !== undefined) {
+ if (candidate < 0) {
+ throw new AgentProbeConfigError(
+ `retrieval.weight.${key} must be non-negative.`,
+ );
+ }
+ defaults[key] = candidate;
+ }
+ }
+ return defaults;
+}
+
+/**
+ * Parse `retrieval.source`. Returns `undefined` when the block is absent or
+ * names neither a `fixture` nor a `raw_exchange_key`.
+ */
+function parseRetrievalSource(value: unknown): RetrievalSource | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, "retrieval.source must be an object.");
+  const source: RetrievalSource = {
+    fixture: optionalString(raw.fixture),
+    rawExchangeKey: optionalString(raw.raw_exchange_key),
+  };
+  const namesASource = Boolean(source.fixture || source.rawExchangeKey);
+  return namesASource ? source : undefined;
+}
+
+/**
+ * Parse and validate a scenario's `retrieval` block.
+ *
+ * Returns `undefined` when the block is absent. Otherwise:
+ * - `golden` must be a non-empty list of strings.
+ * - `k`, when given, must be a finite number >= 1; a fractional value is
+ *   rounded down. Values below 1 are rejected because they would round down
+ *   to a cutoff of 0.
+ * - `match` must be one of the valid match policies (default "substring").
+ * - `pass_threshold`, when given, must lie in [0, 1] (default 0.5).
+ * - Weights are read from `weight`, falling back to `weights`.
+ *
+ * Throws AgentProbeConfigError when any of these rules is violated.
+ */
+function parseRetrievalConfig(value: unknown): RetrievalConfig | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, "scenario.retrieval must be an object.");
+
+  const golden = stringArray(raw.golden);
+  if (golden.length === 0) {
+    throw new AgentProbeConfigError(
+      "scenario.retrieval.golden must be a non-empty list of strings.",
+    );
+  }
+
+  const forbidden = stringArray(raw.forbidden);
+
+  const k = optionalNumber(raw.k);
+  // `k < 1` (rather than `k <= 0`) so that e.g. 0.5 cannot floor to 0 below.
+  if (k !== undefined && (!Number.isFinite(k) || k < 1)) {
+    throw new AgentProbeConfigError(
+      "scenario.retrieval.k must be a positive integer when provided.",
+    );
+  }
+
+  const matchValue = optionalString(raw.match) ?? "substring";
+  if (!VALID_MATCH_POLICIES.includes(matchValue as RetrievalMatchPolicy)) {
+    throw new AgentProbeConfigError(
+      `scenario.retrieval.match must be one of: ${VALID_MATCH_POLICIES.join(", ")}.`,
+    );
+  }
+
+  // Same range check and error text as the other scorer configs.
+  const passThreshold = parseUnitThreshold(
+    raw.pass_threshold,
+    "scenario.retrieval",
+    0.5,
+  );
+
+  return {
+    golden,
+    forbidden,
+    k: k !== undefined ? Math.floor(k) : undefined,
+    weights: parseRetrievalWeights(raw.weight ?? raw.weights),
+    passThreshold,
+    match: matchValue as RetrievalMatchPolicy,
+    source: parseRetrievalSource(raw.source),
+  };
+}
+
+/** Metric names accepted as weight keys in `scenario.demotion`. */
+const DEMOTION_METRIC_KEYS: DemotionMetricKey[] = [
+ "set_precision",
+ "set_recall",
+ "set_f1",
+ "timestamp_discipline",
+ "cascade_bounded",
+ "cascade_direct_f1",
+];
+
+/** Metric names accepted as weight keys in `scenario.procedure`. */
+const PROCEDURE_METRIC_KEYS: ProcedureMetricKey[] = [
+ "step_coverage",
+ "step_order",
+ "parameter_coverage",
+];
+
+/** Metric names accepted as weight keys in `scenario.dedup`. */
+const DEDUP_METRIC_KEYS = ["precision", "recall", "f1", "ari"] as const;
+
+/**
+ * Parse a `source` block shared by the demotion, procedure and dedup configs.
+ * Returns `undefined` when the block is absent or names neither a `fixture`
+ * nor a `raw_exchange_key`. `scopeLabel` prefixes the validation message.
+ */
+function parseDreamSource(
+  value: unknown,
+  scopeLabel: string,
+): DreamSource | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, `${scopeLabel}.source must be an object.`);
+  const source: DreamSource = {
+    fixture: optionalString(raw.fixture),
+    rawExchangeKey: optionalString(raw.raw_exchange_key),
+  };
+  const namesASource = Boolean(source.fixture || source.rawExchangeKey);
+  return namesASource ? source : undefined;
+}
+
+/**
+ * Parse a metric weight map restricted to `allowedKeys`.
+ *
+ * Every allowed key starts at `defaultWeight`; a falsy `value` returns those
+ * defaults. Otherwise `value` must be an object, and each supplied weight
+ * overrides its default. `scopeLabel` prefixes all error messages.
+ *
+ * Throws AgentProbeConfigError for an unknown key or a negative weight.
+ */
+function parseWeightedKeys(
+ value: unknown,
+ scopeLabel: string,
+ allowedKeys: readonly K[],
+ defaultWeight: number,
+): Record {
+ const result = Object.fromEntries(
+ allowedKeys.map((key) => [key, defaultWeight]),
+ ) as Record;
+ if (!value) {
+ return result;
+ }
+ const raw = ensureObject(value, `${scopeLabel}.weight must be an object.`);
+ // Reject unknown keys first so a misspelled metric is reported.
+ for (const key of Object.keys(raw)) {
+ if (!(allowedKeys as readonly string[]).includes(key)) {
+ throw new AgentProbeConfigError(
+ `Unknown ${scopeLabel} metric key: ${key}. Allowed: ${allowedKeys.join(", ")}.`,
+ );
+ }
+ }
+ for (const key of allowedKeys) {
+ // NOTE(review): optionalNumber presumably yields undefined for absent or
+ // non-numeric values, leaving the default in place — confirm.
+ const candidate = optionalNumber(raw[key]);
+ if (candidate !== undefined) {
+ if (candidate < 0) {
+ throw new AgentProbeConfigError(
+ `${scopeLabel}.weight.${key} must be non-negative.`,
+ );
+ }
+ result[key] = candidate;
+ }
+ }
+ return result;
+}
+
+/**
+ * Parse a pass threshold that must lie in the closed interval [0, 1].
+ * Returns `fallback` when no number is supplied.
+ *
+ * Throws AgentProbeConfigError when the number is not finite or out of range.
+ */
+function parseUnitThreshold(
+  value: unknown,
+  scopeLabel: string,
+  fallback: number,
+): number {
+  const threshold = optionalNumber(value);
+  if (threshold === undefined) {
+    return fallback;
+  }
+  const withinUnitInterval =
+    Number.isFinite(threshold) && threshold >= 0 && threshold <= 1;
+  if (!withinUnitInterval) {
+    throw new AgentProbeConfigError(
+      `${scopeLabel}.pass_threshold must be between 0 and 1 when provided.`,
+    );
+  }
+  return threshold;
+}
+
+/**
+ * Parse `demotion.cascade`. Accepts the long key names and their short
+ * aliases (`expected`, `tangential`). Returns `undefined` when the block is
+ * absent or both lists are empty.
+ */
+function parseDemotionCascade(value: unknown): DemotionCascade | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, "demotion.cascade must be an object.");
+  const expectedDirectNeighbors = stringArray(
+    raw.expected_direct_neighbors ?? raw.expected,
+  );
+  const tangentialEdges = stringArray(raw.tangential_edges ?? raw.tangential);
+  const hasContent =
+    expectedDirectNeighbors.length > 0 || tangentialEdges.length > 0;
+  return hasContent ? { expectedDirectNeighbors, tangentialEdges } : undefined;
+}
+
+/**
+ * Parse and validate a scenario's `demotion` block.
+ *
+ * Returns `undefined` when the block is absent. `expected_demotions` (alias
+ * `expected`) must be non-empty. `expected_retracts` is kept only when
+ * non-empty. Weights default to 1 per metric and the pass threshold to 0.6.
+ *
+ * Throws AgentProbeConfigError when validation fails.
+ */
+function parseDemotionConfig(value: unknown): DemotionConfig | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, "scenario.demotion must be an object.");
+  const scope = "scenario.demotion";
+
+  const expectedDemotions = stringArray(raw.expected_demotions ?? raw.expected);
+  if (expectedDemotions.length === 0) {
+    throw new AgentProbeConfigError(
+      "scenario.demotion.expected_demotions must be a non-empty list of UUIDs.",
+    );
+  }
+  const retracts = stringArray(raw.expected_retracts);
+
+  const config: DemotionConfig = {
+    expectedDemotions,
+    expectedRetracts: retracts.length > 0 ? retracts : undefined,
+    cascade: parseDemotionCascade(raw.cascade),
+    weights: parseWeightedKeys(
+      raw.weight ?? raw.weights,
+      scope,
+      DEMOTION_METRIC_KEYS,
+      1,
+    ),
+    passThreshold: parseUnitThreshold(raw.pass_threshold, scope, 0.6),
+    source: parseDreamSource(raw.source, scope),
+  };
+  return config;
+}
+
+/**
+ * Parse and validate a scenario's `procedure` block.
+ *
+ * Returns `undefined` when the block is absent. Step labels are taken from
+ * the first of `golden_steps`, `steps` or `golden.steps` that is a list, and
+ * must be non-empty. Parameters come from `golden_parameters`, `parameters`
+ * or `golden.parameters`, and are kept only when non-empty. Weights default
+ * to 1 per metric and the pass threshold to 0.6.
+ *
+ * Throws AgentProbeConfigError when validation fails.
+ */
+function parseProcedureConfig(value: unknown): ProcedureConfig | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, "scenario.procedure must be an object.");
+  const scope = "scenario.procedure";
+
+  // `golden` may be a nested object holding `steps` / `parameters`.
+  const nested = raw.golden;
+  const goldenObj =
+    nested && typeof nested === "object" && !Array.isArray(nested)
+      ? (nested as YamlObject)
+      : undefined;
+
+  const stepCandidates: unknown[] = [
+    raw.golden_steps,
+    raw.steps,
+    goldenObj?.steps,
+  ];
+  const stepsRaw =
+    stepCandidates.find((candidate) => Array.isArray(candidate)) ?? [];
+  const goldenSteps = stringArray(stepsRaw);
+  if (goldenSteps.length === 0) {
+    throw new AgentProbeConfigError(
+      "scenario.procedure.golden_steps (or golden.steps) must be a non-empty list of step labels.",
+    );
+  }
+
+  const parameters = stringArray(
+    raw.golden_parameters ??
+      raw.parameters ??
+      (goldenObj?.parameters as unknown),
+  );
+
+  const config: ProcedureConfig = {
+    goldenSteps,
+    goldenParameters: parameters.length > 0 ? parameters : undefined,
+    weights: parseWeightedKeys(
+      raw.weight ?? raw.weights,
+      scope,
+      PROCEDURE_METRIC_KEYS,
+      1,
+    ),
+    passThreshold: parseUnitThreshold(raw.pass_threshold, scope, 0.6),
+    source: parseDreamSource(raw.source, scope),
+  };
+  return config;
+}
+
+/**
+ * Parse and validate a scenario's `dedup` block.
+ *
+ * Returns `undefined` when the block is absent. `golden_clusters` (alias
+ * `golden`) must be a list of lists; empty clusters are dropped and at least
+ * one non-empty cluster must remain. Weights default to 1 per metric and the
+ * pass threshold to 0.6.
+ *
+ * Throws AgentProbeConfigError when validation fails.
+ */
+function parseDedupConfig(value: unknown): DedupConfig | undefined {
+  if (!value) {
+    return undefined;
+  }
+  const raw = ensureObject(value, "scenario.dedup must be an object.");
+  const scope = "scenario.dedup";
+
+  const clustersRaw = raw.golden_clusters ?? raw.golden;
+  if (!Array.isArray(clustersRaw)) {
+    throw new AgentProbeConfigError(
+      "scenario.dedup.golden_clusters must be a list of clusters (each a list of UUIDs).",
+    );
+  }
+
+  const goldenClusters: string[][] = [];
+  for (const entry of clustersRaw) {
+    if (!Array.isArray(entry)) {
+      throw new AgentProbeConfigError(
+        "scenario.dedup.golden_clusters items must each be a list of UUIDs.",
+      );
+    }
+    const members = stringArray(entry);
+    if (members.length > 0) {
+      goldenClusters.push(members);
+    }
+  }
+  if (goldenClusters.length === 0) {
+    throw new AgentProbeConfigError(
+      "scenario.dedup.golden_clusters must contain at least one non-empty cluster.",
+    );
+  }
+
+  const { precision, recall, f1, ari } = parseWeightedKeys(
+    raw.weight ?? raw.weights,
+    scope,
+    DEDUP_METRIC_KEYS,
+    1,
+  );
+  const passThreshold = parseUnitThreshold(raw.pass_threshold, scope, 0.6);
+
+  return {
+    goldenClusters,
+    weights: { precision, recall, f1, ari },
+    passThreshold,
+    source: parseDreamSource(raw.source, scope),
+  };
+}
+
function parseSession(value: unknown): Session {
const raw = ensureObject(value, "scenario session must be an object.");
return {
@@ -1014,6 +1372,10 @@ function parseScenario(value: unknown, defaults?: ScenarioDefaults): Scenario {
? raw.sessions.map((item) => parseSession(item))
: [],
expectations: parseScenarioExpectations(raw.expectations),
+ retrieval: parseRetrievalConfig(raw.retrieval),
+ demotion: parseDemotionConfig(raw.demotion),
+ procedure: parseProcedureConfig(raw.procedure),
+ dedup: parseDedupConfig(raw.dedup),
};
}
diff --git a/src/providers/persistence/drizzle/postgres-schema.ts b/src/providers/persistence/drizzle/postgres-schema.ts
index fc4a4fa..986c8f4 100644
--- a/src/providers/persistence/drizzle/postgres-schema.ts
+++ b/src/providers/persistence/drizzle/postgres-schema.ts
@@ -221,6 +221,109 @@ export const postgresJudgeDimensionScores = pgTable(
(table) => [index("idx_judge_scores_scenario_run").on(table.scenarioRunId)],
);
+/**
+ * Drizzle (Postgres) definition of the `retrieval_scores` table.
+ *
+ * Each row is tied to a scenario run through `scenario_run_id`; the foreign
+ * key cascades on delete. The columns mirror the `retrieval_scores` DDL in
+ * the Postgres migrations. The Postgres run recorder inserts one row per
+ * metric and repeats the score-level columns (k, weighted score, threshold,
+ * pass flag, counts, source, returned payload) on every row.
+ */
+export const postgresRetrievalScores = pgTable(
+ "retrieval_scores",
+ {
+ id: bigserial("id", { mode: "number" }).primaryKey(),
+ scenarioRunId: bigint("scenario_run_id", { mode: "number" })
+ .notNull()
+ .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: doublePrecision("value").notNull(),
+ weight: doublePrecision("weight").notNull(),
+ k: integer("k").notNull(),
+ weightedScore: doublePrecision("weighted_score").notNull(),
+ passThreshold: doublePrecision("pass_threshold").notNull(),
+ passed: boolean("passed").notNull(),
+ totalRelevant: integer("total_relevant").notNull(),
+ totalReturned: integer("total_returned").notNull(),
+ hitCount: integer("hit_count").notNull(),
+ forbiddenHits: integer("forbidden_hits").notNull(),
+ source: text("source").notNull(),
+ // The JSON payload is the only nullable column in this table.
+ returnedJson: jsonb("returned_json"),
+ createdAt: timestamp("created_at", { withTimezone: true }).notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_retrieval_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_retrieval_scores_metric").on(table.metric),
+ ],
+);
+
+/**
+ * Drizzle (Postgres) definition of the `demotion_scores` table.
+ *
+ * Each row is tied to a scenario run through `scenario_run_id`; the foreign
+ * key cascades on delete. The columns mirror the `demotion_scores` DDL in the
+ * Postgres migrations. The Postgres run recorder inserts one row per metric
+ * and repeats the score-level columns on every row.
+ */
+export const postgresDemotionScores = pgTable(
+ "demotion_scores",
+ {
+ id: bigserial("id", { mode: "number" }).primaryKey(),
+ scenarioRunId: bigint("scenario_run_id", { mode: "number" })
+ .notNull()
+ .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: doublePrecision("value").notNull(),
+ weight: doublePrecision("weight").notNull(),
+ weightedScore: doublePrecision("weighted_score").notNull(),
+ passThreshold: doublePrecision("pass_threshold").notNull(),
+ passed: boolean("passed").notNull(),
+ timestampViolationCount: integer("timestamp_violation_count").notNull(),
+ // Nullable: the recorder stores NULL when the score has no cascadeBounded value.
+ cascadeBounded: boolean("cascade_bounded"),
+ source: text("source").notNull(),
+ // Both JSON payload columns are nullable.
+ observedJson: jsonb("observed_json"),
+ expectedJson: jsonb("expected_json"),
+ createdAt: timestamp("created_at", { withTimezone: true }).notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_demotion_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_demotion_scores_metric").on(table.metric),
+ ],
+);
+
+/**
+ * Drizzle (Postgres) definition of the `procedure_scores` table.
+ *
+ * Each row is tied to a scenario run through `scenario_run_id`; the foreign
+ * key cascades on delete. The columns mirror the `procedure_scores` DDL in
+ * the Postgres migrations.
+ * NOTE(review): presumably written one row per metric like the retrieval and
+ * demotion tables — confirm against the recorder.
+ */
+export const postgresProcedureScores = pgTable(
+ "procedure_scores",
+ {
+ id: bigserial("id", { mode: "number" }).primaryKey(),
+ scenarioRunId: bigint("scenario_run_id", { mode: "number" })
+ .notNull()
+ .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: doublePrecision("value").notNull(),
+ weight: doublePrecision("weight").notNull(),
+ weightedScore: doublePrecision("weighted_score").notNull(),
+ passThreshold: doublePrecision("pass_threshold").notNull(),
+ passed: boolean("passed").notNull(),
+ source: text("source").notNull(),
+ // Both JSON payload columns are nullable.
+ predictedJson: jsonb("predicted_json"),
+ goldenJson: jsonb("golden_json"),
+ createdAt: timestamp("created_at", { withTimezone: true }).notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_procedure_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_procedure_scores_metric").on(table.metric),
+ ],
+);
+
+/**
+ * Drizzle (Postgres) definition of the `dedup_scores` table.
+ *
+ * Each row is tied to a scenario run through `scenario_run_id`; the foreign
+ * key cascades on delete. The columns mirror the `dedup_scores` DDL in the
+ * Postgres migrations.
+ * NOTE(review): presumably written one row per metric like the retrieval and
+ * demotion tables — confirm against the recorder.
+ */
+export const postgresDedupScores = pgTable(
+ "dedup_scores",
+ {
+ id: bigserial("id", { mode: "number" }).primaryKey(),
+ scenarioRunId: bigint("scenario_run_id", { mode: "number" })
+ .notNull()
+ .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: doublePrecision("value").notNull(),
+ weight: doublePrecision("weight").notNull(),
+ weightedScore: doublePrecision("weighted_score").notNull(),
+ passThreshold: doublePrecision("pass_threshold").notNull(),
+ passed: boolean("passed").notNull(),
+ itemCount: integer("item_count").notNull(),
+ source: text("source").notNull(),
+ // Both JSON payload columns are nullable.
+ predictedJson: jsonb("predicted_json"),
+ goldenJson: jsonb("golden_json"),
+ createdAt: timestamp("created_at", { withTimezone: true }).notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_dedup_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_dedup_scores_metric").on(table.metric),
+ ],
+);
+
export const postgresHumanDimensionScores = pgTable(
"human_dimension_scores",
{
@@ -303,6 +406,10 @@ export const postgresSchema = {
checkpoints: postgresCheckpoints,
judgeDimensionScores: postgresJudgeDimensionScores,
humanDimensionScores: postgresHumanDimensionScores,
+ retrievalScores: postgresRetrievalScores,
+ demotionScores: postgresDemotionScores,
+ procedureScores: postgresProcedureScores,
+ dedupScores: postgresDedupScores,
presets: postgresPresets,
presetScenarios: postgresPresetScenarios,
appSettings: postgresAppSettings,
diff --git a/src/providers/persistence/drizzle/sqlite-schema.ts b/src/providers/persistence/drizzle/sqlite-schema.ts
index ac7bdcc..66f5309 100644
--- a/src/providers/persistence/drizzle/sqlite-schema.ts
+++ b/src/providers/persistence/drizzle/sqlite-schema.ts
@@ -178,6 +178,109 @@ export const sqliteJudgeDimensionScores = sqliteTable(
},
);
+/**
+ * Drizzle (SQLite) definition of the `retrieval_scores` table.
+ *
+ * Same column names and indexes as the Postgres `retrieval_scores` table,
+ * with SQLite storage types: `real` for the floating-point scores, `integer`
+ * for the `passed` flag, and `text` for the JSON payload and the timestamp.
+ * Rows cascade-delete with their scenario run.
+ */
+export const sqliteRetrievalScores = sqliteTable(
+ "retrieval_scores",
+ {
+ id: integer("id").primaryKey({ autoIncrement: true }),
+ scenarioRunId: integer("scenario_run_id")
+ .notNull()
+ .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: real("value").notNull(),
+ weight: real("weight").notNull(),
+ k: integer("k").notNull(),
+ weightedScore: real("weighted_score").notNull(),
+ passThreshold: real("pass_threshold").notNull(),
+ // Stored as an integer column — presumably 0/1; confirm against the writer.
+ passed: integer("passed").notNull(),
+ totalRelevant: integer("total_relevant").notNull(),
+ totalReturned: integer("total_returned").notNull(),
+ hitCount: integer("hit_count").notNull(),
+ forbiddenHits: integer("forbidden_hits").notNull(),
+ source: text("source").notNull(),
+ // Nullable text column holding the JSON payload.
+ returnedJson: text("returned_json"),
+ createdAt: text("created_at").notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_retrieval_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_retrieval_scores_metric").on(table.metric),
+ ],
+);
+
+/**
+ * Drizzle (SQLite) definition of the `demotion_scores` table.
+ *
+ * Same column names and indexes as the Postgres `demotion_scores` table, with
+ * SQLite storage types: `real` for the floating-point scores, `integer` for
+ * the boolean flags, and `text` for the JSON payloads and the timestamp.
+ * Rows cascade-delete with their scenario run.
+ */
+export const sqliteDemotionScores = sqliteTable(
+ "demotion_scores",
+ {
+ id: integer("id").primaryKey({ autoIncrement: true }),
+ scenarioRunId: integer("scenario_run_id")
+ .notNull()
+ .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: real("value").notNull(),
+ weight: real("weight").notNull(),
+ weightedScore: real("weighted_score").notNull(),
+ passThreshold: real("pass_threshold").notNull(),
+ // Stored as an integer column — presumably 0/1; confirm against the writer.
+ passed: integer("passed").notNull(),
+ timestampViolationCount: integer("timestamp_violation_count").notNull(),
+ // Nullable integer flag (the Postgres counterpart is a nullable boolean).
+ cascadeBounded: integer("cascade_bounded"),
+ source: text("source").notNull(),
+ // Nullable text columns holding the JSON payloads.
+ observedJson: text("observed_json"),
+ expectedJson: text("expected_json"),
+ createdAt: text("created_at").notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_demotion_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_demotion_scores_metric").on(table.metric),
+ ],
+);
+
+/**
+ * Drizzle (SQLite) definition of the `procedure_scores` table.
+ *
+ * Same column names and indexes as the Postgres `procedure_scores` table,
+ * with SQLite storage types: `real` for the floating-point scores, `integer`
+ * for the `passed` flag, and `text` for the JSON payloads and the timestamp.
+ * Rows cascade-delete with their scenario run.
+ */
+export const sqliteProcedureScores = sqliteTable(
+ "procedure_scores",
+ {
+ id: integer("id").primaryKey({ autoIncrement: true }),
+ scenarioRunId: integer("scenario_run_id")
+ .notNull()
+ .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: real("value").notNull(),
+ weight: real("weight").notNull(),
+ weightedScore: real("weighted_score").notNull(),
+ passThreshold: real("pass_threshold").notNull(),
+ // Stored as an integer column — presumably 0/1; confirm against the writer.
+ passed: integer("passed").notNull(),
+ source: text("source").notNull(),
+ // Nullable text columns holding the JSON payloads.
+ predictedJson: text("predicted_json"),
+ goldenJson: text("golden_json"),
+ createdAt: text("created_at").notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_procedure_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_procedure_scores_metric").on(table.metric),
+ ],
+);
+
+/**
+ * Drizzle (SQLite) definition of the `dedup_scores` table.
+ *
+ * Same column names and indexes as the Postgres `dedup_scores` table, with
+ * SQLite storage types: `real` for the floating-point scores, `integer` for
+ * the `passed` flag, and `text` for the JSON payloads and the timestamp.
+ * Rows cascade-delete with their scenario run.
+ */
+export const sqliteDedupScores = sqliteTable(
+ "dedup_scores",
+ {
+ id: integer("id").primaryKey({ autoIncrement: true }),
+ scenarioRunId: integer("scenario_run_id")
+ .notNull()
+ .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }),
+ metric: text("metric").notNull(),
+ value: real("value").notNull(),
+ weight: real("weight").notNull(),
+ weightedScore: real("weighted_score").notNull(),
+ passThreshold: real("pass_threshold").notNull(),
+ // Stored as an integer column — presumably 0/1; confirm against the writer.
+ passed: integer("passed").notNull(),
+ itemCount: integer("item_count").notNull(),
+ source: text("source").notNull(),
+ // Nullable text columns holding the JSON payloads.
+ predictedJson: text("predicted_json"),
+ goldenJson: text("golden_json"),
+ createdAt: text("created_at").notNull(),
+ },
+ // Secondary indexes: lookup by scenario run and by metric name.
+ (table) => [
+ index("idx_dedup_scores_scenario_run").on(table.scenarioRunId),
+ index("idx_dedup_scores_metric").on(table.metric),
+ ],
+);
+
export const sqliteHumanDimensionScores = sqliteTable(
"human_dimension_scores",
{
@@ -256,6 +359,10 @@ export const sqliteSchema = {
checkpoints: sqliteCheckpoints,
judgeDimensionScores: sqliteJudgeDimensionScores,
humanDimensionScores: sqliteHumanDimensionScores,
+ retrievalScores: sqliteRetrievalScores,
+ demotionScores: sqliteDemotionScores,
+ procedureScores: sqliteProcedureScores,
+ dedupScores: sqliteDedupScores,
presets: sqlitePresets,
presetScenarios: sqlitePresetScenarios,
appSettings: sqliteAppSettings,
diff --git a/src/providers/persistence/migrations/postgres.ts b/src/providers/persistence/migrations/postgres.ts
index 1682b7d..ac54edf 100644
--- a/src/providers/persistence/migrations/postgres.ts
+++ b/src/providers/persistence/migrations/postgres.ts
@@ -2,7 +2,7 @@ import { createPostgresClient, type SqlTag } from "../postgres-client.ts";
import type { MigrationRunner } from "./types.ts";
/** Target schema version for Postgres. Bumps whenever a new migration is added. */
-export const POSTGRES_TARGET_VERSION = 4;
+export const POSTGRES_TARGET_VERSION = 6;
const POSTGRES_BASELINE_DDL = `
create table if not exists meta (
@@ -153,6 +153,73 @@ const POSTGRES_BASELINE_DDL = `
created_at timestamptz not null
);
+ create table if not exists retrieval_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ k integer not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ total_relevant integer not null,
+ total_returned integer not null,
+ hit_count integer not null,
+ forbidden_hits integer not null,
+ source text not null,
+ returned_json jsonb,
+ created_at timestamptz not null
+ );
+
+ create table if not exists demotion_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ timestamp_violation_count integer not null,
+ cascade_bounded boolean,
+ source text not null,
+ observed_json jsonb,
+ expected_json jsonb,
+ created_at timestamptz not null
+ );
+
+ create table if not exists procedure_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ source text not null,
+ predicted_json jsonb,
+ golden_json jsonb,
+ created_at timestamptz not null
+ );
+
+ create table if not exists dedup_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ item_count integer not null,
+ source text not null,
+ predicted_json jsonb,
+ golden_json jsonb,
+ created_at timestamptz not null
+ );
+
create table if not exists presets (
id text primary key,
name text not null unique,
@@ -201,6 +268,22 @@ const POSTGRES_BASELINE_DDL = `
on human_dimension_scores(scenario_run_id, dimension_id);
create index if not exists idx_human_dim_scores_scenario_run
on human_dimension_scores(scenario_run_id);
+ create index if not exists idx_retrieval_scores_scenario_run
+ on retrieval_scores(scenario_run_id);
+ create index if not exists idx_retrieval_scores_metric
+ on retrieval_scores(metric);
+ create index if not exists idx_demotion_scores_scenario_run
+ on demotion_scores(scenario_run_id);
+ create index if not exists idx_demotion_scores_metric
+ on demotion_scores(metric);
+ create index if not exists idx_procedure_scores_scenario_run
+ on procedure_scores(scenario_run_id);
+ create index if not exists idx_procedure_scores_metric
+ on procedure_scores(metric);
+ create index if not exists idx_dedup_scores_scenario_run
+ on dedup_scores(scenario_run_id);
+ create index if not exists idx_dedup_scores_metric
+ on dedup_scores(metric);
`;
async function readPostgresVersion(sql: SqlTag): Promise {
@@ -305,6 +388,121 @@ export function createPostgresMigrationRunner(
});
applied.push(4);
}
+ if (from < 5) {
+ await sql.begin(async (tx) => {
+ await tx`
+ create table if not exists retrieval_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ k integer not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ total_relevant integer not null,
+ total_returned integer not null,
+ hit_count integer not null,
+ forbidden_hits integer not null,
+ source text not null,
+ returned_json jsonb,
+ created_at timestamptz not null
+ )
+ `;
+ await tx`
+ create index if not exists idx_retrieval_scores_scenario_run
+ on retrieval_scores(scenario_run_id)
+ `;
+ await tx`
+ create index if not exists idx_retrieval_scores_metric
+ on retrieval_scores(metric)
+ `;
+ await tx`update meta set schema_version = 5 where id = 1`;
+ });
+ applied.push(5);
+ }
+ if (from < 6) {
+ await sql.begin(async (tx) => {
+ await tx`
+ create table if not exists demotion_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ timestamp_violation_count integer not null,
+ cascade_bounded boolean,
+ source text not null,
+ observed_json jsonb,
+ expected_json jsonb,
+ created_at timestamptz not null
+ )
+ `;
+ await tx`
+ create index if not exists idx_demotion_scores_scenario_run
+ on demotion_scores(scenario_run_id)
+ `;
+ await tx`
+ create index if not exists idx_demotion_scores_metric
+ on demotion_scores(metric)
+ `;
+ await tx`
+ create table if not exists procedure_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ source text not null,
+ predicted_json jsonb,
+ golden_json jsonb,
+ created_at timestamptz not null
+ )
+ `;
+ await tx`
+ create index if not exists idx_procedure_scores_scenario_run
+ on procedure_scores(scenario_run_id)
+ `;
+ await tx`
+ create index if not exists idx_procedure_scores_metric
+ on procedure_scores(metric)
+ `;
+ await tx`
+ create table if not exists dedup_scores (
+ id bigserial primary key,
+ scenario_run_id bigint not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value double precision not null,
+ weight double precision not null,
+ weighted_score double precision not null,
+ pass_threshold double precision not null,
+ passed boolean not null,
+ item_count integer not null,
+ source text not null,
+ predicted_json jsonb,
+ golden_json jsonb,
+ created_at timestamptz not null
+ )
+ `;
+ await tx`
+ create index if not exists idx_dedup_scores_scenario_run
+ on dedup_scores(scenario_run_id)
+ `;
+ await tx`
+ create index if not exists idx_dedup_scores_metric
+ on dedup_scores(metric)
+ `;
+ await tx`update meta set schema_version = 6 where id = 1`;
+ });
+ applied.push(6);
+ }
return applied;
} finally {
await sql.end?.();
diff --git a/src/providers/persistence/migrations/sqlite.ts b/src/providers/persistence/migrations/sqlite.ts
index 4b91e36..6668bbc 100644
--- a/src/providers/persistence/migrations/sqlite.ts
+++ b/src/providers/persistence/migrations/sqlite.ts
@@ -4,7 +4,7 @@ import { resolveSqlitePath, withSqliteDatabase } from "../sqlite-connection.ts";
import type { MigrationReport, MigrationRunner } from "./types.ts";
/** Target schema version for SQLite. Keep synced with SCHEMA_VERSION in sqlite-run-history.ts. */
-export const SQLITE_TARGET_VERSION = 8;
+export const SQLITE_TARGET_VERSION = 10;
function utcNow(): string {
return new Date().toISOString();
@@ -186,6 +186,73 @@ export function applySqliteBaseline(database: Database): void {
created_at text not null
);
+ create table if not exists retrieval_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ k integer not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ total_relevant integer not null,
+ total_returned integer not null,
+ hit_count integer not null,
+ forbidden_hits integer not null,
+ source text not null,
+ returned_json text,
+ created_at text not null
+ );
+
+ create table if not exists demotion_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ timestamp_violation_count integer not null,
+ cascade_bounded integer,
+ source text not null,
+ observed_json text,
+ expected_json text,
+ created_at text not null
+ );
+
+ create table if not exists procedure_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ source text not null,
+ predicted_json text,
+ golden_json text,
+ created_at text not null
+ );
+
+ create table if not exists dedup_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ item_count integer not null,
+ source text not null,
+ predicted_json text,
+ golden_json text,
+ created_at text not null
+ );
+
create table if not exists presets (
id text primary key,
name text not null unique,
@@ -238,6 +305,22 @@ export function applySqliteBaseline(database: Database): void {
on human_dimension_scores(scenario_run_id, dimension_id);
create index if not exists idx_human_dim_scores_scenario_run
on human_dimension_scores(scenario_run_id);
+ create index if not exists idx_retrieval_scores_scenario_run
+ on retrieval_scores(scenario_run_id);
+ create index if not exists idx_retrieval_scores_metric
+ on retrieval_scores(metric);
+ create index if not exists idx_demotion_scores_scenario_run
+ on demotion_scores(scenario_run_id);
+ create index if not exists idx_demotion_scores_metric
+ on demotion_scores(metric);
+ create index if not exists idx_procedure_scores_scenario_run
+ on procedure_scores(scenario_run_id);
+ create index if not exists idx_procedure_scores_metric
+ on procedure_scores(metric);
+ create index if not exists idx_dedup_scores_scenario_run
+ on dedup_scores(scenario_run_id);
+ create index if not exists idx_dedup_scores_metric
+ on dedup_scores(metric);
`);
}
@@ -331,6 +414,101 @@ export function applySqliteMigrations(
applied.push(8);
version = 8;
}
+ if (version < 9) {
+ database.exec(`
+ create table if not exists retrieval_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ k integer not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ total_relevant integer not null,
+ total_returned integer not null,
+ hit_count integer not null,
+ forbidden_hits integer not null,
+ source text not null,
+ returned_json text,
+ created_at text not null
+ );
+ create index if not exists idx_retrieval_scores_scenario_run
+ on retrieval_scores(scenario_run_id);
+ create index if not exists idx_retrieval_scores_metric
+ on retrieval_scores(metric);
+ `);
+ database.query("update meta set schema_version = ? where id = 1").run(9);
+ applied.push(9);
+ version = 9;
+ }
+ if (version < 10) {
+ database.exec(`
+ create table if not exists demotion_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ timestamp_violation_count integer not null,
+ cascade_bounded integer,
+ source text not null,
+ observed_json text,
+ expected_json text,
+ created_at text not null
+ );
+ create index if not exists idx_demotion_scores_scenario_run
+ on demotion_scores(scenario_run_id);
+ create index if not exists idx_demotion_scores_metric
+ on demotion_scores(metric);
+
+ create table if not exists procedure_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ source text not null,
+ predicted_json text,
+ golden_json text,
+ created_at text not null
+ );
+ create index if not exists idx_procedure_scores_scenario_run
+ on procedure_scores(scenario_run_id);
+ create index if not exists idx_procedure_scores_metric
+ on procedure_scores(metric);
+
+ create table if not exists dedup_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ item_count integer not null,
+ source text not null,
+ predicted_json text,
+ golden_json text,
+ created_at text not null
+ );
+ create index if not exists idx_dedup_scores_scenario_run
+ on dedup_scores(scenario_run_id);
+ create index if not exists idx_dedup_scores_metric
+ on dedup_scores(metric);
+ `);
+ database.query("update meta set schema_version = ? where id = 1").run(10);
+ applied.push(10);
+ version = 10;
+ }
return applied;
}
diff --git a/src/providers/persistence/postgres-backend.ts b/src/providers/persistence/postgres-backend.ts
index fb2d658..fb2ff4b 100644
--- a/src/providers/persistence/postgres-backend.ts
+++ b/src/providers/persistence/postgres-backend.ts
@@ -188,6 +188,10 @@ function mapScenarioRow(
toolCalls: UnknownRecord[],
checkpoints: UnknownRecord[],
judgeDimensionScores: UnknownRecord[],
+ retrievalScores: UnknownRecord[] = [],
+ demotionScores: UnknownRecord[] = [],
+ procedureScores: UnknownRecord[] = [],
+ dedupScores: UnknownRecord[] = [],
): ScenarioRecord {
const failureKindRaw = asStringOrNull(row.failure_kind);
const failureKind =
@@ -269,6 +273,60 @@ function mapScenarioRow(
reasoning: String(score.reasoning ?? ""),
evidence: asJson(score.evidence_json) ?? [],
})),
+ retrievalScores: retrievalScores.map((score) => ({
+ metric: String(score.metric ?? ""),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ k: Number(score.k),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Boolean(score.passed),
+ total_relevant: Number(score.total_relevant),
+ total_returned: Number(score.total_returned),
+ hit_count: Number(score.hit_count),
+ forbidden_hits: Number(score.forbidden_hits),
+ source: String(score.source ?? ""),
+ returned: asJson(score.returned_json) ?? [],
+ })),
+ demotionScores: demotionScores.map((score) => ({
+ metric: String(score.metric ?? ""),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Boolean(score.passed),
+ timestamp_violation_count: Number(score.timestamp_violation_count),
+ cascade_bounded:
+ score.cascade_bounded === null || score.cascade_bounded === undefined
+ ? null
+ : Boolean(score.cascade_bounded),
+ source: String(score.source ?? ""),
+ observed: asJson(score.observed_json) ?? [],
+ expected: asJson(score.expected_json) ?? [],
+ })),
+ procedureScores: procedureScores.map((score) => ({
+ metric: String(score.metric ?? ""),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Boolean(score.passed),
+ source: String(score.source ?? ""),
+ predicted: asJson(score.predicted_json) ?? [],
+ golden: asJson(score.golden_json) ?? [],
+ })),
+ dedupScores: dedupScores.map((score) => ({
+ metric: String(score.metric ?? ""),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Boolean(score.passed),
+ item_count: Number(score.item_count),
+ source: String(score.source ?? ""),
+ predicted: asJson(score.predicted_json) ?? [],
+ golden: asJson(score.golden_json) ?? [],
+ })),
error: asJson>(row.error_json) ?? null,
startedAt: asIsoTimestamp(row.started_at),
completedAt: asIsoTimestampOrNull(row.completed_at),
@@ -308,49 +366,87 @@ async function loadScenarioRecords(
}
if (options.summary) {
- return scenarioRows.map((row) => mapScenarioRow(row, [], [], [], [], []));
+ return scenarioRows.map((row) =>
+ mapScenarioRow(row, [], [], [], [], [], [], [], [], []),
+ );
}
- const [turns, events, toolCalls, checkpoints, dimensionScores] = await span(
- "pg.scenario_children",
- () =>
- Promise.all([
- span(
- "pg.turns",
- () => sql`
+ const [
+ turns,
+ events,
+ toolCalls,
+ checkpoints,
+ dimensionScores,
+ retrievalRows,
+ demotionRows,
+ procedureRows,
+ dedupRows,
+ ] = await span("pg.scenario_children", () =>
+ Promise.all([
+ span(
+ "pg.turns",
+ () => sql`
select * from turns where scenario_run_id in ${sql(ids)}
order by scenario_run_id asc, turn_index asc
`,
- ),
- span(
- "pg.target_events",
- () => sql`
+ ),
+ span(
+ "pg.target_events",
+ () => sql`
select * from target_events where scenario_run_id in ${sql(ids)}
order by scenario_run_id asc, turn_index asc, exchange_index asc
`,
- ),
- span(
- "pg.tool_calls",
- () => sql`
+ ),
+ span(
+ "pg.tool_calls",
+ () => sql`
select * from tool_calls where scenario_run_id in ${sql(ids)}
order by scenario_run_id asc, turn_index asc, call_order asc nulls last
`,
- ),
- span(
- "pg.checkpoints",
- () => sql`
+ ),
+ span(
+ "pg.checkpoints",
+ () => sql`
select * from checkpoints where scenario_run_id in ${sql(ids)}
order by scenario_run_id asc, checkpoint_index asc
`,
- ),
- span(
- "pg.judge_dimension_scores",
- () => sql`
+ ),
+ span(
+ "pg.judge_dimension_scores",
+ () => sql`
select * from judge_dimension_scores where scenario_run_id in ${sql(ids)}
order by scenario_run_id asc, dimension_id asc
`,
- ),
- ]),
+ ),
+ span(
+ "pg.retrieval_scores",
+ () => sql`
+ select * from retrieval_scores where scenario_run_id in ${sql(ids)}
+ order by scenario_run_id asc, id asc
+ `,
+ ),
+ span(
+ "pg.demotion_scores",
+ () => sql`
+ select * from demotion_scores where scenario_run_id in ${sql(ids)}
+ order by scenario_run_id asc, id asc
+ `,
+ ),
+ span(
+ "pg.procedure_scores",
+ () => sql`
+ select * from procedure_scores where scenario_run_id in ${sql(ids)}
+ order by scenario_run_id asc, id asc
+ `,
+ ),
+ span(
+ "pg.dedup_scores",
+ () => sql`
+ select * from dedup_scores where scenario_run_id in ${sql(ids)}
+ order by scenario_run_id asc, id asc
+ `,
+ ),
+ ]),
);
const groupBy = (
@@ -371,6 +467,10 @@ async function loadScenarioRecords(
const toolsByScenario = groupBy(toolCalls, "scenario_run_id");
const checkpointsByScenario = groupBy(checkpoints, "scenario_run_id");
const dimensionsByScenario = groupBy(dimensionScores, "scenario_run_id");
+ const retrievalByScenario = groupBy(retrievalRows, "scenario_run_id");
+ const demotionByScenario = groupBy(demotionRows, "scenario_run_id");
+ const procedureByScenario = groupBy(procedureRows, "scenario_run_id");
+ const dedupByScenario = groupBy(dedupRows, "scenario_run_id");
return scenarioRows.map((row) =>
mapScenarioRow(
@@ -380,6 +480,10 @@ async function loadScenarioRecords(
toolsByScenario.get(Number(row.id)) ?? [],
checkpointsByScenario.get(Number(row.id)) ?? [],
dimensionsByScenario.get(Number(row.id)) ?? [],
+ retrievalByScenario.get(Number(row.id)) ?? [],
+ demotionByScenario.get(Number(row.id)) ?? [],
+ procedureByScenario.get(Number(row.id)) ?? [],
+ dedupByScenario.get(Number(row.id)) ?? [],
),
);
}
diff --git a/src/providers/persistence/postgres-run-recorder.ts b/src/providers/persistence/postgres-run-recorder.ts
index df4a171..09685c6 100644
--- a/src/providers/persistence/postgres-run-recorder.ts
+++ b/src/providers/persistence/postgres-run-recorder.ts
@@ -5,8 +5,12 @@ import type {
AdapterReply,
CheckpointAssertion,
CheckpointResult,
+ DedupScore,
+ DemotionScore,
JsonValue,
Persona,
+ ProcedureScore,
+ RetrievalScore,
Rubric,
RubricScore,
RunResult,
@@ -529,6 +533,106 @@ export class PostgresRunRecorder {
`;
}
+ /**
+ * Persist a retrieval score for one scenario run.
+ *
+ * One `retrieval_scores` row is inserted per entry in `score.metrics`; the
+ * score-level columns (k, weighted score, threshold, pass flag, counts,
+ * source, returned list) carry the same values on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves after the last insert has been awaited.
+ */
+ async recordRetrievalResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: RetrievalScore },
+ ): Promise<void> {
+ const { score } = options;
+ // Redact and serialise once; the payload is the same for every metric row.
+ const returnedJson = json(redactValue(score.returned));
+ // Each insert is awaited before the next one is issued.
+ for (const metric of score.metrics) {
+ await this.sql`
+ insert into retrieval_scores (
+ scenario_run_id, metric, value, weight, k,
+ weighted_score, pass_threshold, passed, total_relevant,
+ total_returned, hit_count, forbidden_hits, source,
+ returned_json, created_at
+ ) values (
+ ${scenarioRunId}, ${metric.metric}, ${metric.value},
+ ${metric.weight}, ${score.k},
+ ${score.weightedScore}, ${score.passThreshold},
+ ${score.passed}, ${score.totalRelevant},
+ ${score.totalReturned}, ${score.hitCount},
+ ${score.forbiddenHits}, ${score.source},
+ ${returnedJson}::jsonb, now()
+ )
+ `;
+ }
+ }
+
+ /**
+ * Persist a demotion score for one scenario run.
+ *
+ * One `demotion_scores` row is inserted per entry in `score.metrics`; the
+ * score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves after the last insert has been awaited.
+ */
+ async recordDemotionResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: DemotionScore },
+ ): Promise<void> {
+ const { score } = options;
+ // `cascadeBounded` is optional; an absent value is stored as SQL NULL.
+ const cascade = score.cascadeBounded ?? null;
+ // Redact and serialise once; both payloads are shared by every metric row.
+ const observedJson = json(redactValue(score.observed));
+ const expectedJson = json(redactValue(score.expected));
+ // Each insert is awaited before the next one is issued.
+ for (const metric of score.metrics) {
+ await this.sql`
+ insert into demotion_scores (
+ scenario_run_id, metric, value, weight,
+ weighted_score, pass_threshold, passed,
+ timestamp_violation_count, cascade_bounded,
+ source, observed_json, expected_json, created_at
+ ) values (
+ ${scenarioRunId}, ${metric.metric}, ${metric.value}, ${metric.weight},
+ ${score.weightedScore}, ${score.passThreshold},
+ ${score.passed}, ${score.timestampViolationCount},
+ ${cascade},
+ ${score.source},
+ ${observedJson}::jsonb, ${expectedJson}::jsonb, now()
+ )
+ `;
+ }
+ }
+
+ /**
+ * Persist a procedure score for one scenario run.
+ *
+ * One `procedure_scores` row is inserted per entry in `score.metrics`; the
+ * score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves after the last insert has been awaited.
+ */
+ async recordProcedureResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: ProcedureScore },
+ ): Promise<void> {
+ const { score } = options;
+ // Redact and serialise once; both step lists are shared by every metric row.
+ const predictedJson = json(redactValue(score.predictedSteps));
+ const goldenJson = json(redactValue(score.goldenSteps));
+ // Each insert is awaited before the next one is issued.
+ for (const metric of score.metrics) {
+ await this.sql`
+ insert into procedure_scores (
+ scenario_run_id, metric, value, weight,
+ weighted_score, pass_threshold, passed, source,
+ predicted_json, golden_json, created_at
+ ) values (
+ ${scenarioRunId}, ${metric.metric}, ${metric.value}, ${metric.weight},
+ ${score.weightedScore}, ${score.passThreshold},
+ ${score.passed}, ${score.source},
+ ${predictedJson}::jsonb, ${goldenJson}::jsonb, now()
+ )
+ `;
+ }
+ }
+
+ /**
+ * Persist a dedup score for one scenario run.
+ *
+ * One `dedup_scores` row is inserted per entry in `score.metrics`; the
+ * score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves after the last insert has been awaited.
+ */
+ async recordDedupResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: DedupScore },
+ ): Promise<void> {
+ const { score } = options;
+ // Redact and serialise once; both cluster lists are shared by every metric row.
+ const predictedJson = json(redactValue(score.predictedClusters));
+ const goldenJson = json(redactValue(score.goldenClusters));
+ // Each insert is awaited before the next one is issued.
+ for (const metric of score.metrics) {
+ await this.sql`
+ insert into dedup_scores (
+ scenario_run_id, metric, value, weight,
+ weighted_score, pass_threshold, passed, item_count,
+ source, predicted_json, golden_json, created_at
+ ) values (
+ ${scenarioRunId}, ${metric.metric}, ${metric.value}, ${metric.weight},
+ ${score.weightedScore}, ${score.passThreshold},
+ ${score.passed}, ${score.itemCount},
+ ${score.source},
+ ${predictedJson}::jsonb, ${goldenJson}::jsonb, now()
+ )
+ `;
+ }
+ }
+
async recordScenarioFinished(
scenarioRunId: number,
options: { result: ScenarioRunResult },
diff --git a/src/providers/persistence/sqlite-backend.ts b/src/providers/persistence/sqlite-backend.ts
index 35c1cb0..5dfec39 100644
--- a/src/providers/persistence/sqlite-backend.ts
+++ b/src/providers/persistence/sqlite-backend.ts
@@ -220,6 +220,10 @@ function projectRunRecord(
toolCalls: [],
checkpoints: [],
judgeDimensionScores: [],
+ retrievalScores: [],
+ demotionScores: [],
+ procedureScores: [],
+ dedupScores: [],
}
: scenario,
);
diff --git a/src/providers/persistence/sqlite-run-history.ts b/src/providers/persistence/sqlite-run-history.ts
index 3d0107f..e8158ca 100644
--- a/src/providers/persistence/sqlite-run-history.ts
+++ b/src/providers/persistence/sqlite-run-history.ts
@@ -7,11 +7,15 @@ import type {
AdapterReply,
CheckpointAssertion,
CheckpointResult,
+ DedupScore,
+ DemotionScore,
Endpoints,
JsonValue,
Persona,
PresetRecord,
PresetSnapshot,
+ ProcedureScore,
+ RetrievalScore,
Rubric,
RubricScore,
RunRecord,
@@ -29,7 +33,7 @@ import { redactDbUrl } from "./url.ts";
export const DEFAULT_DB_DIRNAME = ".agentprobe";
export const DEFAULT_DB_FILENAME = "runs.sqlite3";
-export const SCHEMA_VERSION = 8;
+export const SCHEMA_VERSION = 10;
const REDACTED_VALUE = "[REDACTED]";
const sensitiveExactKeys = new Set([
"access_token",
@@ -285,6 +289,18 @@ function migrateDatabase(database: Database, currentVersion: number): void {
database.query("update meta set schema_version = ? where id = 1").run(8);
version = 8;
}
+ if (version < 9) {
+ ensureRetrievalScoresTable(database);
+ database.query("update meta set schema_version = ? where id = 1").run(9);
+ version = 9;
+ }
+ if (version < 10) {
+ ensureDemotionScoresTable(database);
+ ensureProcedureScoresTable(database);
+ ensureDedupScoresTable(database);
+ database.query("update meta set schema_version = ? where id = 1").run(10);
+ version = 10;
+ }
if (version !== SCHEMA_VERSION) {
throw new AgentProbeRuntimeError(
@@ -335,6 +351,105 @@ function ensureHumanDimensionScoresTable(database: Database): void {
`);
}
+/**
+ * Create the `retrieval_scores` table and its two indexes (on
+ * `scenario_run_id` and on `metric`). Every statement uses `if not exists`,
+ * so calling this on a database that already has them changes nothing.
+ */
+function ensureRetrievalScoresTable(database: Database): void {
+ const retrievalScoresDdl = `
+ create table if not exists retrieval_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ k integer not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ total_relevant integer not null,
+ total_returned integer not null,
+ hit_count integer not null,
+ forbidden_hits integer not null,
+ source text not null,
+ returned_json text,
+ created_at text not null
+ );
+ create index if not exists idx_retrieval_scores_scenario_run
+ on retrieval_scores(scenario_run_id);
+ create index if not exists idx_retrieval_scores_metric
+ on retrieval_scores(metric);
+ `;
+ database.exec(retrievalScoresDdl);
+}
+
+/**
+ * Create the `demotion_scores` table and its two indexes (on
+ * `scenario_run_id` and on `metric`). Every statement uses `if not exists`,
+ * so calling this on a database that already has them changes nothing.
+ */
+function ensureDemotionScoresTable(database: Database): void {
+ const demotionScoresDdl = `
+ create table if not exists demotion_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ timestamp_violation_count integer not null,
+ cascade_bounded integer,
+ source text not null,
+ observed_json text,
+ expected_json text,
+ created_at text not null
+ );
+ create index if not exists idx_demotion_scores_scenario_run
+ on demotion_scores(scenario_run_id);
+ create index if not exists idx_demotion_scores_metric
+ on demotion_scores(metric);
+ `;
+ database.exec(demotionScoresDdl);
+}
+
+/**
+ * Create the `procedure_scores` table and its two indexes (on
+ * `scenario_run_id` and on `metric`). Every statement uses `if not exists`,
+ * so calling this on a database that already has them changes nothing.
+ */
+function ensureProcedureScoresTable(database: Database): void {
+ const procedureScoresDdl = `
+ create table if not exists procedure_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ source text not null,
+ predicted_json text,
+ golden_json text,
+ created_at text not null
+ );
+ create index if not exists idx_procedure_scores_scenario_run
+ on procedure_scores(scenario_run_id);
+ create index if not exists idx_procedure_scores_metric
+ on procedure_scores(metric);
+ `;
+ database.exec(procedureScoresDdl);
+}
+
+/**
+ * Create the `dedup_scores` table and its two indexes (on
+ * `scenario_run_id` and on `metric`). Every statement uses `if not exists`,
+ * so calling this on a database that already has them changes nothing.
+ */
+function ensureDedupScoresTable(database: Database): void {
+ const dedupScoresDdl = `
+ create table if not exists dedup_scores (
+ id integer primary key autoincrement,
+ scenario_run_id integer not null references scenario_runs(id) on delete cascade,
+ metric text not null,
+ value real not null,
+ weight real not null,
+ weighted_score real not null,
+ pass_threshold real not null,
+ passed integer not null,
+ item_count integer not null,
+ source text not null,
+ predicted_json text,
+ golden_json text,
+ created_at text not null
+ );
+ create index if not exists idx_dedup_scores_scenario_run
+ on dedup_scores(scenario_run_id);
+ create index if not exists idx_dedup_scores_metric
+ on dedup_scores(metric);
+ `;
+ database.exec(dedupScoresDdl);
+}
+
function ensurePhase2Schema(database: Database): void {
ensurePhase2RunColumns(database);
database.exec(`
@@ -527,6 +642,10 @@ export function initDb(dbUrl?: string): void {
ensureAppSettingsTable(database);
ensureEndpointOverridesTable(database);
ensureHumanDimensionScoresTable(database);
+ ensureRetrievalScoresTable(database);
+ ensureDemotionScoresTable(database);
+ ensureProcedureScoresTable(database);
+ ensureDedupScoresTable(database);
const meta = database
.query("select schema_version from meta where id = 1")
@@ -1090,6 +1209,162 @@ export class SqliteRunRecorder {
);
}
+ /**
+ * Store a retrieval score in SQLite, one `retrieval_scores` row per entry
+ * in `score.metrics`. Score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves once all rows have been inserted.
+ */
+ async recordRetrievalResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: RetrievalScore },
+ ): Promise<void> {
+ const { score } = options;
+ // One timestamp for the whole batch so every metric row shares created_at.
+ const createdAt = utcNow();
+ // `passed` is an integer column, so the boolean is stored as 1 / 0.
+ const passedFlag = score.passed ? 1 : 0;
+ const returnedJson = encodeJson(redactValue(score.returned));
+ for (const metric of score.metrics) {
+ this.database
+ .query(
+ `
+ insert into retrieval_scores (
+ scenario_run_id, metric, value, weight, k,
+ weighted_score, pass_threshold, passed, total_relevant,
+ total_returned, hit_count, forbidden_hits, source,
+ returned_json, created_at
+ ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ `,
+ )
+ .run(
+ scenarioRunId,
+ metric.metric,
+ metric.value,
+ metric.weight,
+ score.k,
+ score.weightedScore,
+ score.passThreshold,
+ passedFlag,
+ score.totalRelevant,
+ score.totalReturned,
+ score.hitCount,
+ score.forbiddenHits,
+ score.source,
+ returnedJson,
+ createdAt,
+ );
+ }
+ }
+
+ /**
+ * Store a demotion score in SQLite, one `demotion_scores` row per entry in
+ * `score.metrics`. Score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves once all rows have been inserted.
+ */
+ async recordDemotionResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: DemotionScore },
+ ): Promise<void> {
+ const { score } = options;
+ // One timestamp for the whole batch so every metric row shares created_at.
+ const createdAt = utcNow();
+ // `passed` is an integer column, so the boolean is stored as 1 / 0.
+ const passedFlag = score.passed ? 1 : 0;
+ // A missing cascade verdict (undefined or null) is stored as NULL rather
+ // than 0, matching the Postgres recorder's `?? null`, so a reader can tell
+ // "not evaluated" apart from "unbounded".
+ let cascade: number | null = null;
+ if (score.cascadeBounded != null) {
+ cascade = score.cascadeBounded ? 1 : 0;
+ }
+ const observedJson = encodeJson(redactValue(score.observed));
+ const expectedJson = encodeJson(redactValue(score.expected));
+ for (const metric of score.metrics) {
+ this.database
+ .query(
+ `
+ insert into demotion_scores (
+ scenario_run_id, metric, value, weight,
+ weighted_score, pass_threshold, passed,
+ timestamp_violation_count, cascade_bounded,
+ source, observed_json, expected_json, created_at
+ ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ `,
+ )
+ .run(
+ scenarioRunId,
+ metric.metric,
+ metric.value,
+ metric.weight,
+ score.weightedScore,
+ score.passThreshold,
+ passedFlag,
+ score.timestampViolationCount,
+ cascade,
+ score.source,
+ observedJson,
+ expectedJson,
+ createdAt,
+ );
+ }
+ }
+
+ /**
+ * Store a procedure score in SQLite, one `procedure_scores` row per entry
+ * in `score.metrics`. Score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves once all rows have been inserted.
+ */
+ async recordProcedureResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: ProcedureScore },
+ ): Promise<void> {
+ const { score } = options;
+ // One timestamp for the whole batch so every metric row shares created_at.
+ const createdAt = utcNow();
+ // `passed` is an integer column, so the boolean is stored as 1 / 0.
+ const passedFlag = score.passed ? 1 : 0;
+ const predictedJson = encodeJson(redactValue(score.predictedSteps));
+ const goldenJson = encodeJson(redactValue(score.goldenSteps));
+ for (const metric of score.metrics) {
+ this.database
+ .query(
+ `
+ insert into procedure_scores (
+ scenario_run_id, metric, value, weight,
+ weighted_score, pass_threshold, passed, source,
+ predicted_json, golden_json, created_at
+ ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ `,
+ )
+ .run(
+ scenarioRunId,
+ metric.metric,
+ metric.value,
+ metric.weight,
+ score.weightedScore,
+ score.passThreshold,
+ passedFlag,
+ score.source,
+ predictedJson,
+ goldenJson,
+ createdAt,
+ );
+ }
+ }
+
+ /**
+ * Store a dedup score in SQLite, one `dedup_scores` row per entry in
+ * `score.metrics`. Score-level columns are repeated on every row.
+ *
+ * @param scenarioRunId - Written to the `scenario_run_id` column.
+ * @param options - `score` supplies every stored value; `scenario` is not read here.
+ * @returns A promise that resolves once all rows have been inserted.
+ */
+ async recordDedupResult(
+ scenarioRunId: number,
+ options: { scenario: Scenario; score: DedupScore },
+ ): Promise<void> {
+ const { score } = options;
+ // One timestamp for the whole batch so every metric row shares created_at.
+ const createdAt = utcNow();
+ // `passed` is an integer column, so the boolean is stored as 1 / 0.
+ const passedFlag = score.passed ? 1 : 0;
+ const predictedJson = encodeJson(redactValue(score.predictedClusters));
+ const goldenJson = encodeJson(redactValue(score.goldenClusters));
+ for (const metric of score.metrics) {
+ this.database
+ .query(
+ `
+ insert into dedup_scores (
+ scenario_run_id, metric, value, weight,
+ weighted_score, pass_threshold, passed, item_count,
+ source, predicted_json, golden_json, created_at
+ ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+ `,
+ )
+ .run(
+ scenarioRunId,
+ metric.metric,
+ metric.value,
+ metric.weight,
+ score.weightedScore,
+ score.passThreshold,
+ passedFlag,
+ score.itemCount,
+ score.source,
+ predictedJson,
+ goldenJson,
+ createdAt,
+ );
+ }
+ }
+
async recordScenarioFinished(
scenarioRunId: number,
options: { result: ScenarioRunResult },
@@ -1921,6 +2196,26 @@ function getScenarioRecords(
"select * from judge_dimension_scores where scenario_run_id = ? order by dimension_id asc",
)
.all(scenarioRunId) as Array>;
+ const retrievalScores = database
+ .query(
+ "select * from retrieval_scores where scenario_run_id = ? order by id asc",
+ )
+ .all(scenarioRunId) as Array>;
+ const demotionScores = database
+ .query(
+ "select * from demotion_scores where scenario_run_id = ? order by id asc",
+ )
+ .all(scenarioRunId) as Array>;
+ const procedureScores = database
+ .query(
+ "select * from procedure_scores where scenario_run_id = ? order by id asc",
+ )
+ .all(scenarioRunId) as Array>;
+ const dedupScores = database
+ .query(
+ "select * from dedup_scores where scenario_run_id = ? order by id asc",
+ )
+ .all(scenarioRunId) as Array>;
return {
scenarioRunId,
@@ -2038,6 +2333,60 @@ function getScenarioRecords(
reasoning: String(score.reasoning),
evidence: decodeJson(score.evidence_json) ?? [],
})),
+ retrievalScores: retrievalScores.map((score) => ({
+ metric: String(score.metric),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ k: Number(score.k),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Number(score.passed) === 1,
+ total_relevant: Number(score.total_relevant),
+ total_returned: Number(score.total_returned),
+ hit_count: Number(score.hit_count),
+ forbidden_hits: Number(score.forbidden_hits),
+ source: String(score.source),
+ returned: decodeJson(score.returned_json) ?? [],
+ })),
+ demotionScores: demotionScores.map((score) => ({
+ metric: String(score.metric),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Number(score.passed) === 1,
+ timestamp_violation_count: Number(score.timestamp_violation_count),
+ cascade_bounded:
+ score.cascade_bounded === null || score.cascade_bounded === undefined
+ ? null
+ : Number(score.cascade_bounded) === 1,
+ source: String(score.source),
+ observed: decodeJson(score.observed_json) ?? [],
+ expected: decodeJson(score.expected_json) ?? [],
+ })),
+ procedureScores: procedureScores.map((score) => ({
+ metric: String(score.metric),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Number(score.passed) === 1,
+ source: String(score.source),
+ predicted: decodeJson(score.predicted_json) ?? [],
+ golden: decodeJson(score.golden_json) ?? [],
+ })),
+ dedupScores: dedupScores.map((score) => ({
+ metric: String(score.metric),
+ value: Number(score.value),
+ weight: Number(score.weight),
+ weighted_score: Number(score.weighted_score),
+ pass_threshold: Number(score.pass_threshold),
+ passed: Number(score.passed) === 1,
+ item_count: Number(score.item_count),
+ source: String(score.source),
+ predicted: decodeJson(score.predicted_json) ?? [],
+ golden: decodeJson(score.golden_json) ?? [],
+ })),
error: decodeJson>(row.error_json) ?? null,
startedAt: String(row.started_at),
completedAt:
diff --git a/src/runtime/server/default-presets.ts b/src/runtime/server/default-presets.ts
index 8302902..3eae9c0 100644
--- a/src/runtime/server/default-presets.ts
+++ b/src/runtime/server/default-presets.ts
@@ -40,7 +40,95 @@ export const PRE_RELEASE_DEFAULT_PRESET: PresetWriteInput = {
dryRun: false,
};
-const DEFAULT_PRESETS = [PRE_RELEASE_DEFAULT_PRESET] as const;
+// The three memory packs that together cover the full dream-system
+// roadmap surface: multi-session conversational (judge), retrieval
+// ranking, and the dream-validation trio (demotion/procedure/dedup).
+// Each pack is listed as explicit scenario IDs that FULL_MEMORY_DEFAULT_PRESET
+// pairs with the pack's source file, so the preset is stable across
+// scenario reorders and additive YAML changes.
+//
+// Scenario IDs selected from multi-session-memory.yaml.
+const MULTI_SESSION_MEMORY_SCENARIO_IDS = [
+ "mem-retention-basic-identity",
+ "mem-retention-incidental-facts",
+ "mem-distill-authed-http-image-gen",
+ "mem-distill-onboarding-workflow",
+ "mem-distill-weekly-report-format",
+ "mem-distill-implicit-tool-preferences",
+ "mem-distill-lead-cleaning-procedure",
+ "mem-rigidity-email-tone-override",
+ "mem-rigidity-tool-migration",
+ "mem-rigidity-pricing-update",
+ "mem-abstain-ambiguous-reference",
+ "mem-abstain-no-fabricated-preferences",
+ "mem-temporal-stale-team-member",
+ "mem-temporal-deprecated-procedure",
+ "mem-continuation-interrupted-task",
+ "mem-continuation-project-state",
+ "mem-crossdomain-business-identity",
+ "mem-crossdomain-customer-allergy-with-negative",
+ "mem-crossdomain-pricing-structure-with-negative",
+ "mem-crossdomain-shipping-schedule-reasoning",
+ "mem-crossdomain-notion-rate-limit",
+ "mem-procupdate-clean-replacement",
+ "mem-procupdate-additive",
+ "mem-compositional-board-prep",
+ "mem-introspection-what-do-you-know",
+ "mem-introspection-gaps",
+ "mem-longtail-lawyer-recall",
+ "mem-hygiene-bounded-time",
+ "mem-hygiene-temporary-status",
+ "mem-negative-one-off-qualifier",
+ "mem-negative-forget-on-request",
+] as const;
+
+// Scenario IDs that FULL_MEMORY_DEFAULT_PRESET selects from
+// retrieval-memory.yaml.
+const RETRIEVAL_MEMORY_SCENARIO_IDS = [
+ "mem-retrieval-forget-on-request",
+ "mem-retrieval-warm-context-sarah",
+ "mem-retrieval-stale-fact-demotion",
+ "mem-retrieval-scope-filter-project",
+ "mem-retrieval-cascading-expiry",
+] as const;
+
+// Scenario IDs that FULL_MEMORY_DEFAULT_PRESET selects from
+// dream-validation.yaml: five demotion, two procedure, and two dedup cases.
+const DREAM_VALIDATION_SCENARIO_IDS = [
+ "dream-demotion-retract-discipline",
+ "dream-demotion-snodgrass-violation",
+ "dream-demotion-stale-fact",
+ "dream-demotion-cascade-bounded",
+ "dream-demotion-cascade-runaway",
+ "dream-procedure-weekly-report",
+ "dream-procedure-client-onboarding",
+ "dream-dedup-near-duplicates",
+ "dream-dedup-false-positive",
+] as const;
+
+// Pairs every scenario id with the YAML file it is selected from.
+function scenariosFromFile(
+ file: string,
+ ids: readonly string[],
+): Array<{ file: string; id: string }> {
+ return ids.map((id) => ({ file, id }));
+}
+
+/**
+ * Preset that selects every memory scenario: the multi-session pack, the
+ * retrieval pack, and the dream-validation pack, in that order. Runs
+ * sequentially (parallel disabled), once per scenario, and not as a dry run.
+ */
+export const FULL_MEMORY_DEFAULT_PRESET: PresetWriteInput = {
+ name: "Full Memory Suite",
+ description:
+ "All memory-related scenarios in one preset: multi-session conversational, retrieval ranking, and dream-system validation (demotion / procedure / dedup). Covers the full P-1 -> P2 dream-system roadmap surface.",
+ endpoint: "autogpt-endpoint.yaml",
+ personas: "personas.yaml",
+ rubric: "rubric.yaml",
+ selection: [
+ ...scenariosFromFile(
+ "multi-session-memory.yaml",
+ MULTI_SESSION_MEMORY_SCENARIO_IDS,
+ ),
+ ...scenariosFromFile(
+ "retrieval-memory.yaml",
+ RETRIEVAL_MEMORY_SCENARIO_IDS,
+ ),
+ ...scenariosFromFile(
+ "dream-validation.yaml",
+ DREAM_VALIDATION_SCENARIO_IDS,
+ ),
+ ],
+ parallel: { enabled: false, limit: null },
+ repeat: 1,
+ dryRun: false,
+};
+
+// Built-in presets defined in this module: the pre-release preset followed
+// by the full memory suite.
+const DEFAULT_PRESETS = [
+ PRE_RELEASE_DEFAULT_PRESET,
+ FULL_MEMORY_DEFAULT_PRESET,
+] as const;
export type DefaultPresetSeedResult = {
name: string;
diff --git a/src/shared/types/contracts.ts b/src/shared/types/contracts.ts
index 5bea5ba..98b7bc5 100644
--- a/src/shared/types/contracts.ts
+++ b/src/shared/types/contracts.ts
@@ -386,6 +386,125 @@ export type Session = {
turns: TurnType[];
};
+/** How a returned item is compared against a golden or forbidden item. */
+export type RetrievalMatchPolicy = "exact" | "substring" | "regex";
+
+/** Names of the ranking metrics that can be weighted in a retrieval score. */
+export type RetrievalMetricKey =
+ | "precision_at_k"
+ | "recall_at_k"
+ | "mrr"
+ | "ndcg_at_k";
+
+/** Optional per-metric weights; any subset of the metric keys may be given. */
+export type RetrievalMetricWeights = Partial<
+ Record<RetrievalMetricKey, number>
+>;
+
+export type RetrievalSource = {
+ /**
+ * Path resolved relative to the scenario YAML file.
+ * The file must contain a JSON array of strings or objects with a `label` field.
+ */
+ fixture?: string;
+ /**
+ * Key on the last assistant reply's `rawExchange` payload that holds the
+ * returned items. Defaults to `retrieved`.
+ */
+ rawExchangeKey?: string;
+};
+
+export type RetrievalConfig = {
+ /** Ordered list of items the retriever is expected to surface. */
+ golden: string[];
+ /**
+ * Items that MUST NOT appear in the top-k. A forbidden hit forces a fail
+ * regardless of the weighted score (used for forget / scope-filter probes).
+ */
+ forbidden: string[];
+ /** Rank cutoff. Defaults to max(|golden|, |returned|, 1) when omitted. */
+ k?: number;
+ /** Per-metric weights for the weighted aggregate score; every metric key is present. */
+ weights: Required<RetrievalMetricWeights>;
+ /** Pass threshold on the weighted aggregate score. Defaults to 0.5. */
+ passThreshold: number;
+ /** Match policy applied to each pair of returned vs golden / forbidden items. */
+ match: RetrievalMatchPolicy;
+ /** Where to look for the retrieved list at scoring time. */
+ source?: RetrievalSource;
+};
+
+/** A `(rawExchangeKey | fixture)` resolution mirroring `RetrievalSource`. */
+export type DreamSource = {
+ fixture?: string;
+ rawExchangeKey?: string;
+};
+
+/** Names of the metrics that can be weighted in a demotion score. */
+export type DemotionMetricKey =
+ | "set_precision"
+ | "set_recall"
+ | "set_f1"
+ | "timestamp_discipline"
+ | "cascade_bounded"
+ | "cascade_direct_f1";
+
+/** Optional per-metric weights; any subset of the metric keys may be given. */
+export type DemotionMetricWeights = Partial<Record<DemotionMetricKey, number>>;
+
+// NOTE(review): presumably one observed demotion entry as read by the
+// demotion scorer; field semantics are not visible here, so confirm there.
+export type DemotionAction = {
+ uuid: string;
+ label?: string;
+ expiredAtSet: boolean;
+ invalidAtSet: boolean;
+ status?: string;
+};
+
+export type DemotionCascade = {
+ /** Edges that should be touched (1-hop neighbors of the invalidated entity). */
+ expectedDirectNeighbors: string[];
+ /** Edges that MUST NOT be touched (2+ hops). */
+ tangentialEdges: string[];
+};
+
+export type DemotionConfig = {
+ /** Expected set of demoted edge / memory UUIDs. */
+ expectedDemotions: string[];
+ /** Optional retract-vs-soft-delete discipline check. */
+ expectedRetracts?: string[];
+ /** Optional cascade check (P0.3b). */
+ cascade?: DemotionCascade;
+ /** Per-metric weights for the weighted score; every metric key is present. */
+ weights: Required<DemotionMetricWeights>;
+ /** Threshold compared against the weighted score. */
+ passThreshold: number;
+ /** Where the observed demotions are read from at scoring time. */
+ source?: DreamSource;
+};
+
+/** Names of the metrics that can be weighted in a procedure score. */
+export type ProcedureMetricKey =
+ | "step_coverage"
+ | "step_order"
+ | "parameter_coverage";
+
+/** Optional per-metric weights; any subset of the metric keys may be given. */
+export type ProcedureMetricWeights = Partial<
+ Record<ProcedureMetricKey, number>
+>;
+
+export type ProcedureConfig = {
+ /** Ordered list of expected step labels. */
+ goldenSteps: string[];
+ /** Optional parameter names the procedure must surface. */
+ goldenParameters?: string[];
+ /** Per-metric weights for the weighted score; every metric key is present. */
+ weights: Required<ProcedureMetricWeights>;
+ /** Threshold compared against the weighted score. */
+ passThreshold: number;
+ /** Where the predicted steps are read from at scoring time. */
+ source?: DreamSource;
+};
+
+/** Scenario-level configuration for the dedup (cluster merge) check. */
+export type DedupConfig = {
+ /**
+ * Expected clusters. Each inner list is a cluster of item IDs that should
+ * be merged together. Items present in the predicted set but absent here
+ * are treated as singletons.
+ */
+ goldenClusters: string[][];
+ /** One weight per dedup metric: precision, recall, f1, and ari. */
+ weights: Record<DedupMetricKey, number>;
+ /** Threshold compared against the weighted score. */
+ passThreshold: number;
+ /** Where the predicted clusters are read from at scoring time. */
+ source?: DreamSource;
+};
+
export type Scenario = {
id: string;
name: string;
@@ -401,6 +520,10 @@ export type Scenario = {
turns: TurnType[];
sessions: Session[];
expectations: ScenarioExpectations;
+ retrieval?: RetrievalConfig;
+ demotion?: DemotionConfig;
+ procedure?: ProcedureConfig;
+ dedup?: DedupConfig;
[key: string]: unknown;
};
@@ -473,6 +596,83 @@ export type RubricScore = {
failureModeDetected?: string | null;
};
+/** One metric's contribution to a retrieval score. */
+export type RetrievalMetricScore = {
+ metric: RetrievalMetricKey;
+ value: number;
+ weight: number;
+};
+
+/**
+ * Result of scoring one scenario's retrieval block. The recorders persist
+ * one row per entry in `metrics`, repeating the other fields on each row.
+ */
+export type RetrievalScore = {
+ k: number;
+ totalRelevant: number;
+ totalReturned: number;
+ hitCount: number;
+ forbiddenHits: number;
+ metrics: RetrievalMetricScore[];
+ weightedScore: number;
+ passThreshold: number;
+ passed: boolean;
+ /** The returned list (in rank order) used to score, captured for replay. */
+ returned: string[];
+ /** Where the returned list came from: `fixture` | `raw_exchange` | `missing`. */
+ source: EvalSource;
+};
+
+/** Origin of the data a scorer evaluated; shared by every score type. */
+export type EvalSource = "fixture" | "raw_exchange" | "missing";
+
+/** One metric's contribution to a demotion score. */
+export type DemotionMetricScore = {
+ metric: DemotionMetricKey;
+ value: number;
+ weight: number;
+};
+
+/**
+ * Result of scoring one scenario's demotion block. The recorders persist
+ * one row per entry in `metrics`, repeating the other fields on each row.
+ */
+export type DemotionScore = {
+ metrics: DemotionMetricScore[];
+ weightedScore: number;
+ passThreshold: number;
+ passed: boolean;
+ observed: string[];
+ expected: string[];
+ /** Optional; the recorders store an absent value as NULL. */
+ cascadeBounded?: boolean;
+ timestampViolationCount: number;
+ source: EvalSource;
+};
+
+/** One metric's contribution to a procedure score. */
+export type ProcedureMetricScore = {
+ metric: ProcedureMetricKey;
+ value: number;
+ weight: number;
+};
+
+/**
+ * Result of scoring one scenario's procedure block. The recorders persist
+ * one row per entry in `metrics`, repeating the other fields on each row.
+ */
+export type ProcedureScore = {
+ metrics: ProcedureMetricScore[];
+ weightedScore: number;
+ passThreshold: number;
+ passed: boolean;
+ predictedSteps: string[];
+ goldenSteps: string[];
+ source: EvalSource;
+};
+
+/** Names of the metrics reported in a dedup score. */
+export type DedupMetricKey = "precision" | "recall" | "f1" | "ari";
+
+/** One metric's contribution to a dedup score. */
+export type DedupMetricScore = {
+ metric: DedupMetricKey;
+ value: number;
+ weight: number;
+};
+
+/**
+ * Result of scoring one scenario's dedup block. The recorders persist one
+ * row per entry in `metrics`, repeating the other fields on each row.
+ */
+export type DedupScore = {
+ metrics: DedupMetricScore[];
+ weightedScore: number;
+ passThreshold: number;
+ passed: boolean;
+ predictedClusters: string[][];
+ goldenClusters: string[][];
+ itemCount: number;
+ source: EvalSource;
+};
+
export type ScenarioRunResult = {
scenarioId: string;
scenarioName: string;
@@ -486,6 +686,10 @@ export type ScenarioRunResult = {
checkpoints: CheckpointResult[];
toolCallsByTurn?: Record;
judgeScore?: RubricScore;
+ retrievalScore?: RetrievalScore;
+ demotionScore?: DemotionScore;
+ procedureScore?: ProcedureScore;
+ dedupScore?: DedupScore;
renderedTurns?: Array>;
};
@@ -622,6 +826,10 @@ export type ScenarioRecord = {
toolCalls: Array>;
checkpoints: Array>;
judgeDimensionScores: Array>;
+ retrievalScores?: Array>;
+ demotionScores?: Array>;
+ procedureScores?: Array>;
+ dedupScores?: Array>;
error?: Record | null;
startedAt: string;
completedAt?: string | null;
diff --git a/tests/unit/db.test.ts b/tests/unit/db.test.ts
index f9774d3..96ba1fa 100644
--- a/tests/unit/db.test.ts
+++ b/tests/unit/db.test.ts
@@ -214,6 +214,10 @@ describe("sqlite recorder", () => {
"checkpoints",
"judge_dimension_scores",
"human_dimension_scores",
+ "retrieval_scores",
+ "demotion_scores",
+ "procedure_scores",
+ "dedup_scores",
"presets",
"preset_scenarios",
"app_settings",
@@ -223,7 +227,7 @@ describe("sqlite recorder", () => {
}
expect(
database.query("select schema_version from meta where id = 1").get(),
- ).toEqual({ schema_version: 8 });
+ ).toEqual({ schema_version: 10 });
} finally {
database.close();
}
@@ -706,7 +710,7 @@ describe("sqlite recorder", () => {
}
expect(
migrated.query("select schema_version from meta where id = 1").get(),
- ).toEqual({ schema_version: 8 });
+ ).toEqual({ schema_version: 10 });
} finally {
migrated.close();
}
diff --git a/tests/unit/dream-validation.test.ts b/tests/unit/dream-validation.test.ts
new file mode 100644
index 0000000..04fa01d
--- /dev/null
+++ b/tests/unit/dream-validation.test.ts
@@ -0,0 +1,120 @@
+import { describe, expect, test } from "bun:test";
+import { existsSync } from "node:fs";
+import { resolve } from "node:path";
+
+import { scoreScenarioDedup } from "../../src/domains/evaluation/dedup-scorer.ts";
+import { scoreScenarioDemotion } from "../../src/domains/evaluation/demotion-scorer.ts";
+import { scoreScenarioProcedure } from "../../src/domains/evaluation/procedure-scorer.ts";
+import { parseScenarioYaml } from "../../src/domains/validation/load-suite.ts";
+
+// Absolute path of the dream-validation pack: two directories above this
+// test file, under data/.
+const SCENARIOS_PATH = resolve(
+ import.meta.dir,
+ "../../data/dream-validation.yaml",
+);
+
+describe("dream-validation pack", () => {
+ // The pack is parsed once, while the suite is being defined; tests only read it.
+ const { scenarios } = parseScenarioYaml(SCENARIOS_PATH);
+
+ // Builds a fresh options object for every scorer call.
+ const scoringOptions = () => ({ scenariosPath: SCENARIOS_PATH });
+
+ // Returns the scenario with the given id, or throws so that a renamed or
+ // removed scenario fails loudly instead of scoring `undefined`.
+ function requireScenario(id: string) {
+ const match = scenarios.find((candidate) => candidate.id === id);
+ if (!match) {
+ throw new Error(`Missing scenario: ${id}`);
+ }
+ return match;
+ }
+
+ test("ships at least four demotion, two procedure, and two dedup scenarios", () => {
+ const demotionCount = scenarios.filter(
+ (candidate) => candidate.demotion !== undefined,
+ ).length;
+ const procedureCount = scenarios.filter(
+ (candidate) => candidate.procedure !== undefined,
+ ).length;
+ const dedupCount = scenarios.filter(
+ (candidate) => candidate.dedup !== undefined,
+ ).length;
+ expect(demotionCount).toBeGreaterThanOrEqual(4);
+ expect(procedureCount).toBeGreaterThanOrEqual(2);
+ expect(dedupCount).toBeGreaterThanOrEqual(2);
+ });
+
+ test("every scenario references a fixture that exists on disk", () => {
+ // Fixture paths are relative to the directory holding the pack YAML.
+ const packDir = resolve(SCENARIOS_PATH, "..");
+ for (const { demotion, procedure, dedup } of scenarios) {
+ const fixture =
+ demotion?.source?.fixture ??
+ procedure?.source?.fixture ??
+ dedup?.source?.fixture;
+ expect(fixture).toBeDefined();
+ expect(existsSync(resolve(packDir, fixture ?? ""))).toBe(true);
+ }
+ });
+
+ test("Snodgrass-respecting retract scenario passes", () => {
+ const result = scoreScenarioDemotion(
+ requireScenario("dream-demotion-retract-discipline"),
+ scoringOptions(),
+ );
+ expect(result?.source).toBe("fixture");
+ expect(result?.timestampViolationCount).toBe(0);
+ expect(result?.passed).toBe(true);
+ });
+
+ test("Snodgrass-violating retract scenario fails on timestamp discipline", () => {
+ const result = scoreScenarioDemotion(
+ requireScenario("dream-demotion-snodgrass-violation"),
+ scoringOptions(),
+ );
+ expect(result?.timestampViolationCount).toBeGreaterThan(0);
+ expect(result?.passed).toBe(false);
+ });
+
+ test("bounded cascade scenario passes (single-hop discipline held)", () => {
+ const result = scoreScenarioDemotion(
+ requireScenario("dream-demotion-cascade-bounded"),
+ scoringOptions(),
+ );
+ expect(result?.cascadeBounded).toBe(true);
+ expect(result?.passed).toBe(true);
+ });
+
+ test("runaway cascade scenario fails on cascade_bounded", () => {
+ const result = scoreScenarioDemotion(
+ requireScenario("dream-demotion-cascade-runaway"),
+ scoringOptions(),
+ );
+ expect(result?.cascadeBounded).toBe(false);
+ expect(result?.passed).toBe(false);
+ });
+
+ test("weekly-report procedure scenario passes against its golden", () => {
+ const result = scoreScenarioProcedure(
+ requireScenario("dream-procedure-weekly-report"),
+ scoringOptions(),
+ );
+ expect(result?.source).toBe("fixture");
+ expect(result?.passed).toBe(true);
+ });
+
+ test("client-onboarding procedure scenario passes against its golden", () => {
+ const result = scoreScenarioProcedure(
+ requireScenario("dream-procedure-client-onboarding"),
+ scoringOptions(),
+ );
+ expect(result?.passed).toBe(true);
+ });
+
+ test("clean dedup scenario passes (no over-merge, no under-merge)", () => {
+ const result = scoreScenarioDedup(
+ requireScenario("dream-dedup-near-duplicates"),
+ scoringOptions(),
+ );
+ expect(result?.passed).toBe(true);
+ });
+
+ test("over-merge dedup scenario fails on pairwise precision + ARI", () => {
+ const result = scoreScenarioDedup(
+ requireScenario("dream-dedup-false-positive"),
+ scoringOptions(),
+ );
+ expect(result?.passed).toBe(false);
+ });
+});
diff --git a/tests/unit/load-suite.test.ts b/tests/unit/load-suite.test.ts
index 7bc02b3..a4a23e6 100644
--- a/tests/unit/load-suite.test.ts
+++ b/tests/unit/load-suite.test.ts
@@ -49,6 +49,171 @@ describe("scenario parsing", () => {
expect(scenario?.context?.copilotMode).toBe("fast");
});
+ test("parses a retrieval block with defaults", () => {
+ const path = join(
+ makeTempDir("scenario-retrieval-basic"),
+ "scenarios.yaml",
+ );
+ writeFileSync(
+ path,
+ [
+ "defaults:",
+ " persona: shopper",
+ " rubric: support",
+ "scenarios:",
+ " - id: retrieval-basic",
+ ' name: "Retrieval basic"',
+ " turns:",
+ " - role: user",
+ ' content: "what do we have on Sarah?"',
+ " expectations:",
+ ' expected_behavior: "Surface gold items."',
+ " expected_outcome: resolved",
+ " retrieval:",
+ " golden:",
+ ' - "Sarah\\u0027s email"',
+ ' - "Atlas project status"',
+ " k: 5",
+ "",
+ ].join("\n"),
+ "utf8",
+ );
+ // The YAML above sets only golden and k; the assertions below pin the values parsed for the omitted weight, pass_threshold and match keys.
+ const parsed = parseScenarioYaml(path);
+ const scenario = parsed.scenarios[0];
+
+ expect(scenario?.retrieval).toBeDefined();
+ expect(scenario?.retrieval?.golden).toEqual([
+ "Sarah's email",
+ "Atlas project status",
+ ]);
+ expect(scenario?.retrieval?.k).toBe(5);
+ expect(scenario?.retrieval?.weights.precision_at_k).toBe(1);
+ expect(scenario?.retrieval?.weights.recall_at_k).toBe(1);
+ expect(scenario?.retrieval?.weights.mrr).toBe(1);
+ expect(scenario?.retrieval?.weights.ndcg_at_k).toBe(1);
+ expect(scenario?.retrieval?.passThreshold).toBe(0.5);
+ expect(scenario?.retrieval?.match).toBe("substring");
+ });
+
+ test("parses retrieval block with custom weights, forbidden, threshold, and source", () => {
+ const path = join(
+ makeTempDir("scenario-retrieval-custom"),
+ "scenarios.yaml",
+ );
+ writeFileSync(
+ path,
+ [
+ "defaults:",
+ " persona: shopper",
+ " rubric: support",
+ "scenarios:",
+ " - id: retrieval-custom",
+ ' name: "Retrieval custom"',
+ " turns:",
+ " - role: user",
+ ' content: "what is our Q2 budget?"',
+ " expectations:",
+ ' expected_behavior: "Honor forget request."',
+ " expected_outcome: resolved",
+ " retrieval:",
+ " golden:",
+ ' - "I do not have that"',
+ " forbidden:",
+ ' - "$50K"',
+ " k: 3",
+ " pass_threshold: 0.6",
+ " match: substring",
+ " weight:",
+ " precision_at_k: 0.5",
+ " recall_at_k: 2.0",
+ " mrr: 1.0",
+ " ndcg_at_k: 1.5",
+ " source:",
+ ' raw_exchange_key: "memories"',
+ "",
+ ].join("\n"),
+ "utf8",
+ );
+ // YAML keys are snake_case (weight, pass_threshold, raw_exchange_key); the parsed fields asserted below are weights, passThreshold and rawExchangeKey.
+ const parsed = parseScenarioYaml(path);
+ const scenario = parsed.scenarios[0];
+
+ expect(scenario?.retrieval?.forbidden).toEqual(["$50K"]);
+ expect(scenario?.retrieval?.k).toBe(3);
+ expect(scenario?.retrieval?.passThreshold).toBeCloseTo(0.6, 6);
+ expect(scenario?.retrieval?.weights.recall_at_k).toBe(2);
+ expect(scenario?.retrieval?.weights.precision_at_k).toBe(0.5);
+ expect(scenario?.retrieval?.source?.rawExchangeKey).toBe("memories");
+ });
+
+ test("rejects retrieval config with empty golden", () => {
+ const path = join(
+ makeTempDir("scenario-retrieval-empty-golden"),
+ "scenarios.yaml",
+ );
+ writeFileSync(
+ path,
+ [
+ "defaults:",
+ " persona: shopper",
+ " rubric: support",
+ "scenarios:",
+ " - id: retrieval-empty",
+ ' name: "Retrieval empty"',
+ " turns:",
+ " - role: user",
+ ' content: "x"',
+ " expectations:",
+ ' expected_behavior: "x"',
+ " expected_outcome: resolved",
+ " retrieval:",
+ " golden: []",
+ "",
+ ].join("\n"),
+ "utf8",
+ );
+ // An empty golden list must make parsing throw instead of yielding a scenario.
+ expect(() => parseScenarioYaml(path)).toThrow(
+ /retrieval.golden must be a non-empty/,
+ );
+ });
+
+ test("rejects unknown retrieval metric weight keys", () => {
+ const path = join(
+ makeTempDir("scenario-retrieval-bad-weight"),
+ "scenarios.yaml",
+ );
+ writeFileSync(
+ path,
+ [
+ "defaults:",
+ " persona: shopper",
+ " rubric: support",
+ "scenarios:",
+ " - id: retrieval-bad-weight",
+ ' name: "Retrieval bad weight"',
+ " turns:",
+ " - role: user",
+ ' content: "x"',
+ " expectations:",
+ ' expected_behavior: "x"',
+ " expected_outcome: resolved",
+ " retrieval:",
+ " golden:",
+ ' - "foo"',
+ " weight:",
+ " hit_rate: 1.0",
+ "",
+ ].join("\n"),
+ "utf8",
+ );
+ // hit_rate is not among the weight keys exercised elsewhere in this file (precision_at_k, recall_at_k, mrr, ndcg_at_k); parsing must throw.
+ expect(() => parseScenarioYaml(path)).toThrow(
+ /Unknown retrieval metric key/,
+ );
+ });
+
test("parses session max_turns and scenario base_date", () => {
const path = join(makeTempDir("scenario-sessions"), "scenarios.yaml");
writeFileSync(
diff --git a/tests/unit/persistence/drizzle-schema.test.ts b/tests/unit/persistence/drizzle-schema.test.ts
index a3b5b1a..c28462e 100644
--- a/tests/unit/persistence/drizzle-schema.test.ts
+++ b/tests/unit/persistence/drizzle-schema.test.ts
@@ -12,12 +12,16 @@ import {
const expectedTables = [
"app_settings",
"checkpoints",
+ "dedup_scores",
+ "demotion_scores",
"endpoint_overrides",
"human_dimension_scores",
"judge_dimension_scores",
"meta",
"preset_scenarios",
"presets",
+ "procedure_scores",
+ "retrieval_scores",
"runs",
"scenario_runs",
"target_events",
@@ -33,12 +37,12 @@ function schemaTableNames(schema: Record): string[] {
describe("Drizzle schema mirrors persistence schema contracts", () => {
test("declares the complete SQLite table inventory for the current target version", () => {
- expect(SQLITE_TARGET_VERSION).toBe(8);
+ expect(SQLITE_TARGET_VERSION).toBe(10);
expect(schemaTableNames(sqliteSchema)).toEqual(expectedTables);
});
test("declares the complete Postgres table inventory for the current target version", () => {
- expect(POSTGRES_TARGET_VERSION).toBe(4);
+ expect(POSTGRES_TARGET_VERSION).toBe(6);
expect(schemaTableNames(postgresSchema)).toEqual(expectedTables);
});
});
diff --git a/tests/unit/persistence/migrations.test.ts b/tests/unit/persistence/migrations.test.ts
index 5b5a4d1..4e91bba 100644
--- a/tests/unit/persistence/migrations.test.ts
+++ b/tests/unit/persistence/migrations.test.ts
@@ -76,7 +76,7 @@ describe("migration dispatcher", () => {
const url = `sqlite:///${path}`;
const report = await runMigrations(url);
expect(report.currentVersion).toBe(1);
- expect(report.applied).toEqual([2, 3, 4, 5, 6, 7, 8]);
+ expect(report.applied).toEqual([2, 3, 4, 5, 6, 7, 8, 9, 10]);
expect(report.targetVersion).toBe(SQLITE_TARGET_VERSION);
});
@@ -94,7 +94,7 @@ describe("migration dispatcher", () => {
const report = await runMigrations(url);
expect(report.currentVersion).toBe(2);
- expect(report.applied).toEqual([3, 4]);
+ expect(report.applied).toEqual([3, 4, 5, 6]);
expect(report.targetVersion).toBe(POSTGRES_TARGET_VERSION);
const check = await checkSchemaVersion(url);
diff --git a/tests/unit/retrieval-memory.test.ts b/tests/unit/retrieval-memory.test.ts
new file mode 100644
index 0000000..46fd8bb
--- /dev/null
+++ b/tests/unit/retrieval-memory.test.ts
@@ -0,0 +1,113 @@
+import { describe, expect, test } from "bun:test";
+import { resolve } from "node:path";
+import { scoreRetrieval } from "../../src/domains/evaluation/retrieval-scorer.ts";
+import {
+ parseRubricsYaml,
+ parseScenarioYaml,
+} from "../../src/domains/validation/load-suite.ts";
+
+// Directory that holds the YAML data files, located two levels above this
+// test file (tests/unit -> repository root -> data).
+const DATA_DIR = resolve(import.meta.dir, "..", "..", "data");
+
+// Scenario pack exercised by this suite.
+const SCENARIOS_PATH = resolve(DATA_DIR, "retrieval-memory.yaml");
+// Rubric definitions that the scenarios in the pack are checked against.
+const RUBRIC_PATH = resolve(DATA_DIR, "rubric.yaml");
+
+describe("retrieval-memory pack", () => {
+ const parsed = parseScenarioYaml(SCENARIOS_PATH);
+ const scenarios = parsed.scenarios;
+
+  test("declares at least five ranking-scored scenarios", () => {
+    const count = scenarios.filter((s) => s.retrieval !== undefined).length;
+    expect(count).toBeGreaterThanOrEqual(5); // only scenarios with a retrieval block are counted
+  });
+
+  test("every scenario references a known memory rubric", () => {
+    const { rubrics: declared } = parseRubricsYaml(RUBRIC_PATH);
+    const knownIds = new Set(declared.map((rubric) => rubric.id));
+    for (const entry of scenarios) {
+      expect(entry.rubric).toBeDefined();
+      expect(knownIds.has(entry.rubric ?? "")).toBe(true);
+    }
+  });
+
+  test("each retrieval block uses fixture source that exists relative to the YAML", () => {
+    // node:fs is loaded once up front; existsSync is a synchronous existence check.
+    const { existsSync } = require("node:fs");
+    for (const entry of scenarios) {
+      const fixturePath = entry.retrieval?.source?.fixture;
+      expect(fixturePath).toBeDefined();
+      // The fixture path is resolved against the directory that holds the YAML pack.
+      const absolutePath = resolve(SCENARIOS_PATH, "..", fixturePath ?? "");
+      expect(existsSync(absolutePath)).toBe(true);
+    }
+  });
+
+  /** Returns the scenario with the given id, or throws when the pack lacks it. */
+  function requireScenario(id: string) {
+    const found = scenarios.find((candidate) => candidate.id === id);
+    // `find` yields undefined on a miss; anything it found is returned as-is.
+    if (found) return found;
+    throw new Error(`Missing scenario in pack: ${id}`);
+  }
+
+  test("forget-on-request scenario forbids the budget figure and passes against the fixture", () => {
+    const forgetScenario = requireScenario("mem-retrieval-forget-on-request");
+    expect(forgetScenario.retrieval?.forbidden ?? []).toContain("$50K");
+
+    // NOTE(review): only the forbidden-hit count is asserted; `passed` is not checked here.
+    const options = { scenariosPath: SCENARIOS_PATH };
+    const result = scoreRetrieval(forgetScenario, options);
+    expect(result?.forbiddenHits).toBe(0);
+  });
+
+  test("warm-context scenario scores its happy-path fixture as passed", () => {
+    // Expect a fixture-sourced score that passes with a hit count of at least two.
+    const options = { scenariosPath: SCENARIOS_PATH };
+    const warmContext = requireScenario("mem-retrieval-warm-context-sarah");
+    const result = scoreRetrieval(warmContext, options);
+
+    expect(result?.source).toBe("fixture");
+    expect(result?.passed).toBe(true);
+    expect(result?.hitCount).toBeGreaterThanOrEqual(2);
+  });
+
+  test("stale-fact demotion scenario fails against its committed fixture (superseded pricing still present)", () => {
+    const scenario = requireScenario("mem-retrieval-stale-fact-demotion");
+
+    const score = scoreRetrieval(scenario, {
+      scenariosPath: SCENARIOS_PATH,
+    });
+    // Negative test: the committed fixture intentionally still contains the
+    // superseded item to prove the forbidden-hit check is active, so scoring
+    // this scenario against that fixture must fail. Swap the fixture for an
+    // actual retrieval payload and a correctly-functioning dream pass would
+    // have demoted the old pricing out of the top-k, letting the scenario
+    // pass.
+    expect(score?.source).toBe("fixture");
+    expect(score?.forbiddenHits).toBeGreaterThan(0);
+    expect(score?.passed).toBe(false);
+  });
+
+  test("scope-filter scenario passes against its in-scope-only fixture", () => {
+    // Expect no forbidden hits and an overall pass.
+    const options = { scenariosPath: SCENARIOS_PATH };
+    const scopeFilter = requireScenario("mem-retrieval-scope-filter-project");
+    const result = scoreRetrieval(scopeFilter, options);
+
+    expect(result?.forbiddenHits).toBe(0);
+    expect(result?.passed).toBe(true);
+  });
+
+  test("cascading-expiry scenario passes when the entity's facts are gone but adjacent facts remain", () => {
+    // Expect no forbidden hits and an overall pass.
+    const options = { scenariosPath: SCENARIOS_PATH };
+    const cascadingExpiry = requireScenario("mem-retrieval-cascading-expiry");
+    const result = scoreRetrieval(cascadingExpiry, options);
+
+    expect(result?.forbiddenHits).toBe(0);
+    expect(result?.passed).toBe(true);
+  });
+});
diff --git a/tests/unit/runner.test.ts b/tests/unit/runner.test.ts
index 05c0fa6..bc27ee3 100644
--- a/tests/unit/runner.test.ts
+++ b/tests/unit/runner.test.ts
@@ -1642,4 +1642,111 @@ describe("runner", () => {
// Judge must NOT have been called.
expect(client.calls).toHaveLength(0);
});
+
+  test("runScenario scores retrieval from rawExchange and surfaces it on the result", async () => {
+    // The adapter reply carries the retrieved items on `rawExchange.retrieved`;
+    // the scenario sets no retrieval source, and the score must report "raw_exchange".
+    const adapter = new FakeAdapter([
+      adapterReply("Here are the relevant memories.", {
+        rawExchange: {
+          retrieved: ["Sarah's email", "Atlas project status"],
+        } as unknown as Record<string, unknown>,
+      }),
+    ]);
+    const client = new FakeResponsesClient([
+      buildPersonaStep("completed"),
+      buildScore(),
+    ]);
+
+    const scenario = buildScenario({
+      turns: [
+        {
+          role: "user",
+          content: "What do you remember about Sarah?",
+          useExactMessage: true,
+          attachments: [],
+        },
+      ],
+    });
+    scenario.retrieval = {
+      golden: ["Sarah's email", "Atlas project status"],
+      forbidden: [],
+      weights: {
+        precision_at_k: 1,
+        recall_at_k: 1,
+        mrr: 1,
+        ndcg_at_k: 1,
+      },
+      passThreshold: 0.5,
+      match: "substring",
+      k: 2,
+    };
+
+    const result = await runScenario(
+      adapter,
+      scenario,
+      buildPersona(),
+      buildRubric(),
+      { client: asResponsesClient(client) as never },
+    );
+
+    expect(result.retrievalScore).toBeDefined();
+    expect(result.retrievalScore?.passed).toBe(true);
+    expect(result.retrievalScore?.hitCount).toBe(2);
+    expect(result.retrievalScore?.source).toBe("raw_exchange");
+    expect(result.passed).toBe(true);
+  });
+
+  test("runScenario fails when retrieval contains a forbidden hit, even if the judge passes", async () => {
+    // The judge is scripted with score 5 and is asserted to pass, but the retrieved list
+    // contains the forbidden "$50K" figure, so the retrieval score and overall result must fail.
+    const adapter = new FakeAdapter([
+      adapterReply("Sure thing.", {
+        rawExchange: {
+          retrieved: ["I do not have that on file", "Old budget was $50K"],
+        } as unknown as Record<string, unknown>,
+      }),
+    ]);
+    const client = new FakeResponsesClient([
+      buildPersonaStep("completed"),
+      buildScore({ score: 5 }),
+    ]);
+
+    const scenario = buildScenario({
+      turns: [
+        {
+          role: "user",
+          content: "What's our Q2 budget?",
+          useExactMessage: true,
+          attachments: [],
+        },
+      ],
+    });
+    scenario.retrieval = {
+      golden: ["I do not have that"],
+      forbidden: ["$50K"],
+      weights: {
+        precision_at_k: 1,
+        recall_at_k: 1,
+        mrr: 1,
+        ndcg_at_k: 1,
+      },
+      passThreshold: 0.2,
+      match: "substring",
+      k: 5,
+    };
+
+    const result = await runScenario(
+      adapter,
+      scenario,
+      buildPersona(),
+      buildRubric(),
+      { client: asResponsesClient(client) as never },
+    );
+
+    expect(result.judgeScore?.passed).toBe(true);
+    expect(result.retrievalScore?.forbiddenHits).toBe(1);
+    expect(result.retrievalScore?.passed).toBe(false);
+    expect(result.passed).toBe(false);
+  });
});
diff --git a/tests/unit/server/default-presets.test.ts b/tests/unit/server/default-presets.test.ts
index 0e2790e..d3f975c 100644
--- a/tests/unit/server/default-presets.test.ts
+++ b/tests/unit/server/default-presets.test.ts
@@ -5,6 +5,7 @@ import { join } from "node:path";
import { SqliteRepository } from "../../../src/providers/persistence/sqlite-backend.ts";
import { SuiteController } from "../../../src/runtime/server/controllers/suite-controller.ts";
import {
+ FULL_MEMORY_DEFAULT_PRESET,
PRE_RELEASE_DEFAULT_PRESET,
seedDefaultPresets,
} from "../../../src/runtime/server/default-presets.ts";
@@ -25,14 +26,23 @@ describe("default preset seeding", () => {
repository,
suiteController,
});
- expect(results[0]).toMatchObject({
+ const preReleaseResult = results.find(
+ (r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name,
+ );
+ expect(preReleaseResult).toMatchObject({
name: PRE_RELEASE_DEFAULT_PRESET.name,
status: "created",
});
+ expect(
+ results.find((r) => r.name === FULL_MEMORY_DEFAULT_PRESET.name),
+ ).toMatchObject({
+ name: FULL_MEMORY_DEFAULT_PRESET.name,
+ status: "created",
+ });
const presets = await repository.listPresets();
- expect(presets).toHaveLength(1);
- const preset = presets[0];
+ expect(presets).toHaveLength(2);
+ const preset = presets.find((p) => p.name === "Pre Release Checks");
expect(preset).toMatchObject({
name: "Pre Release Checks",
description: null,
@@ -44,17 +54,31 @@ describe("default preset seeding", () => {
dryRun: false,
});
expect(preset?.selection).toEqual(PRE_RELEASE_DEFAULT_PRESET.selection);
+ const memoryPreset = presets.find(
+ (p) => p.name === FULL_MEMORY_DEFAULT_PRESET.name,
+ );
+ expect(memoryPreset?.selection).toEqual(
+ FULL_MEMORY_DEFAULT_PRESET.selection,
+ );
const secondPass = await seedDefaultPresets({
repository,
suiteController,
});
- expect(secondPass[0]).toMatchObject({
+ expect(
+ secondPass.find((r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name),
+ ).toMatchObject({
name: PRE_RELEASE_DEFAULT_PRESET.name,
status: "existing",
presetId: preset?.id,
});
- expect(await repository.listPresets()).toHaveLength(1);
+ expect(
+ secondPass.find((r) => r.name === FULL_MEMORY_DEFAULT_PRESET.name),
+ ).toMatchObject({
+ name: FULL_MEMORY_DEFAULT_PRESET.name,
+ status: "existing",
+ });
+ expect(await repository.listPresets()).toHaveLength(2);
});
test("restores a soft-deleted default preset by name", async () => {
@@ -62,24 +86,31 @@ describe("default preset seeding", () => {
await repository.initialize();
const suiteController = new SuiteController({ dataPath: DATA_DIR });
await seedDefaultPresets({ repository, suiteController });
- const seeded = (await repository.listPresets())[0];
+ const seeded = (await repository.listPresets()).find(
+ (p) => p.name === PRE_RELEASE_DEFAULT_PRESET.name,
+ );
expect(seeded).toBeDefined();
await repository.softDeletePreset(seeded?.id ?? "");
- expect(await repository.listPresets()).toHaveLength(0);
+ expect(await repository.listPresets()).toHaveLength(1);
const results = await seedDefaultPresets({
repository,
suiteController,
});
- expect(results[0]).toMatchObject({
+ expect(
+ results.find((r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name),
+ ).toMatchObject({
name: PRE_RELEASE_DEFAULT_PRESET.name,
status: "restored",
presetId: seeded?.id,
});
const presets = await repository.listPresets();
- expect(presets).toHaveLength(1);
- expect(presets[0]?.deletedAt ?? null).toBeNull();
+ expect(presets).toHaveLength(2);
+ expect(
+ presets.find((p) => p.name === PRE_RELEASE_DEFAULT_PRESET.name)
+ ?.deletedAt ?? null,
+ ).toBeNull();
});
test("skips seeding when the data root does not include packaged default files", async () => {
@@ -95,10 +126,18 @@ describe("default preset seeding", () => {
repository,
suiteController: new SuiteController({ dataPath }),
});
- expect(results[0]).toMatchObject({
+ expect(
+ results.find((r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name),
+ ).toMatchObject({
name: PRE_RELEASE_DEFAULT_PRESET.name,
status: "skipped",
});
+ expect(
+ results.find((r) => r.name === FULL_MEMORY_DEFAULT_PRESET.name),
+ ).toMatchObject({
+ name: FULL_MEMORY_DEFAULT_PRESET.name,
+ status: "skipped",
+ });
expect(await repository.listPresets()).toHaveLength(0);
});
});