diff --git a/dashboard/dist/index.html b/dashboard/dist/index.html index efac6bf..19d87d5 100644 --- a/dashboard/dist/index.html +++ b/dashboard/dist/index.html @@ -24,7 +24,7 @@ } })(); - - +`).trim()}function Xme({turn:e}){const t=e.parts??[],n=t.length>0,{reasoning:r,response:i}=n?Yme(t):{reasoning:[],response:[]},l=n?null:(()=>{const{reasoning:d,body:g}=Ume(e.content??"");return{reasoning:d,body:g,tools:e.tool_calls??[]}})(),s=n?AP(i):null,u=n&&r.length>0?AP(r):null,f=n?Wme(i.length>0?i:t):l?.body??e.content??"";return p.jsxs(G4,{from:"assistant",children:[p.jsxs(W4,{children:[n?p.jsxs(p.Fragment,{children:[u&&p.jsx(Dme,{count:r.length,children:p.jsx(CP,{segments:u})}),s&&p.jsx(CP,{segments:s})]}):l&&p.jsxs(p.Fragment,{children:[l.reasoning&&p.jsx(DS,{children:p.jsx(Af,{className:"text-xs text-muted-foreground",children:l.reasoning})}),l.body&&p.jsx("div",{className:"text-[1rem] leading-relaxed text-foreground",children:eg(l.body)}),l.tools.length>0&&p.jsx("div",{className:"flex flex-col gap-1.5",children:l.tools.map((d,g)=>p.jsx(J4,{name:d.name,input:d.args},g))})]}),p.jsx(Kme,{checkpoints:e.checkpoints??[]})]}),p.jsxs("div",{className:"flex items-center justify-between gap-2",children:[p.jsx(Z4,{turn:e}),f.length>0&&p.jsx(Pme,{children:p.jsx(Rme,{text:f})})]})]})}function zS({detail:e}){const t=Ime(e);return p.jsx("div",{className:xe("flex flex-col gap-6 px-1 py-2"),children:t.map((n,r)=>$me(n)?p.jsx(Fme,{turn:n},r):n.role==="user"?p.jsx(qme,{turn:n},r):n.role==="assistant"?p.jsx(Xme,{turn:n},r):p.jsx(Hme,{turn:n},r))})}function _P(e){return Number.isFinite(e)?e.toFixed(3).replace(/0+$/,"").replace(/\.$/,""):"n/a"}function kP(e){return Number.isFinite(e)?`${Math.round(e*100)}%`:"n/a"}function Qme({value:e}){const t=Math.max(0,Math.min(1,e));return p.jsx("div",{className:"h-1.5 w-full overflow-hidden rounded-full bg-muted",children:p.jsx("div",{className:"h-full rounded-full bg-foreground/70",style:{width:`${t*100}%`}})})}function Zme({passed:e}){return 
e?p.jsxs("span",{className:"inline-flex items-center gap-1 rounded-full border border-emerald-500/40 bg-emerald-500/10 px-2 py-0.5 text-[10px] font-semibold uppercase tracking-wider text-emerald-700 dark:text-emerald-400",children:[p.jsx(g1,{size:11,strokeWidth:2.5}),"Pass"]}):p.jsxs("span",{className:"inline-flex items-center gap-1 rounded-full border border-rose-500/40 bg-rose-500/10 px-2 py-0.5 text-[10px] font-semibold uppercase tracking-wider text-rose-700 dark:text-rose-400",children:[p.jsx(EM,{size:11,strokeWidth:2.5}),"Fail"]})}/* Jme: section heading made of the label (children), an optional count that is rendered only when it is not null or undefined, and a divider rule. */function Jme({children:e,count:t}){return p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("div",{className:"text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t!=null&&p.jsx("div",{className:"font-mono text-[10px] text-muted-foreground/70",children:t}),p.jsx("div",{className:"h-px flex-1 bg-border"})]})}/* tg: one metric row showing m.metric, a Qme bar for m.value, the value formatted by _P, and the weight formatted by _P after a multiplication sign. */function tg({m:e}){return p.jsxs("div",{className:"flex items-center gap-3 rounded-md border border-border bg-card/40 px-3 py-2",children:[p.jsx("div",{className:"min-w-[150px] font-mono text-[11px] text-foreground",children:e.metric}),p.jsx("div",{className:"flex-1",children:p.jsx(Qme,{value:e.value})}),p.jsx("div",{className:"min-w-[48px] text-right font-mono text-[11px] text-foreground",children:_P(e.value)}),p.jsxs("div",{className:"min-w-[36px] text-right font-mono text-[10px] text-muted-foreground",children:["×",_P(e.weight)]})]})}/* ng: header card for a score section: icon, title, optional subtitle (skipped when falsy), weighted score over pass threshold as percentages (kP), the source label and the Zme pill. */function ng({icon:e,title:t,subtitle:n,weightedScore:r,passThreshold:i,passed:l,source:s}){return p.jsxs("header",{className:"flex items-center justify-between gap-4 rounded-md border border-border bg-muted/30 px-3 py-2",children:[p.jsxs("div",{className:"flex items-center gap-2.5",children:[p.jsx("div",{className:"flex h-7 w-7 items-center justify-center rounded-md border border-border bg-card text-muted-foreground",children:e}),p.jsxs("div",{className:"flex flex-col",children:[p.jsx("div",{className:"text-sm font-semibold 
text-foreground",children:t}),n&&p.jsx("div",{className:"text-[11px] text-muted-foreground",children:n})]})]}),p.jsxs("div",{className:"flex items-center gap-3 text-[11px]",children:[p.jsxs("div",{className:"flex flex-col items-end leading-tight",children:[p.jsxs("div",{className:"font-mono text-foreground",children:[kP(r),p.jsxs("span",{className:"text-muted-foreground",children:[" ","/ ",kP(i)]})]}),p.jsxs("div",{className:"text-[10px] text-muted-foreground",children:["source: ",s]})]}),p.jsx(Zme,{passed:l})]})]})}/* rg: summary for a metrics list, read from the FIRST entry only (weighted_score, pass_threshold, passed, source). An empty list yields zeros, passed false and source missing. */function rg(e){if(e.length===0)return{weightedScore:0,passThreshold:0,passed:!1,source:"missing"};const t=e[0];return{weightedScore:t.weighted_score,passThreshold:t.pass_threshold,passed:t.passed,source:t.source}}/* ege: Retrieval ranking section; returns null for an empty metrics list. The subtitle uses k, hit_count, total_relevant and forbidden_hits of the first metric, with n/a or 0 as fallbacks. One tg row is rendered per metric. */function ege({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0];return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(AM,{size:14,strokeWidth:2.25}),title:"Retrieval ranking",subtitle:`k=${n?.k??"n/a"}, ${n?.hit_count??0}/${n?.total_relevant??0} hits, ${n?.forbidden_hits??0} forbidden`,weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((r,i)=>p.jsx(tg,{m:r},`${r.metric}-${i}`))})]})}/* tge: Demotion correctness section; returns null for an empty metrics list. The subtitle lists timestamp_violation_count of the first metric and, only when cascade_bounded is strictly true or strictly false, a bounded or RUNAWAY marker. */function tge({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0],r=[];return n&&(r.push(`timestamp violations: ${n.timestamp_violation_count}`),n.cascade_bounded===!0?r.push("cascade: bounded"):n.cascade_bounded===!1&&r.push("cascade: RUNAWAY")),p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(Z8,{size:14,strokeWidth:2.25}),title:"Demotion correctness",subtitle:r.join(" · "),weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((i,l)=>p.jsx(tg,{m:i},`${i.metric}-${l}`))})]})}/* nge: Procedure extraction section; returns null for an empty metrics list. predicted and golden come from the first metric and default to empty arrays when they are not arrays; the side-by-side lists are shown only when at least one of them is non-empty. */function nge({metrics:e}){if(e.length===0)return null;const 
t=rg(e),n=e[0],r=Array.isArray(n?.predicted)?n.predicted:[],i=Array.isArray(n?.golden)?n.golden:[];return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(q8,{size:14,strokeWidth:2.25}),title:"Procedure extraction",subtitle:`predicted ${r.length} steps · golden ${i.length} steps`,weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((l,s)=>p.jsx(tg,{m:l},`${l.metric}-${s}`))}),(r.length>0||i.length>0)&&p.jsxs("div",{className:"grid grid-cols-1 gap-3 sm:grid-cols-2",children:[p.jsx(NP,{title:"Predicted",items:r}),p.jsx(NP,{title:"Golden",items:i})]})]})}/* NP: titled ordered list of items with 1-based numbering; renders an (empty) placeholder when items has length 0. */function NP({title:e,items:t}){return p.jsxs("div",{className:"rounded-md border border-border bg-background p-2.5",children:[p.jsx("div",{className:"mb-1.5 text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t.length===0?p.jsx("div",{className:"text-[11px] text-muted-foreground",children:"(empty)"}):p.jsx("ol",{className:"flex flex-col gap-1 font-mono text-[11px] text-foreground",children:t.map((n,r)=>p.jsxs("li",{className:"flex gap-2",children:[p.jsxs("span",{className:"text-muted-foreground",children:[r+1,"."]}),p.jsx("span",{children:n})]},`${e}-${r}`))})]})}/* rge: Deduplication section; returns null for an empty metrics list. predicted and golden come from the first metric and default to empty arrays when they are not arrays; item_count falls back to 0. */function rge({metrics:e}){if(e.length===0)return null;const t=rg(e),n=e[0],r=Array.isArray(n?.predicted)?n.predicted:[],i=Array.isArray(n?.golden)?n.golden:[];return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(ng,{icon:p.jsx(dF,{size:14,strokeWidth:2.25}),title:"Deduplication",subtitle:`items: ${n?.item_count??0} · predicted ${r.length} clusters · golden ${i.length} clusters`,weightedScore:t.weightedScore,passThreshold:t.passThreshold,passed:t.passed,source:t.source}),p.jsx("div",{className:"flex flex-col gap-1.5",children:e.map((l,s)=>p.jsx(tg,{m:l},`${l.metric}-${s}`))}),(r.length>0||i.length>0)&&p.jsxs("div",{className:"grid grid-cols-1 gap-3 
sm:grid-cols-2",children:[p.jsx(TP,{title:"Predicted",clusters:r}),p.jsx(TP,{title:"Golden",clusters:i})]})]})}/* TP: titled list of clusters; each cluster is rendered by calling join on it with a comma and a space, so every entry is assumed to be an array (TODO confirm against the data producer). Renders an (empty) placeholder when clusters has length 0. */function TP({title:e,clusters:t}){return p.jsxs("div",{className:"rounded-md border border-border bg-background p-2.5",children:[p.jsx("div",{className:"mb-1.5 text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t.length===0?p.jsx("div",{className:"text-[11px] text-muted-foreground",children:"(empty)"}):p.jsx("div",{className:"flex flex-col gap-1.5",children:t.map((n,r)=>p.jsx("div",{className:xe("rounded border border-border bg-muted/30 px-2 py-1 font-mono text-[11px] leading-relaxed text-foreground"),children:n.join(", ")},`${e}-${r}`))})]})}/* e5: Eval scores tab. Adds up the lengths of the four score lists on detail (each defaulting to an empty array); a total of 0 renders an empty-state message, otherwise a heading carrying that total followed by the four sections. */function e5({detail:e}){const t=e.retrieval_scores??[],n=e.demotion_scores??[],r=e.procedure_scores??[],i=e.dedup_scores??[],l=t.length+n.length+r.length+i.length;return l===0?p.jsxs("div",{className:"flex flex-col items-center justify-center gap-2 rounded-md border border-dashed border-border px-4 py-12 text-center",children:[p.jsx(AM,{size:18,strokeWidth:2,className:"text-muted-foreground"}),p.jsx("div",{className:"text-sm font-medium text-foreground",children:"No quantitative eval scores"}),p.jsx("p",{className:"max-w-md text-[12px] text-muted-foreground",children:"This scenario didn't declare a retrieval, demotion, procedure, or dedup block. 
Add one to its YAML to get IR-style metrics here."})]}):p.jsxs("div",{className:"flex flex-col gap-5",children:[p.jsx(Jme,{count:l,children:"Eval scores"}),p.jsx(ege,{metrics:t}),p.jsx(tge,{metrics:n}),p.jsx(nge,{metrics:r}),p.jsx(rge,{metrics:i})]})}/* t5: true when detail carries at least one retrieval, demotion, procedure or dedup score. */function t5(e){return(e.retrieval_scores?.length??0)>0||(e.demotion_scores?.length??0)>0||(e.procedure_scores?.length??0)>0||(e.dedup_scores?.length??0)>0}/* ige: verdict string for a detail. pass when passed is truthy; unknown when status is error, or when overall_score is null or undefined and passed is exactly undefined; fail in every other case (including passed being null). */function ige(e){return e.passed?"pass":e.status==="error"||e.overall_score==null&&e.passed===void 0?"unknown":"fail"}/* n5: section heading with an optional count and a divider rule; same structure as Jme, differing only in the order of class names on the count. */function n5({children:e,count:t}){return p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("div",{className:"text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground",children:e}),t!=null&&p.jsx("div",{className:"text-[10px] font-mono text-muted-foreground/70",children:t}),p.jsx("div",{className:"h-px flex-1 bg-border"})]})}/* age: verdict pill: Pass for pass, Fail for fail, Unknown for any other value. */function age({verdict:e}){return e==="pass"?p.jsxs("span",{className:"inline-flex items-center gap-1.5 rounded-full border border-emerald-500/40 bg-emerald-500/10 px-2.5 py-1 text-xs font-semibold uppercase tracking-wider text-emerald-700 dark:text-emerald-400",children:[p.jsx(g1,{size:14,strokeWidth:2.5}),"Pass"]}):e==="fail"?p.jsxs("span",{className:"inline-flex items-center gap-1.5 rounded-full border border-rose-500/40 bg-rose-500/10 px-2.5 py-1 text-xs font-semibold uppercase tracking-wider text-rose-700 dark:text-rose-400",children:[p.jsx(EM,{size:14,strokeWidth:2.5}),"Fail"]}):p.jsxs("span",{className:"inline-flex items-center gap-1.5 rounded-full border border-border bg-muted px-2.5 py-1 text-xs font-semibold uppercase tracking-wider text-muted-foreground",children:[p.jsx(_8,{size:14,strokeWidth:2.5}),"Unknown"]})}/* lge: Verdict section. Shows overall_score to 2 decimals (a dash when null or undefined), the threshold when present, the verdict pill from ige, a bar whose fill width is Bp(overall_score) percent with a marker at the threshold (rounded and clamped to 0..100), and the judge provider, model and temperature when judge.model is set. */function lge({detail:e}){const t=ige(e),n=e.overall_score,r=e.pass_threshold,i=Bp(n),l=r!=null?Math.max(0,Math.min(100,Math.round(r*100))):null,s=t==="pass"?"bg-emerald-500":t==="fail"?"bg-rose-500":"bg-muted-foreground/40";return p.jsxs("section",{className:"flex flex-col 
gap-3",children:[p.jsxs("div",{className:"flex items-center gap-2 text-[10px] font-semibold uppercase tracking-[0.18em] text-muted-foreground",children:[p.jsx(z8,{size:12,strokeWidth:2.5}),p.jsx("span",{children:"Verdict"})]}),p.jsxs("div",{className:"flex items-end justify-between gap-4",children:[p.jsxs("div",{className:"flex items-baseline gap-3",children:[p.jsx("div",{className:xe("font-mono text-4xl font-semibold tabular-nums leading-none tracking-tight",t==="pass"&&"text-emerald-700 dark:text-emerald-400",t==="fail"&&"text-rose-700 dark:text-rose-400",t==="unknown"&&"text-muted-foreground"),children:n!=null?n.toFixed(2):"—"}),r!=null&&p.jsxs("div",{className:"font-mono text-xs text-muted-foreground",children:["/ threshold ",r.toFixed(2)]})]}),p.jsx(age,{verdict:t})]}),p.jsxs("div",{className:"relative",children:[p.jsx("div",{className:"h-1.5 w-full overflow-hidden rounded-full bg-muted",children:p.jsx("div",{className:xe("h-full rounded-full transition-[width]",s),style:{width:`${i}%`}})}),l!=null&&p.jsx("div",{className:"absolute top-[-3px] h-3 w-px bg-foreground/60",style:{left:`${l}%`},"aria-hidden":!0,title:`Pass threshold ${r?.toFixed(2)}`})]}),e.judge?.model&&p.jsxs("div",{className:"flex flex-wrap items-center gap-1.5 text-[11px] text-muted-foreground",children:[p.jsx("span",{className:"text-muted-foreground/70",children:"Judged by"}),p.jsxs("span",{className:"font-mono text-foreground",children:[e.judge.provider?`${e.judge.provider} · `:"",e.judge.model]}),e.judge.temperature!=null&&p.jsxs("span",{className:"font-mono text-muted-foreground/80",children:["· t=",e.judge.temperature]})]})]})}/* oge: Notes section that renders the notes through Af inside a blockquote. */function oge({notes:e}){return p.jsxs("section",{className:"flex flex-col gap-2",children:[p.jsx(n5,{children:"Notes"}),p.jsxs("blockquote",{className:"relative rounded-md border-l-2 border-foreground/40 bg-muted/40 px-4 py-3 text-sm leading-relaxed text-foreground",children:[p.jsx(nF,{className:"absolute -left-px -top-2 h-3 w-3 -translate-x-1/2 rotate-180 
text-muted-foreground/40",strokeWidth:2.5,"aria-hidden":!0}),p.jsx(Af,{children:e})]})]})}/* sge: text, bar and border classes for a dimension, chosen by normalized_score: null or undefined gives muted, 0.85 and above emerald, 0.6 and above neutral foreground, 0.4 and above amber, anything lower rose. */function sge(e){const t=e.normalized_score;return t==null?{text:"text-muted-foreground",bar:"bg-muted-foreground/40",border:"border-border"}:t>=.85?{text:"text-emerald-700 dark:text-emerald-400",bar:"bg-emerald-500",border:"border-emerald-500/30"}:t>=.6?{text:"text-foreground",bar:"bg-foreground/70",border:"border-border"}:t>=.4?{text:"text-amber-700 dark:text-amber-400",bar:"bg-amber-500",border:"border-amber-500/30"}:{text:"text-rose-700 dark:text-rose-400",bar:"bg-rose-500",border:"border-rose-500/30"}}/* uge: card for one judge dimension: name, id, raw score (followed by a slash and scale_points when present, or a dash when raw_score is missing), weight badge, a bar and percentage from Bp(normalized_score), and a DS block labelled Reasoning that holds the reasoning text and evidence entries when either exists. */function uge({d:e}){const t=Bp(e.normalized_score),n=sge(e),r=e.raw_score!=null?`${e.raw_score}${e.scale_points!=null?`/${e.scale_points}`:""}`:"—",i=e.evidence??[],l=!!e.reasoning?.trim(),s=i.length>0;return p.jsxs("article",{className:xe("flex flex-col gap-3 rounded-lg border bg-background/50 px-4 py-3",n.border),children:[p.jsxs("header",{className:"flex items-start justify-between gap-3",children:[p.jsxs("div",{className:"min-w-0",children:[p.jsx("h3",{className:"truncate text-sm font-medium text-foreground",children:e.dimension_name}),p.jsx("p",{className:"truncate font-mono text-[11px] text-muted-foreground",children:e.dimension_id})]}),p.jsxs("div",{className:"flex shrink-0 items-baseline gap-2 text-right",children:[p.jsx("span",{className:xe("font-mono text-base font-semibold tabular-nums leading-none",n.text),children:r}),e.weight!=null&&p.jsxs("span",{className:"inline-flex items-center gap-1 rounded-sm border border-border bg-muted/60 px-1.5 py-0.5 text-[10px] font-mono text-muted-foreground",children:[p.jsx(lF,{size:10,strokeWidth:2.5}),"×",e.weight]})]})]}),p.jsxs("div",{className:"flex items-center gap-3",children:[p.jsx("div",{className:"h-1 flex-1 overflow-hidden rounded-full bg-muted",children:p.jsx("div",{className:xe("h-full rounded-full transition-[width]",n.bar),style:{width:`${t}%`}})}),p.jsxs("span",{className:"w-9 shrink-0 text-right font-mono 
text-[10px] tabular-nums text-muted-foreground",children:[t,"%"]})]}),(l||s)&&p.jsxs(DS,{label:"Reasoning",children:[l&&p.jsx(Af,{className:"text-xs text-muted-foreground",children:e.reasoning}),s&&p.jsxs("div",{className:"mt-2 flex flex-col gap-1.5",children:[p.jsx("div",{className:"text-[10px] font-semibold uppercase tracking-[0.16em] text-muted-foreground/70",children:"Evidence"}),i.map((u,f)=>p.jsx("div",{className:"rounded border-l-2 border-border bg-muted/30 px-2 py-1 font-mono text-[11px] leading-relaxed text-muted-foreground",children:u},f))]})]})]})}/* r5: Rubric tab: the verdict section, overall notes when present, the judge dimensions sorted by weight in descending order (a missing weight counts as 0; the sort runs on a copy so detail is not mutated), and the raw judge output pretty-printed as JSON when present. */function r5({detail:e}){const t=[...e.judge_dimension_scores??[]].sort((i,l)=>(l.weight??0)-(i.weight??0)),n=e.judge?.overall_notes,r=e.judge?.output;return p.jsxs("div",{className:"flex flex-col gap-5",children:[p.jsx(lge,{detail:e}),n&&p.jsx(oge,{notes:n}),p.jsxs("section",{className:"flex flex-col gap-3",children:[p.jsx(n5,{count:t.length||void 0,children:"Dimensions"}),t.length>0?p.jsx("div",{className:"flex flex-col gap-2.5",children:t.map((i,l)=>p.jsx(uge,{d:i},l))}):p.jsx("p",{className:"text-sm text-muted-foreground",children:"No rubric dimensions recorded."})]}),r&&p.jsx(X4,{icon:p.jsx(OM,{size:14,strokeWidth:2.25}),title:"Raw judge output",description:"Full structured response from the judge model",children:p.jsx("pre",{className:"overflow-x-auto whitespace-pre-wrap break-words rounded-md border border-border bg-background p-3 font-mono text-[11px] leading-relaxed text-foreground",children:JSON.stringify(r,null,2)})})]})}/* cge: detail overlay for one scenario with Conversation and Rubric tabs plus an Eval scores tab that is offered only when t5(detail) is true. The header shows the score (three dots while running, n/a when missing), the threshold, a status of RUNNING, PASS or FAIL, and failure_mode_detected from judge.output when that is a non-empty string. NOTE(review): the Escape handler is an onKeyDown on the backdrop div, which is given no tabIndex here, so it presumably never receives key events - confirm and consider a window-level listener. */function cge({detail:e,onClose:t}){const n=t5(e),[r,i]=x.useState("conversation"),l=e.status==="running",s=e.overall_score!=null?e.overall_score.toFixed(2):l?"...":"n/a",u=e.pass_threshold!=null?e.pass_threshold.toFixed(2):"n/a",f=l?"RUNNING":e.passed?"PASS":"FAIL",d=l?"detail-running":e.passed?"detail-pass":"detail-fail",g=typeof e.judge?.output=="object"&&e.judge?.output!=null?e.judge.output.failure_mode_detected:null;return 
p.jsxs(p.Fragment,{children:[p.jsx("div",{className:"detail-backdrop open",role:"presentation",onClick:t,onKeyDown:h=>{h.key==="Escape"&&t()}}),p.jsx("div",{className:"detail-overlay open",children:p.jsxs("div",{className:"detail-panel",children:[p.jsxs("div",{className:"detail-top",children:[p.jsx("button",{type:"button",className:"detail-close",onClick:t,children:"×"}),p.jsxs("div",{className:`detail-score-header ${d}`,children:[p.jsxs("div",{className:"detail-title-block",children:[p.jsxs("div",{className:"detail-name",children:[e.scenario_name,l&&p.jsxs("span",{className:"live-badge",style:{marginLeft:12,verticalAlign:"middle"},children:[p.jsx("span",{className:"live-dot"})," LIVE"]})]}),p.jsxs("div",{className:"detail-sid",children:[e.scenario_id,e.user_id?` / ${e.user_id}`:""]})]}),p.jsxs("div",{className:"detail-score-block",children:[p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Score"}),p.jsx("div",{className:"detail-score-value",children:s})]}),p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Threshold"}),p.jsx("div",{className:"detail-score-value",children:u})]}),p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Status"}),p.jsx("div",{className:"detail-score-value",children:f})]}),typeof g=="string"&&g&&p.jsxs("div",{className:"detail-score-group",children:[p.jsx("div",{className:"detail-score-label",children:"Failure"}),p.jsx("div",{className:"detail-score-value",children:g})]})]}),p.jsx("div",{className:"detail-bar",children:p.jsx("div",{className:"detail-bar-fill",style:{width:`${Bp(e.overall_score)}%`}})})]}),p.jsxs("div",{className:"detail-tabs",children:[p.jsx("button",{type:"button",className:`tab-btn${r==="conversation"?" tab-active":""}`,onClick:()=>i("conversation"),children:"Conversation"}),p.jsx("button",{type:"button",className:`tab-btn${r==="rubric"?" 
tab-active":""}`,onClick:()=>i("rubric"),children:"Rubric"}),n&&p.jsx("button",{type:"button",className:`tab-btn${r==="evals"?" tab-active":""}`,onClick:()=>i("evals"),children:"Eval scores"})]})]}),p.jsxs("div",{className:"detail-body",children:[r==="conversation"&&p.jsx(zS,{detail:e}),r==="rubric"&&p.jsx(r5,{detail:e}),r==="evals"&&p.jsx(e5,{detail:e})]})]})})]})}/* i5: stacked progress bar with pass, fail and running segments, each sized as a percentage of data.total; a total of 0 is replaced by 1 so the divisions stay finite. */function i5({data:e}){const t=e.total||1,n=e.passed/t*100,r=e.failed/t*100,i=e.running/t*100;return p.jsxs("div",{className:"progress-bar",style:{display:"flex"},children:[p.jsx("div",{className:"progress-fill progress-pass",style:{width:`${n}%`}}),p.jsx("div",{className:"progress-fill progress-fail",style:{width:`${r}%`}}),p.jsx("div",{className:"progress-fill progress-running",style:{width:`${i}%`}})]})}/* fge: display labels for the known scenario statuses; a5 upper-cases any status missing from this map. */const fge={pending:"PENDING",running:"RUNNING",pass:"PASS",fail:"FAIL",error:"ERROR"};/* dge: duration cell. Renders a dash without started_at; finished_at minus started_at to one decimal once finished; otherwise whole seconds since started_at. While status is running a 1 s interval refreshes the clock, and it is cleared on cleanup or when the scenario stops running. started_at is subtracted from Date.now()/1000, i.e. it is handled as epoch seconds. */function dge({scenario:e}){const[t,n]=x.useState(Date.now()),r=x.useRef(void 0);if(x.useEffect(()=>{if(e.status==="running"&&e.started_at!=null)return r.current=setInterval(()=>n(Date.now()),1e3),()=>clearInterval(r.current);clearInterval(r.current)},[e.status,e.started_at]),e.started_at==null)return p.jsx(p.Fragment,{children:"-"});if(e.finished_at!=null)return p.jsxs(p.Fragment,{children:[(e.finished_at-e.started_at).toFixed(1),"s"]});const i=t/1e3-e.started_at;return p.jsx(p.Fragment,{children:i>0?`${i.toFixed(0)}s`:"-"})}/* a5: scenarios table. A row is clickable, calling onSelect with the row index, only when that index is a key of data.details; with a runId the id cell of such a row becomes a link to /runs/<runId>/scenarios/<index> whose click does not bubble to the row. Error text is cut to 60 characters, with the full text in the title attribute. */function a5({data:e,runId:t,onSelect:n}){const r=i=>t?`/runs/${encodeURIComponent(t)}/scenarios/${i}`:null;return p.jsxs(p.Fragment,{children:[p.jsxs("div",{className:"section-title",children:["Scenarios"," ",p.jsx("span",{style:{color:"var(--muted)",fontWeight:400,fontSize:12},children:"(click a row to view conversation & 
rubric)"})]}),p.jsxs("table",{children:[p.jsx("thead",{children:p.jsxs("tr",{children:[p.jsx("th",{children:"ID"}),p.jsx("th",{children:"Name"}),p.jsx("th",{children:"Status"}),p.jsx("th",{style:{textAlign:"right"},children:"Score"}),p.jsx("th",{style:{textAlign:"right"},children:"Duration"}),p.jsx("th",{children:"Error"})]})}),p.jsx("tbody",{children:e.scenarios.map((i,l)=>{const s=l in e.details,u=r(l);return p.jsxs("tr",{className:`status-${i.status}${s?" clickable-row":""}`,onClick:s?()=>n(l):void 0,children:[p.jsx("td",{className:"id-cell",children:s&&u?p.jsx("a",{href:u,onClick:f=>f.stopPropagation(),style:{color:"inherit",textDecoration:"none"},children:i.scenario_id}):i.scenario_id}),p.jsx("td",{children:i.scenario_name??""}),p.jsx("td",{className:"status-badge",children:p.jsx("span",{children:fge[i.status]??i.status.toUpperCase()})}),p.jsx("td",{className:"score-cell",children:i.score!=null?i.score.toFixed(2):"-"}),p.jsx("td",{className:"duration-cell",children:p.jsx(dge,{scenario:i})}),p.jsx("td",{children:i.error&&p.jsx("span",{className:"error-text",title:i.error,children:i.error.slice(0,60)})})]},`${i.scenario_id}-${l}`)})})]})]})}/* hge: live elapsed seconds. When t (done) is truthy it returns e unchanged; otherwise it returns e plus the seconds measured locally with performance.now since the effect last started, refreshed every 500 ms. The local offset is reset to 0 whenever e or t changes. */function hge(e,t){const[n,r]=x.useState(0);return x.useEffect(()=>{if(t)return;const i=performance.now(),l=setInterval(()=>{r((performance.now()-i)/1e3)},500);return()=>{clearInterval(l),r(0)}},[e,t]),t?e:e+n}/* pge: formats a number of seconds as minutes and seconds, flooring both parts. */function pge(e){const t=Math.floor(e/60),n=Math.floor(e%60);return`${t}m ${n}s`}/* Vl: one stat tile: a value drawn in the given colour above its label. */function Vl({value:e,label:t,color:n}){return p.jsxs("div",{className:"stat",children:[p.jsx("div",{className:"stat-value",style:{color:n},children:e}),p.jsx("div",{className:"stat-label",children:t})]})}/* l5: stats strip with completed, passed, failed, errors, running, elapsed time (hge, formatted by pge) and progress percent, which is 0 when total is not positive. */function l5({data:e}){const t=hge(e.elapsed,e.all_done),n=e.total>0?Math.round(e.done/e.total*100):0;return 
p.jsxs("div",{className:"stats",children:[p.jsx(Vl,{value:`${e.done}/${e.total}`,label:"Completed",color:"var(--text)"}),p.jsx(Vl,{value:`${e.passed}`,label:"Passed",color:"var(--green)"}),p.jsx(Vl,{value:`${e.failed}`,label:"Failed",color:"var(--red)"}),p.jsx(Vl,{value:`${e.errored}`,label:"Errors",color:"var(--amber)"}),p.jsx(Vl,{value:`${e.running}`,label:"Running",color:"var(--blue)"}),p.jsx(Vl,{value:pge(t),label:"Elapsed",color:"var(--muted)"}),p.jsx(Vl,{value:`${n}%`,label:"Progress",color:"var(--indigo)"})]})}const o5="agentprobe:theme";function a1(){try{const e=window.localStorage.getItem(o5);if(e==="light"||e==="dark")return e}catch{}return null}function mge(){const e=a1();return e||(typeof window<"u"&&window.matchMedia?.("(prefers-color-scheme: dark)").matches?"dark":"light")}function gge(e){const t=document.documentElement;e==="dark"?t.classList.add("dark"):t.classList.remove("dark")}function vge(){const[e,t]=x.useState(()=>mge());return x.useEffect(()=>{gge(e);try{window.localStorage.setItem(o5,e)}catch{}},[e]),x.useEffect(()=>{if(a1())return;const r=window.matchMedia("(prefers-color-scheme: dark)"),i=l=>{a1()||t(l.matches?"dark":"light")};return r.addEventListener("change",i),()=>r.removeEventListener("change",i)},[]),{theme:e,setTheme:t,toggle:()=>t(n=>n==="dark"?"light":"dark")}}function yge(){const{theme:e,toggle:t}=vge(),n=e==="dark"?pF:X8;return p.jsx(fM,{variant:"ghost",size:"icon",onClick:t,"aria-label":e==="dark"?"Switch to light mode":"Switch to dark mode",title:e==="dark"?"Light mode":"Dark mode",className:"size-8",children:p.jsx(n,{className:"h-4 w-4"})})}const bge=2e3;function xge(){const[e,t]=x.useState(null),[n,r]=x.useState(null),i=x.useRef(!1);return x.useEffect(()=>{let l,s=!1;async function u(){try{const f=await fetch("/api/state");if(!f.ok)throw new Error(`HTTP ${f.status}`);const d=await f.json();if(s)return;t(d),r(null),i.current=d.all_done}catch(f){if(s)return;r(f instanceof Error?f.message:"Unknown error")}}return 
u(),l=setInterval(()=>{i.current||u()},bge),()=>{s=!0,clearInterval(l)}},[]),{data:e,error:n}}/* wge: Endpoint overrides page. Loads /api/suites and /api/endpoint-overrides in parallel through the request prop, keeps the suites whose schema is endpoints, counts those that have a saved baseUrl or autogptJwtSecret override, and renders one Sge form per endpoint whose onChanged triggers a reload. Shows the error alone when nothing has loaded yet, and rf while loading. */function wge({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState({}),[l,s]=x.useState(null),u=x.useCallback(async()=>{try{const[g,h]=await Promise.all([e("/api/suites"),e("/api/endpoint-overrides")]);n(g);const v={};for(const b of h.overrides)v[b.endpoint_path]={baseUrl:b.base_url,autogptJwtSecret:b.autogpt_jwt_secret};i(v),s(null)}catch(g){s(g instanceof Error?g.message:String(g))}},[e]);if(x.useEffect(()=>{let g=!1;return(async()=>g||await u())(),()=>{g=!0}},[u]),l&&!t)return p.jsx(yt,{message:l});if(!t)return p.jsx(rf,{});const f=t.suites.filter(g=>g.schema==="endpoints").map(g=>({relativePath:g.relativePath})),d=f.filter(g=>{const h=r[g.relativePath];return!!(h?.baseUrl||h?.autogptJwtSecret)}).length;return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Endpoints",title:"Endpoint overrides",meta:`${f.length} endpoint${f.length===1?"":"s"} · ${d} with overrides`}),l?p.jsx(yt,{message:l}):null,p.jsxs("p",{className:"text-sm text-muted-foreground mb-4",children:["Override values from any endpoint YAML. 
Saved overrides are applied whenever the dashboard server uses that endpoint, taking precedence over the YAML defaults (and any ",p.jsxs("code",{children:["$","{VAR}"]})," placeholders)."]}),f.length===0?p.jsx(rt,{className:"p-6 text-sm text-muted-foreground",children:"No endpoint suites found in your data path."}):p.jsx("div",{className:"flex flex-col gap-4",children:f.map(g=>p.jsx(Sge,{relativePath:g.relativePath,request:e,onChanged:()=>{u()}},g.relativePath))})]})}/* Sge: override form for one endpoint. Loads the defaults and saved override for the URL-encoded relativePath; submit sends a PUT with the trimmed base URL and JWT secret (an empty string becomes null) and reports Saved or Cleared; Clear override sends a DELETE and empties both fields. The JWT secret field is shown only when defaults.preset is autogpt. Every successful change calls onChanged. */function Sge({relativePath:e,request:t,onChanged:n}){const[r,i]=x.useState(null),[l,s]=x.useState(!0),[u,f]=x.useState(""),[d,g]=x.useState(""),[h,v]=x.useState(!1),[b,S]=x.useState(null),[j,E]=x.useState(null),O=x.useCallback(async()=>{s(!0);try{const k=await t(`/api/endpoint-overrides/${encodeURIComponent(e)}`);i(k),f(k.override?.base_url??""),g(k.override?.autogpt_jwt_secret??""),E(null)}catch(k){E(k instanceof Error?k.message:String(k))}finally{s(!1)}},[e,t]);x.useEffect(()=>{let k=!1;return(async()=>k||await O())(),()=>{k=!0}},[O]);const C=async k=>{k.preventDefault(),v(!0),S(null);try{const I=u.trim(),R=d.trim(),F=await t(`/api/endpoint-overrides/${encodeURIComponent(e)}`,Si("PUT",{base_url:I||null,autogpt_jwt_secret:R||null}));i(D=>D&&{...D,override:I||R?F.override:null}),S(I||R?"Saved.":"Cleared."),E(null),n()}catch(I){E(I instanceof Error?I.message:String(I))}finally{v(!1)}},_=async()=>{v(!0),S(null);try{await t(`/api/endpoint-overrides/${encodeURIComponent(e)}`,Si("DELETE")),f(""),g(""),i(k=>k&&{...k,override:null}),S("Cleared."),E(null),n()}catch(k){E(k instanceof Error?k.message:String(k))}finally{v(!1)}},P=!!(r?.override?.base_url||r?.override?.autogpt_jwt_secret),T=r?.defaults.preset==="autogpt";return p.jsxs(rt,{className:"p-4",children:[p.jsx("div",{className:"flex items-center justify-between gap-3 mb-3",children:p.jsxs("div",{className:"flex items-center gap-2 min-w-0",children:[p.jsx("span",{className:"font-mono text-sm 
break-all",children:e}),r?.defaults.transport?p.jsx(Pt,{tone:"info",children:r.defaults.transport}):null,P?p.jsx(Pt,{tone:"warn",children:"override saved"}):null]})}),l?p.jsx("div",{className:"text-xs text-muted-foreground",children:"Loading…"}):p.jsxs("form",{onSubmit:C,className:"flex flex-col gap-3",children:[p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-3",children:[p.jsx(Xe,{label:"YAML default",hint:r?.defaults.base_url&&r.defaults.base_url_resolved&&r.defaults.base_url!==r.defaults.base_url_resolved?`Resolves to ${r.defaults.base_url_resolved}`:"From the endpoint YAML's connection.base_url / connection.url",children:p.jsx(Kt,{value:r?.defaults.base_url??"",readOnly:!0,disabled:!0,className:"font-mono text-xs"})}),p.jsx(Xe,{label:"Override",hint:"Applied for every run that uses this endpoint. Leave blank to remove.",children:p.jsx(Kt,{value:u,onChange:k=>f(k.currentTarget.value),placeholder:r?.defaults.base_url_resolved??"https://staging.example",className:"font-mono text-xs"})})]}),T?p.jsx(Xe,{label:"AutoGPT JWT secret override",hint:"Overrides the secret used to forge AutoGPT bearer tokens for this endpoint. 
Leave blank to remove and fall back to AUTOGPT_JWT_SECRET, JWT_SECRET, or the built-in dev default.",children:p.jsx(Kt,{type:"password",value:d,onChange:k=>g(k.currentTarget.value),placeholder:"your-super-secret-jwt-token...",autoComplete:"off",className:"font-mono text-xs"})}):null,p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx(Le,{type:"submit",disabled:h,children:h?"Saving…":"Save"}),P?p.jsx(Le,{type:"button",variant:"ghost",disabled:h,onClick:()=>{_()},children:"Clear override"}):null,b?p.jsx("span",{className:"text-xs text-success",children:b}):null,j?p.jsx("span",{className:"text-xs text-destructive",children:j}):null]})]})]})}/* kt: returns e when it is a string with non-whitespace content, otherwise null. The string is returned untrimmed. */function kt(e){return typeof e=="string"&&e.trim()?e:null}/* ra: returns e when it is an array, otherwise a new empty array. */function ra(e){return Array.isArray(e)?e:[]}/* Xr: returns e when it is a truthy non-array object, otherwise null. */function Xr(e){return e&&typeof e=="object"&&!Array.isArray(e)?e:null}/* jge: badge variant and label for a turn role; unrecognised roles get the secondary variant and the role text itself as label. */function jge(e){switch(e){case"user":return{variant:"info",label:"User"};case"assistant":return{variant:"default",label:"Assistant"};case"checkpoint":return{variant:"warning",label:"Checkpoint"};case"inject":return{variant:"destructive",label:"Inject"};default:return{variant:"secondary",label:e}}}/* Es: small uppercase section label. */function Es({children:e}){return p.jsx("div",{className:"text-[10px] uppercase tracking-[0.12em] text-muted-foreground font-semibold mt-5 mb-2",children:e})}/* Xl: two-column label and value row; a truthy mono prop renders the value in a small monospace font. */function Xl({label:e,value:t,mono:n}){return p.jsxs("div",{className:"grid grid-cols-[120px_1fr] gap-2 py-1.5 text-sm border-b border-border last:border-b-0",children:[p.jsx("div",{className:"text-xs text-muted-foreground",children:e}),p.jsx("div",{className:`min-w-0 break-words ${n?"font-mono text-xs":""}`,children:t})]})}/* io: preformatted, wrapping text block. */function io({text:e}){return p.jsx("pre",{className:"whitespace-pre-wrap text-sm text-foreground bg-secondary/40 border border-border rounded-md px-3 py-2 font-mono leading-relaxed overflow-x-auto",children:e})}/* s5: card for one scenario turn: ordinal padded to two digits, role badge from jge (role defaults to turn), a verbatim marker when useExactMessage is strictly true, the content when non-blank, attachment badges (name, else last path segment, else path) and, for checkpoint turns only, the list of assertions. */function s5({turn:e,ordinal:t}){const n=String(e.role??"turn"),r=jge(n),i=kt(e.content),l=e.useExactMessage===!0,s=ra(e.attachments),u=ra(e.assertions);return 
p.jsxs(rt,{className:"p-3",children:[p.jsx("div",{className:"flex items-center justify-between mb-2 gap-2",children:p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("span",{className:"font-mono text-xs text-muted-foreground",children:t.toString().padStart(2,"0")}),p.jsx(ea,{variant:r.variant,className:"uppercase tracking-wider",children:r.label}),l?p.jsx(Pt,{tone:"warn",children:"verbatim"}):null]})}),i?p.jsx(io,{text:i}):null,s.length>0?p.jsx("div",{className:"mt-2 flex flex-wrap gap-1.5",children:s.map((f,d)=>{const g=Xr(f),h=kt(g?.path)??"",v=kt(g?.name)??h.split("/").pop();return p.jsx(Pt,{tone:"info",children:v||h},`${h}-${d}`)})}):null,n==="checkpoint"&&u.length>0?p.jsx("ul",{className:"mt-2 space-y-2",children:u.map((f,d)=>{const g=Xr(f);return g?p.jsxs("li",{className:"text-xs bg-secondary/40 border border-border rounded-md p-2 space-y-1",children:[kt(g.toolCalled)?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"tool:"})," ",p.jsx("span",{className:"font-mono",children:g.toolCalled})]}):null,Array.isArray(g.responseContainsAny)&&g.responseContainsAny.length>0?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"contains any:"})," ",p.jsx("span",{className:"font-mono",children:g.responseContainsAny.join(" | ")})]}):null,Array.isArray(g.responseMustNotContain)&&g.responseMustNotContain.length>0?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"must not contain:"})," ",p.jsx("span",{className:"font-mono",children:g.responseMustNotContain.join(" | ")})]}):null,kt(g.responseMentions)?p.jsxs("div",{children:[p.jsx("span",{className:"text-muted-foreground",children:"mentions:"})," ",p.jsx("span",{className:"font-mono",children:g.responseMentions})]}):null,Xr(g.withArgs)?p.jsx("pre",{className:"font-mono text-[11px] mt-1 whitespace-pre-wrap",children:JSON.stringify(g.withArgs,null,2)}):null]},`assert-${d}`):null})}):null]})}function 
/** Ege: renders one session. Header: a "Session N" badge (index + 1) plus optional id, "+timeOffset", "reset:" and "max turns:" pills (maxTurns only when it is a number). Body: one s5 card per object-valued turn with a 1-based ordinal, or "No turns in this session." when the turns list is empty. */Ege({session:e,index:t}){const n=ra(e.turns);return p.jsxs("div",{children:[p.jsxs("div",{className:"flex items-center gap-2 mb-2",children:[p.jsxs(ea,{variant:"secondary",className:"uppercase",children:["Session ",t+1]}),kt(e.id)?p.jsx("span",{className:"font-mono text-xs text-muted-foreground",children:e.id}):null,kt(e.timeOffset)?p.jsxs(Pt,{children:["+",e.timeOffset]}):null,kt(e.reset)?p.jsxs(Pt,{children:["reset: ",e.reset]}):null,typeof e.maxTurns=="number"?p.jsxs(Pt,{children:["max turns: ",e.maxTurns]}):null]}),p.jsxs("div",{className:"flex flex-col gap-2",children:[n.map((r,i)=>{const l=Xr(r);return l?p.jsx(s5,{turn:l,ordinal:i+1},`t-${i}`):null}),n.length===0?p.jsx("div",{className:"text-sm text-muted-foreground italic",children:"No turns in this session."}):null]})]})}/** Oge: expectations card. Returns null when all four lists (mustInclude, mustNotInclude, expectedTools, failureModes) are empty and kt rejects all four text fields. Otherwise renders, each only when present: outcome, expected behavior, must-include pills, must-not-include pills, expected tool calls (name or "?", "required" pill, "order:" pill when callOrder is a number), failure modes, ground truth, tester note. Non-object tool and failure-mode entries are skipped. */function Oge({expectations:e}){const t=ra(e.mustInclude),n=ra(e.mustNotInclude),r=ra(e.expectedTools),i=ra(e.failureModes),l=kt(e.expectedBehavior),s=kt(e.expectedOutcome),u=kt(e.groundTruth),f=kt(e.testerNote);return t.length===0&&n.length===0&&r.length===0&&i.length===0&&!l&&!s&&!u&&!f?null:p.jsxs(rt,{className:"p-4 space-y-3",children:[s?p.jsx(Xl,{label:"Outcome",value:s}):null,l?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Expected behavior"}),p.jsx(io,{text:l})]}):null,t.length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Response must include"}),p.jsx("div",{className:"flex flex-wrap gap-1.5",children:t.map((g,h)=>p.jsx(Pt,{tone:"success",children:g},`m-${h}`))})]}):null,n.length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Response must NOT include"}),p.jsx("div",{className:"flex flex-wrap gap-1.5",children:n.map((g,h)=>p.jsx(Pt,{tone:"warn",children:g},`mn-${h}`))})]}):null,r.length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Expected tool calls"}),p.jsx("ul",{className:"text-sm 
font-mono space-y-1",children:r.map((g,h)=>{const v=Xr(g);return v?p.jsxs("li",{className:"flex items-center gap-2 text-xs",children:[p.jsx("span",{children:kt(v.name)??"?"}),v.required?p.jsx(Pt,{tone:"warn",children:"required"}):null,typeof v.callOrder=="number"?p.jsxs(Pt,{children:["order: ",v.callOrder]}):null]},`tool-${h}`):null})})]}):null,i.length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Failure modes"}),p.jsx("ul",{className:"space-y-2",children:i.map((g,h)=>{const v=Xr(g);return v?p.jsxs("li",{className:"bg-secondary/40 border border-border rounded-md p-2",children:[p.jsx("div",{className:"font-medium text-sm",children:kt(v.name)??"(unnamed)"}),kt(v.description)?p.jsx("div",{className:"text-xs text-muted-foreground mt-0.5",children:kt(v.description)}):null]},`fm-${h}`):null})})]}):null,u?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Ground truth"}),p.jsx(io,{text:u})]}):null,f?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Tester note"}),p.jsx(io,{text:f})]}):null]})}/** Age: context card. Returns null when kt rejects systemPrompt, userName and copilotMode and Xr rejects injectedData. Otherwise shows "user:" / "mode:" pills, the system prompt, and injectedData as 2-space-indented JSON (only when the object has at least one key). */function Age({context:e}){const t=kt(e.systemPrompt),n=kt(e.userName),r=kt(e.copilotMode),i=Xr(e.injectedData);return!t&&!n&&!r&&!i?null:p.jsxs(rt,{className:"p-4 space-y-3",children:[n||r?p.jsxs("div",{className:"flex flex-wrap gap-2",children:[n?p.jsxs(Pt,{tone:"info",children:["user: ",n]}):null,r?p.jsxs(Pt,{tone:"info",children:["mode: ",r]}):null]}):null,t?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"System prompt"}),p.jsx(io,{text:t})]}):null,i&&Object.keys(i).length>0?p.jsxs("div",{children:[p.jsx("div",{className:"text-xs text-muted-foreground mb-1",children:"Injected data"}),p.jsx(io,{text:JSON.stringify(i,null,2)})]}):null]})}function 
u5({open:e,target:t,request:n,onClose:r}){const[i,l]=x.useState(null),[s,u]=x.useState(null),[f,d]=x.useState(!1),[g,h]=x.useState(!1);if(x.useEffect(()=>{if(!e||!t)return;let F=!1;return d(!0),u(null),l(null),h(!1),n(`/api/scenarios/lookup?file=${encodeURIComponent(t.file)}&id=${encodeURIComponent(t.id)}`).then(D=>{F||l(D)}).catch(D=>{F||u(D instanceof Error?D.message:String(D))}).finally(()=>{F||d(!1)}),()=>{F=!0}},[e,t,n]),!t)return null;const v=i?.scenario??null,b=kt(v?.name)??t.name??t.id,S=kt(v?.description)??t.description??null,j=v?.tags??t.tags??[],E=kt(v?.priority)??t.priority??null,O=kt(v?.persona),C=kt(v?.rubric),_=v?.maxTurns,P=kt(v?.baseDate),T=ra(v?.sessions),k=ra(v?.turns),I=Xr(v?.expectations),R=Xr(v?.context);return p.jsxs(M1,{open:e,onClose:r,size:"lg",title:p.jsxs("div",{className:"flex items-baseline gap-2 flex-wrap pr-6",children:[p.jsx("span",{className:"text-foreground",children:b}),E?p.jsx(Pt,{tone:"info",children:E}):null]}),description:p.jsxs("span",{className:"font-mono text-xs text-muted-foreground",children:[t.id," · ",t.file]}),footer:p.jsxs(p.Fragment,{children:[p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>h(F=>!F),disabled:!v,children:g?"Hide raw spec":"View raw spec"}),p.jsx(Le,{onClick:r,children:"Close"})]}),children:[f?p.jsx(rf,{label:"Loading scenario…"}):null,s?p.jsx(yt,{message:s}):null,v?p.jsxs("div",{className:"space-y-1",children:[S?p.jsx("p",{className:"text-sm text-foreground leading-relaxed",children:S}):null,p.jsx(Es,{children:"Metadata"}),p.jsxs(rt,{className:"px-4 py-2",children:[p.jsx(Xl,{label:"Suite",value:t.file,mono:!0}),O?p.jsx(Xl,{label:"Persona",value:O,mono:!0}):null,C?p.jsx(Xl,{label:"Rubric",value:C,mono:!0}):null,typeof _=="number"?p.jsx(Xl,{label:"Max turns",value:_,mono:!0}):null,P?p.jsx(Xl,{label:"Base date",value:P,mono:!0}):null,j.length>0?p.jsx(Xl,{label:"Tags",value:p.jsx("div",{className:"flex flex-wrap 
gap-1.5",children:j.map(F=>p.jsx(Pt,{children:F},F))})}):null]}),R?p.jsxs(p.Fragment,{children:[p.jsx(Es,{children:"Context"}),p.jsx(Age,{context:R})]}):null,T.length>0?p.jsxs(p.Fragment,{children:[p.jsx(Es,{children:"Sessions"}),p.jsx("div",{className:"space-y-4",children:T.map((F,D)=>{const H=Xr(F);return H?p.jsx(Ege,{session:H,index:D},`s-${D}`):null})})]}):null,T.length===0&&k.length>0?p.jsxs(p.Fragment,{children:[p.jsx(Es,{children:"Turns"}),p.jsx("div",{className:"space-y-2",children:k.map((F,D)=>{const H=Xr(F);return H?p.jsx(s5,{turn:H,ordinal:D+1},`t-${D}`):null})})]}):null,I?p.jsxs(p.Fragment,{children:[p.jsx(Es,{children:"Expectations"}),p.jsx(Oge,{expectations:I})]}):null,g?p.jsxs(p.Fragment,{children:[p.jsx(T7,{className:"my-4"}),p.jsx(Es,{children:"Raw spec"}),p.jsx(io,{text:JSON.stringify(v,null,2)})]}):null]}):null]})}function Kl(e,t){return`${e}::${t}`}function Cge({presetId:e,request:t,navigate:n}){const[r,i]=x.useState(null),[l,s]=x.useState(null),[u,f]=x.useState(null),[d,g]=x.useState(null),[h,v]=x.useState(!1),[b,S]=x.useState(""),[j,E]=x.useState(""),[O,C]=x.useState(""),[_,P]=x.useState(""),[T,k]=x.useState(""),[I,R]=x.useState(1),[F,D]=x.useState(!1),[H,$]=x.useState(2),[J,G]=x.useState(!1),[Q,B]=x.useState(new Set),[Y,ie]=x.useState(""),[ae,M]=x.useState(""),[L,te]=x.useState(""),[z,he]=x.useState("all"),[ve,pe]=x.useState(null);x.useEffect(()=>{let de=!1;return Promise.all([t(`/api/presets/${encodeURIComponent(e)}`),t("/api/scenarios"),t("/api/suites")]).then(([Re,Rt,wn])=>{if(de)return;const sn=Re.preset;i(sn),s(Rt),f(wn),S(sn.name),E(sn.description??""),C(sn.endpoint),P(sn.personas),k(sn.rubric),R(sn.repeat),D(sn.parallel.enabled),$(sn.parallel.limit??2),G(sn.dry_run),B(new Set(sn.selection.map(Co=>Kl(Co.file,Co.id))))}).catch(Re=>{de||g(Re instanceof Error?Re.message:String(Re))}),()=>{de=!0}},[t,e]);const W=x.useMemo(()=>{if(!l)return[];const de=new Set;for(const Re of l.scenarios)for(const Rt of 
Re.tags)de.add(Rt);return[...de].sort()},[l]),ge=x.useMemo(()=>{if(!l)return[];const de=new Set;for(const Re of l.scenarios)Re.priority&&de.add(Re.priority);return[...de].sort()},[l]),ne=x.useMemo(()=>{if(!l)return[];const de=Y.trim().toLowerCase();return l.scenarios.filter(Re=>{if(de&&!Re.id.toLowerCase().includes(de)&&!Re.name.toLowerCase().includes(de)&&!(Re.description??"").toLowerCase().includes(de)&&!Re.sourcePath.toLowerCase().includes(de)||ae&&!Re.tags.includes(ae)||L&&Re.priority!==L)return!1;if(z!=="all"){const Rt=Q.has(Kl(Re.sourcePath,Re.id));if(z==="selected"&&!Rt||z==="unselected"&&Rt)return!1}return!0})},[l,Y,ae,L,z,Q]),se=x.useMemo(()=>u?.suites.filter(de=>de.schema==="endpoints")??[],[u]),ye=x.useMemo(()=>u?.suites.filter(de=>de.schema==="personas")??[],[u]),oe=x.useMemo(()=>u?.suites.filter(de=>de.schema==="rubrics")??[],[u]),$e=(de,Re)=>{const Rt=Kl(de,Re),wn=new Set(Q);wn.has(Rt)?wn.delete(Rt):wn.add(Rt),B(wn)},ke=()=>{const de=new Set(Q);for(const Re of ne)de.add(Kl(Re.sourcePath,Re.id));B(de)},xt=()=>{const de=new Set(Q);for(const Re of ne)de.delete(Kl(Re.sourcePath,Re.id));B(de)},wt=async()=>{v(!0),g(null);try{if(!l)throw new Error("Scenarios not loaded.");const de=[];for(const Re of l.scenarios){const Rt=Kl(Re.sourcePath,Re.id);Q.has(Rt)&&de.push({file:Re.sourcePath,id:Re.id})}if(de.length===0)throw new Error("Select at least one scenario.");await t(`/api/presets/${encodeURIComponent(e)}`,Si("PUT",{name:b.trim(),description:j.trim()||null,endpoint:O,personas:_,rubric:T,selection:de,parallel:{enabled:F,limit:F?H:null},repeat:I,dry_run:J})),n(`/presets/${encodeURIComponent(e)}`)}catch(de){g(de instanceof Error?de.message:String(de))}finally{v(!1)}};return d&&!r?p.jsx(yt,{message:d}):!r||!l||!u?p.jsx(rf,{}):p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Edit Preset",title:b||r.name,meta:`${Q.size} scenario${Q.size===1?"":"s"} 
selected`,actions:p.jsxs(p.Fragment,{children:[p.jsx(Le,{variant:"secondary",onClick:()=>n(`/presets/${encodeURIComponent(e)}`),children:"Cancel"}),p.jsx(Le,{onClick:()=>void wt(),disabled:h,children:h?"Saving…":"Save changes"})]})}),d?p.jsx(yt,{message:d}):null,p.jsxs("div",{className:"grid grid-cols-1 lg:grid-cols-[minmax(0,1fr)_320px] gap-4 mb-6",children:[p.jsxs(rt,{className:"p-4 flex flex-col gap-3",children:[p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-3",children:[p.jsx(Xe,{label:"Name",children:p.jsx(Kt,{value:b,onChange:de=>S(de.currentTarget.value),required:!0})}),p.jsx(Xe,{label:"Description",children:p.jsx(Kt,{value:j,onChange:de=>E(de.currentTarget.value),placeholder:"Short summary"})})]}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Endpoint",children:p.jsx(sr,{value:O,onValueChange:C,options:se.map(de=>({value:de.relativePath,label:de.relativePath})),emptyLabel:"No endpoint suites"})}),p.jsx(Xe,{label:"Personas",children:p.jsx(sr,{value:_,onValueChange:P,options:ye.map(de=>({value:de.relativePath,label:de.relativePath})),emptyLabel:"No persona suites"})}),p.jsx(Xe,{label:"Rubric",children:p.jsx(sr,{value:T,onValueChange:k,options:oe.map(de=>({value:de.relativePath,label:de.relativePath})),emptyLabel:"No rubric suites"})})]})]}),p.jsxs(rt,{className:"p-4 flex flex-col gap-3",children:[p.jsx(Xe,{label:"Repeat",children:p.jsx(Kt,{type:"number",min:1,value:I,onChange:de=>R(Number(de.currentTarget.value))})}),p.jsx(Xe,{label:"Parallel",hint:"Run multiple scenarios concurrently. 
Limit caps concurrency (2–4 is typical; higher = faster but more LLM cost spikes).",children:p.jsxs("div",{className:"flex items-center gap-3",children:[p.jsx(bi,{checked:F,onChange:D,label:"Enabled"}),p.jsx(Kt,{type:"number",min:1,value:H,disabled:!F,onChange:de=>$(Number(de.currentTarget.value)),className:"w-20"})]})}),p.jsx(Xe,{label:"Mode",hint:"Dry run records run + scenario rows but skips the live adapter, judge, and scorers. Use to validate config / preset shape without spending LLM tokens.",children:p.jsx(bi,{checked:J,onChange:G,label:"Dry run by default"})})]})]}),p.jsxs(rt,{className:"overflow-hidden",children:[p.jsxs("div",{className:"p-3 border-b border-border flex flex-wrap items-center gap-2",children:[p.jsx(Kt,{placeholder:"Filter by id, name, or path…",value:Y,onChange:de=>ie(de.currentTarget.value),className:"max-w-xs"}),p.jsx(sr,{value:ae||"__all_tags__",onValueChange:de=>M(de==="__all_tags__"?"":de),className:"max-w-xs",options:[{value:"__all_tags__",label:"All tags"},...W.map(de=>({value:de,label:de}))]}),p.jsx(sr,{value:L||"__all_priorities__",onValueChange:de=>te(de==="__all_priorities__"?"":de),className:"max-w-xs",options:[{value:"__all_priorities__",label:"All priorities"},...ge.map(de=>({value:de,label:de}))]}),p.jsx(sr,{value:z,onValueChange:de=>he(de),className:"max-w-xs",options:[{value:"all",label:"All scenarios"},{value:"selected",label:"Included only"},{value:"unselected",label:"Not included"}]}),p.jsx("div",{className:"flex-1"}),p.jsxs("span",{className:"text-xs text-muted-foreground mr-2",children:[ne.length," matching · ",Q.size," selected"]}),p.jsx(Le,{variant:"secondary",size:"sm",onClick:ke,children:"Select shown"}),p.jsx(Le,{variant:"ghost",size:"sm",onClick:xt,children:"Clear shown"})]}),p.jsx("div",{className:"max-h-[480px] overflow-y-auto divide-y divide-border",children:ne.length===0?p.jsx("div",{className:"p-6 text-center text-muted-foreground text-sm",children:"No scenarios match."}):ne.map(de=>{const 
Re=Kl(de.sourcePath,de.id),Rt=Q.has(Re);return p.jsxs("div",{className:`flex items-start gap-3 px-3 py-2.5 hover:bg-secondary ${Rt?"bg-primary/5":""}`,children:[p.jsxs("label",{className:"flex items-start gap-3 flex-1 min-w-0 cursor-pointer",children:[p.jsx("input",{type:"checkbox",checked:Rt,onChange:()=>$e(de.sourcePath,de.id),className:"size-4 mt-0.5 accent-primary shrink-0"}),p.jsxs("div",{className:"flex-1 min-w-0",children:[p.jsxs("div",{className:"flex items-baseline gap-2 flex-wrap",children:[p.jsx("span",{className:"text-sm font-medium text-foreground",children:de.name||de.id}),p.jsx("span",{className:"font-mono text-[11px] text-muted-foreground",children:de.id}),de.priority?p.jsx(Pt,{tone:"info",children:de.priority}):null]}),de.description?p.jsx("div",{className:"text-xs text-muted-foreground mt-0.5 line-clamp-2",children:de.description}):null,p.jsxs("div",{className:"flex items-center gap-1.5 mt-1 flex-wrap",children:[de.tags.slice(0,5).map(wn=>p.jsx(Pt,{children:wn},wn)),p.jsx("span",{className:"text-[10px] text-muted-foreground/70 font-mono",children:de.sourcePath})]})]})]}),p.jsx(Le,{type:"button",variant:"ghost",size:"sm",className:"shrink-0 self-start",onClick:()=>pe({file:de.sourcePath,id:de.id,name:de.name,description:de.description,tags:de.tags,priority:de.priority}),children:"Details"})]},Re)})})]}),p.jsx(u5,{open:ve!=null,target:ve,request:t,onClose:()=>pe(null)})]})}function _ge(e){if(!e)return"—";try{return new Date(e).toLocaleString()}catch{return e}}function kge(e,t){const n=Date.parse(e);if(Number.isNaN(n))return null;const r=t?Date.parse(t):Date.now();return Number.isNaN(r)?null:Math.max(0,(r-n)/1e3)}function Nge(e){if(e==null)return"—";if(e<60)return`${Math.round(e)}s`;const t=Math.floor(e/60),n=Math.round(e%60);return`${t}m ${n}s`}function Tge({runs:e,navigate:t,presetName:n}){const[r,i]=x.useState(new Set),l=x.useMemo(()=>[...e].sort((h,v)=>Date.parse(v.startedAt)-Date.parse(h.startedAt)),[e]),s=h=>{const v=new 
Set(r);v.has(h)?v.delete(h):v.add(h),i(v)},u=()=>{if(r.size<2)return;const h=l.filter(v=>r.has(v.runId)).slice(0,10).map(v=>encodeURIComponent(v.runId)).join(",");t(`/compare?run_ids=${h}`)},f=()=>{if(l.length<2)return;const h=l.slice(0,2).map(v=>encodeURIComponent(v.runId)).join(",");t(`/compare?run_ids=${h}`)};if(l.length===0)return p.jsx(Jp,{title:"No runs yet",description:`Launch ${n} to see results here.`});const d=r.size<2,g=r.size>10;return p.jsxs(p.Fragment,{children:[p.jsxs("div",{className:"flex flex-wrap items-center gap-2 mb-3",children:[p.jsx("div",{className:"text-sm text-muted-foreground",children:r.size===0?`${l.length} run${l.length===1?"":"s"} · select 2–10 to compare`:`${r.size} selected${g?" (max 10)":""}`}),p.jsx("div",{className:"flex-1"}),l.length>=2?p.jsx(Le,{variant:"secondary",size:"sm",onClick:f,children:"Compare latest two"}):null,p.jsxs(Le,{size:"sm",onClick:u,disabled:d||g,children:["Compare ",r.size>0?`(${Math.min(r.size,10)})`:""]})]}),p.jsx(rt,{className:"overflow-hidden",children:p.jsx("div",{className:"overflow-x-auto",children:p.jsxs("table",{className:"w-full text-sm",children:[p.jsx("thead",{className:"bg-secondary",children:p.jsxs("tr",{className:"text-left text-muted-foreground text-xs uppercase tracking-wider",children:[p.jsx("th",{className:"px-3 py-2 w-8"}),p.jsx("th",{className:"px-3 py-2",children:"Name"}),p.jsx("th",{className:"px-3 py-2",children:"Status"}),p.jsx("th",{className:"px-3 py-2",children:"Started"}),p.jsx("th",{className:"px-3 py-2",children:"Duration"}),p.jsx("th",{className:"px-3 py-2 text-right",children:"Pass / Total"}),p.jsx("th",{className:"px-3 py-2",children:"Notes"})]})}),p.jsx("tbody",{className:"divide-y divide-border",children:l.map(h=>{const v=r.has(h.runId);return p.jsxs("tr",{className:v?"bg-primary/5 hover:bg-primary/10":"hover:bg-secondary",children:[p.jsx("td",{className:"px-3 py-2 align-top",children:p.jsx(bi,{checked:v,onChange:()=>s(h.runId)})}),p.jsxs("td",{className:"px-3 py-2 
align-top",children:[p.jsx("a",{href:`/runs/${encodeURIComponent(h.runId)}`,className:"text-foreground hover:text-primary block",children:h.label?p.jsx("span",{className:"font-medium",children:h.label}):p.jsxs("span",{className:"font-mono text-xs text-muted-foreground",children:[h.runId.slice(0,12),"…"]})}),h.label?p.jsx("span",{className:"font-mono text-[10px] text-muted-foreground/70",children:h.runId.slice(0,12)}):null]}),p.jsx("td",{className:"px-3 py-2 align-top",children:p.jsx(Qp,{run:h})}),p.jsx("td",{className:"px-3 py-2 align-top text-muted-foreground",children:_ge(h.startedAt)}),p.jsx("td",{className:"px-3 py-2 align-top text-muted-foreground",children:Nge(kge(h.startedAt,h.completedAt))}),p.jsxs("td",{className:"px-3 py-2 align-top text-right font-mono",children:[h.aggregateCounts.scenarioPassedCount,"/",h.aggregateCounts.scenarioTotal]}),p.jsx("td",{className:"px-3 py-2 align-top text-muted-foreground max-w-[280px] truncate",children:h.notes??"—"})]},h.runId)})})]})})})]})}function c5({open:e,options:t,request:n,onClose:r,onLaunched:i,suites:l}){const[s,u]=x.useState(""),[f,d]=x.useState(""),[g,h]=x.useState(""),[v,b]=x.useState(""),[S,j]=x.useState(!1),[E,O]=x.useState(2),[C,_]=x.useState(1),[P,T]=x.useState(!1),[k,I]=x.useState(""),[R,F]=x.useState(""),[D,H]=x.useState(null),[$,J]=x.useState(!1);x.useEffect(()=>{!e||!t||(u(t.defaults.endpoint),d(""),h(t.defaults.personas),b(t.defaults.rubric),j(t.defaults.parallelEnabled),O(t.defaults.parallelLimit??2),_(t.defaults.repeat),T(t.defaults.dryRun),I(""),F(""),H(null))},[e,t]);const G=x.useMemo(()=>l?.suites.filter(ae=>ae.schema==="endpoints")??[],[l]),Q=x.useMemo(()=>l?.suites.filter(ae=>ae.schema==="personas")??[],[l]),B=x.useMemo(()=>l?.suites.filter(ae=>ae.schema==="rubrics")??[],[l]),Y=x.useMemo(()=>{const ae=s.toLowerCase();return ae.includes("autogpt")?"autogpt":ae.includes("openclaw")?"openclaw":ae.includes("opencode")?"opencode":"custom"},[s]);if(!t)return null;const ie=async 
ae=>{ae.preventDefault(),J(!0),H(null);try{const M={parallel:{enabled:S,limit:S?E:void 0},repeat:C,dry_run:P};s&&s!==t.defaults.endpoint&&(M.endpoint=s);const L=f.trim();L&&(M.base_url=L),g&&g!==t.defaults.personas&&(M.personas=g),v&&v!==t.defaults.rubric&&(M.rubric=v);const te={overrides:M};k.trim()&&(te.label=k.trim()),R.trim()&&(te.notes=R.trim());const z=await n(`/api/presets/${encodeURIComponent(t.presetId)}/runs`,Si("POST",te));i(z.run_id)}catch(M){H(M instanceof Error?M.message:String(M))}finally{J(!1)}};return p.jsxs(M1,{open:e,onClose:r,title:`Run ${t.presetName}`,size:"lg",footer:p.jsxs(p.Fragment,{children:[p.jsx(Le,{variant:"ghost",onClick:r,disabled:$,children:"Cancel"}),p.jsx(Le,{type:"submit",form:"run-launch-form",disabled:$,children:$?"Starting…":"Start run"})]}),children:[D?p.jsx(yt,{message:D}):null,p.jsxs("form",{id:"run-launch-form",onSubmit:ie,className:"flex flex-col gap-4",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Endpoint"}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-[2fr_auto] gap-2 items-center",children:[p.jsx(sr,{value:s,onValueChange:u,options:[...G.map(ae=>({value:ae.relativePath,label:ae.relativePath})),...s&&!G.find(ae=>ae.relativePath===s)?[{value:s,label:s}]:[]],emptyLabel:"No endpoint suites found"}),p.jsx(Pt,{tone:Y==="custom"?"warn":"info",children:Y})]}),p.jsx("div",{className:"text-xs text-muted-foreground mt-1",children:"Pick a different endpoint YAML to retarget the run (e.g. an autogpt staging endpoint vs. an openclaw gateway)."})]}),p.jsx(Xe,{label:"Base URL override",hint:"Replaces connection.base_url (HTTP) or connection.url (WebSocket) from the endpoint YAML for this run only. Leave blank to use the YAML default.",children:p.jsx(Kt,{value:f,onChange:ae=>d(ae.currentTarget.value),placeholder:"e.g. 
https://staging.autogpt.example or ws://10.0.0.5:18789"})}),p.jsxs("details",{className:"rounded-md border border-border bg-secondary p-3",children:[p.jsx("summary",{className:"cursor-pointer text-sm font-medium text-muted-foreground hover:text-foreground",children:"Override personas / rubric"}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-3 mt-3",children:[p.jsx(Xe,{label:"Personas",children:p.jsx(sr,{value:g,onValueChange:h,options:Q.map(ae=>({value:ae.relativePath,label:ae.relativePath})),emptyLabel:"No persona suites"})}),p.jsx(Xe,{label:"Rubric",children:p.jsx(sr,{value:v,onValueChange:b,options:B.map(ae=>({value:ae.relativePath,label:ae.relativePath})),emptyLabel:"No rubric suites"})})]})]}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Parallel",hint:"Run multiple scenarios concurrently. Scenarios still complete in order, but several at a time.",children:p.jsx("div",{className:"flex items-center gap-2",children:p.jsx(bi,{checked:S,onChange:j,label:"Enabled"})})}),p.jsx(Xe,{label:"Parallel limit",hint:"Max concurrent scenarios when parallel is on. Higher = faster but more LLM cost spikes; 2–4 is typical.",children:p.jsx(Kt,{type:"number",min:1,value:E,disabled:!S,onChange:ae=>O(Number(ae.currentTarget.value))})}),p.jsx(Xe,{label:"Repeat",children:p.jsx(Kt,{type:"number",min:1,value:C,onChange:ae=>_(Number(ae.currentTarget.value))})})]}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-3",children:[p.jsx(Xe,{label:"Run name",hint:"Shows in the run list — useful for comparing runs later.",children:p.jsx(Kt,{value:k,onChange:ae=>I(ae.currentTarget.value),placeholder:"e.g. autogpt-staging baseline",maxLength:200})}),p.jsx(Xe,{label:"Mode",hint:"Dry run records run + scenario rows but skips the live adapter, judge, and scorers. 
Use to validate config / preset shape without spending LLM tokens.",children:p.jsx(bi,{checked:P,onChange:T,label:"Dry run"})})]}),p.jsx(Xe,{label:"Notes",hint:"Optional context — diff vs. last run, hypotheses, etc.",children:p.jsx(eD,{value:R,onChange:ae=>F(ae.currentTarget.value),rows:3,maxLength:4e3,placeholder:"Why this run?"})})]})]})}function Pge({run:e,request:t,onUpdated:n}){const[r,i]=x.useState(!1),[l,s]=x.useState(e.label??""),[u,f]=x.useState(!1),[d,g]=x.useState(e.notes??""),[h,v]=x.useState(!1),[b,S]=x.useState(null);x.useEffect(()=>{s(e.label??""),g(e.notes??"")},[e.label,e.notes]);const j=async C=>{v(!0),S(null);try{const _=await t(`/api/runs/${encodeURIComponent(e.runId)}`,Si("PATCH",C));n(_.run)}catch(_){throw S(_ instanceof Error?_.message:String(_)),_}finally{v(!1)}},E=async()=>{try{await j({label:l.trim()?l.trim():null}),i(!1)}catch{}},O=async()=>{try{await j({notes:d.trim()?d.trim():null}),f(!1)}catch{}};return p.jsxs(rt,{className:"p-4 mb-4",children:[b?p.jsx(yt,{message:b}):null,p.jsxs("div",{className:"mb-4",children:[p.jsx("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",children:"Run name"}),r?p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx(Kt,{value:l,onChange:C=>s(C.currentTarget.value),maxLength:200,autoFocus:!0,placeholder:"e.g. 
autogpt staging baseline",onKeyDown:C=>{C.key==="Enter"&&E(),C.key==="Escape"&&(s(e.label??""),i(!1))}}),p.jsx(Le,{onClick:()=>void E(),disabled:h,size:"sm",children:"Save"}),p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>{s(e.label??""),i(!1)},disabled:h,children:"Cancel"})]}):p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx("span",{className:"text-base text-foreground",children:e.label??p.jsx("span",{className:"text-muted-foreground/70 italic",children:"Untitled run — click rename to add a name"})}),p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>i(!0),children:"Rename"})]})]}),p.jsxs("div",{children:[p.jsxs("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1 flex items-center justify-between",children:[p.jsx("span",{children:"Notes"}),!u&&p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>f(!0),children:e.notes?"Edit":"Add notes"})]}),u?p.jsxs("div",{className:"flex flex-col gap-2",children:[p.jsx(eD,{value:d,onChange:C=>g(C.currentTarget.value),rows:4,maxLength:4e3,autoFocus:!0,placeholder:"Hypotheses, observations, comparison context…"}),p.jsxs("div",{className:"flex items-center gap-2 justify-end",children:[p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>{g(e.notes??""),f(!1)},disabled:h,children:"Cancel"}),p.jsx(Le,{onClick:()=>void O(),disabled:h,size:"sm",children:"Save notes"})]})]}):e.notes?p.jsx("div",{className:"text-sm text-foreground whitespace-pre-wrap",children:e.notes}):p.jsx("div",{className:"text-sm text-muted-foreground/70 italic",children:"No notes yet."})]})]})}function Mge(e){if(e===null||!Number.isFinite(e))return{label:"—",className:"bg-secondary text-muted-foreground"};const t=e.toFixed(2);return e>=.7?{label:`r=${t}`,className:"bg-green-500/15 text-green-600 dark:text-green-400 border border-green-500/30"}:e>=.3?{label:`r=${t}`,className:"bg-amber-500/15 text-amber-600 dark:text-amber-400 border border-amber-500/30"}:{label:`r=${t}`,className:"bg-red-500/15 text-red-600 
dark:text-red-400 border border-red-500/30"}}function l1(e){const t=e.labels??{},n=Object.keys(t).map(i=>({value:Number(i),label:t[i]??""})).filter(i=>Number.isFinite(i.value));if(n.length>0)return n.sort((i,l)=>i.value-l.value),n;if(e.type==="binary")return[{value:0,label:"0"},{value:1,label:"1"}];const r=e.points??5;return Array.from({length:r},(i,l)=>({value:l+1,label:String(l+1)}))}function Rge({request:e,navigate:t}){const[n,r]=x.useState(null),[i,l]=x.useState(null);return x.useEffect(()=>{let s=!1;return e("/api/human-scoring/rubrics").then(u=>{s||(r(u),l(null))}).catch(u=>{s||l(u instanceof Error?u.message:String(u))}),()=>{s=!0}},[e]),i?p.jsx(yt,{message:i}):n?n.rubrics.length===0?p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Human scoring",title:"Score completed runs",meta:"No completed runs found yet."}),p.jsx(rt,{className:"p-6",children:p.jsx("div",{className:"text-sm text-muted-foreground",children:"Run an evaluation, then return here to score it by hand."})})]}):p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Human scoring",title:"Score completed runs",meta:"Pick a rubric dimension. 
You'll click through completed runs scoring just that dimension, one chat at a time."}),n.rubrics.map(s=>p.jsxs(rt,{className:"p-4 mb-3",children:[p.jsxs("div",{className:"flex items-baseline justify-between mb-1",children:[p.jsx("h3",{className:"text-base font-semibold tracking-tight m-0",children:s.rubricName}),p.jsx("span",{className:"text-xs text-muted-foreground font-mono",children:s.rubricId})]}),p.jsxs("div",{className:"text-xs text-muted-foreground mb-3",children:[s.totalScenarios," completed scenario",s.totalScenarios===1?"":"s"]}),p.jsx("div",{className:"space-y-2",children:s.dimensions.map(u=>{const f=u.unscored===0,d=`/score/${encodeURIComponent(s.rubricId)}/${encodeURIComponent(u.id)}`,g=Mge(u.correlation),h=u.correlation===null?u.pairedCount===0?"No human scores yet.":`Only ${u.pairedCount} paired score${u.pairedCount===1?"":"s"} — need 2+ for correlation.`:`Pearson correlation between ${u.pairedCount} paired human and judge scores.`;return p.jsxs("a",{href:d,className:`flex items-center justify-between gap-3 px-3 py-2.5 rounded-md border border-border bg-secondary/40 hover:border-primary hover:bg-secondary no-underline transition-colors ${f?"opacity-50":""}`,onClick:v=>{v.preventDefault(),t(d)},children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-sm font-medium text-foreground",children:u.name}),p.jsxs("div",{className:"text-xs text-muted-foreground",children:["weight ",u.weight," · ",u.scale.type]})]}),p.jsxs("div",{className:"flex items-center gap-2 whitespace-nowrap",children:[p.jsx("span",{title:h,className:`px-2 py-0.5 rounded-full font-mono text-[11px] tabular-nums ${g.className}`,children:g.label}),p.jsxs("span",{className:"font-mono text-xs text-foreground",children:[u.unscored," ",p.jsxs("span",{className:"text-muted-foreground",children:["/ ",s.totalScenarios," unscored"]})]})]})]},u.id)})})]},s.rubricId))]}):p.jsx(Ci,{withMeta:!0})}function PP(e){return typeof e=="string"&&e.trim()?e.trim():null}function MP(e){return 
Array.isArray(e)?e.map(t=>{if(typeof t=="string")return t.trim();if(t&&typeof t=="object"){const n=t.fact;if(typeof n=="string")return n.trim()}return""}).filter(t=>t.length>0):[]}function Dge(e){const t=e.expectations&&typeof e.expectations=="object"&&!Array.isArray(e.expectations)?e.expectations:{},n=PP(t.expected_behavior),r=PP(t.expected_outcome),i=MP(t.must_include),l=MP(t.must_not_include),s=Array.isArray(t.expected_tools)?t.expected_tools.map(u=>{if(typeof u=="string")return u;if(u&&typeof u=="object"){const f=u.name;return typeof f=="string"?f:""}return""}).filter(u=>u.length>0):[];return{scenarioName:e.scenarioName,scenarioId:e.scenarioId,description:e.scenarioDescription,expectedBehavior:n,expectedOutcome:r,mustInclude:i,mustNotInclude:l,expectedTools:s}}function zge(e){return{scenario_id:e.scenarioId,scenario_name:e.scenarioName,passed:!1,overall_score:e.overallScore,pass_threshold:e.passThreshold,status:"completed",turns:e.turns??[],tool_calls:e.toolCalls??[],target_events:e.targetEvents??[],checkpoints:[],judge_dimension_scores:[]}}const RP=420;function Ige({dimension:e,onSelect:t,submitting:n}){const r=l1(e.scale);return p.jsxs("div",{className:"space-y-3 text-sm",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",children:"Dimension"}),p.jsx("div",{className:"text-foreground font-medium",children:e.name}),p.jsxs("div",{className:"text-[11px] text-muted-foreground font-mono mt-0.5",children:["weight ",e.weight," · ",e.scale.type,e.scale.points?` · 1–${e.scale.points}`:""]})]}),p.jsxs("div",{className:"text-[11px] text-muted-foreground",children:["Press ",p.jsx("span",{className:"font-mono",children:"1"}),"–",p.jsx("span",{className:"font-mono",children:r.length})," on your keyboard, or click a level below."]}),p.jsx("div",{className:"space-y-2",children:r.map(i=>p.jsxs("button",{type:"button",disabled:n,onClick:()=>t(i.value),className:"w-full grid 
/**
 * Body of the "Objective" panel: shows the scenario identity followed by the
 * optional description, expected behavior/outcome, must / must-not include
 * lists and expected tool names. Sections whose value is falsy (text) or
 * empty (lists) render as `null`.
 *
 * @param {{objective: object}} props - `objective` carries `scenarioName`,
 *   `scenarioId`, `description`, `expectedBehavior`, `expectedOutcome`,
 *   `mustInclude`, `mustNotInclude` and `expectedTools`.
 * @returns The panel element tree.
 */
function Lge({ objective }) {
  const heading = (text) =>
    p.jsx("div", {
      className:
        "text-[10px] uppercase tracking-wider text-muted-foreground font-semibold mb-1",
      children: text,
    });

  // Free-text section; omitted when the text is falsy.
  const proseSection = (title, text) => {
    if (!text) return null;
    return p.jsxs("div", {
      children: [
        heading(title),
        p.jsx("div", {
          className: "text-foreground whitespace-pre-wrap",
          children: text,
        }),
      ],
    });
  };

  // Bulleted section; omitted when the list is empty. Items are keyed by index.
  const bulletSection = (title, items) => {
    if (!(items.length > 0)) return null;
    return p.jsxs("div", {
      children: [
        heading(title),
        p.jsx("ul", {
          className: "list-disc list-inside text-foreground space-y-0.5",
          children: items.map((entry, position) =>
            p.jsx("li", { children: entry }, position),
          ),
        }),
      ],
    });
  };

  const scenarioSection = p.jsxs("div", {
    children: [
      heading("Scenario"),
      p.jsx("div", {
        className: "text-foreground font-medium",
        children: objective.scenarioName,
      }),
      p.jsx("div", {
        className: "text-[11px] text-muted-foreground font-mono mt-0.5",
        children: objective.scenarioId,
      }),
    ],
  });

  return p.jsxs("div", {
    className: "space-y-4 text-sm",
    children: [
      scenarioSection,
      proseSection("Description", objective.description),
      proseSection("Expected behavior", objective.expectedBehavior),
      proseSection("Expected outcome", objective.expectedOutcome),
      bulletSection("Must include", objective.mustInclude),
      bulletSection("Must not include", objective.mustNotInclude),
      objective.expectedTools.length > 0
        ? p.jsxs("div", {
            children: [
              heading("Expected tools"),
              p.jsx("div", {
                className: "font-mono text-foreground",
                children: objective.expectedTools.join(", "),
              }),
            ],
          })
        : null,
    ],
  });
}
/**
 * Fixed bottom-left pair of buttons that smooth-scroll the window to the top
 * or to the bottom of the document.
 *
 * @returns The button stack element.
 */
function $ge() {
  // Both buttons share markup; only the label, glyph and scroll target differ.
  const jumpButton = (label, glyph, resolveTop) =>
    p.jsx("button", {
      type: "button",
      onClick: () => window.scrollTo({ top: resolveTop(), behavior: "smooth" }),
      className:
        "size-10 rounded-md border border-border bg-background shadow-lg hover:bg-secondary text-foreground flex items-center justify-center transition-colors",
      "aria-label": label,
      title: label,
      children: p.jsx("span", {
        className: "text-lg leading-none",
        children: glyph,
      }),
    });

  return p.jsxs("div", {
    className: "fixed bottom-4 left-4 z-40 flex flex-col gap-2",
    children: [
      jumpButton("Jump to top", "↑", () => 0),
      // Height is read at click time, so content added after render is included.
      jumpButton(
        "Jump to bottom",
        "↓",
        () => document.documentElement.scrollHeight,
      ),
    ],
  });
}
p.jsxs(p.Fragment,{children:[p.jsxs("div",{className:"fixed bottom-4 right-4 z-40 flex flex-col items-end gap-2",children:[p.jsx("button",{type:"button",onClick:()=>s(d=>d==="rubric"?null:"rubric"),className:`px-4 py-2 rounded-md border text-sm font-medium shadow-lg transition-colors ${l==="rubric"?"bg-primary text-primary-foreground border-primary":"bg-background border-border hover:bg-secondary"}`,children:"Rubric"}),p.jsx("button",{type:"button",onClick:()=>s(d=>d==="objective"?null:"objective"),className:`px-4 py-2 rounded-md border text-sm font-medium shadow-lg transition-colors ${l==="objective"?"bg-primary text-primary-foreground border-primary":"bg-background border-border hover:bg-secondary"}`,children:"Objective"}),u?p.jsxs("button",{type:"button",onClick:()=>s(d=>d==="tools"?null:"tools"),className:`px-4 py-2 rounded-md border text-sm font-medium shadow-lg transition-colors ${l==="tools"?"bg-primary text-primary-foreground border-primary":"bg-background border-border hover:bg-secondary"}`,children:["Tool calls (",n.length,")"]}):null]}),l!==null?p.jsxs("div",{className:"fixed top-0 right-0 bottom-0 z-50 border-l border-border bg-background shadow-2xl flex flex-col",style:{width:`${RP}px`},children:[p.jsxs("div",{className:"px-4 py-3 border-b border-border flex items-center gap-2",children:[p.jsxs("div",{className:"flex-1 flex items-center gap-1 flex-wrap",children:[p.jsx(f,{panelKey:"rubric",label:"Rubric"}),p.jsx(f,{panelKey:"objective",label:"Objective"}),u?p.jsx(f,{panelKey:"tools",label:p.jsxs(p.Fragment,{children:["Tool calls"," ",p.jsxs("span",{className:"text-muted-foreground font-normal",children:["· ",n.length]})]})}):null]}),p.jsx("button",{type:"button",onClick:()=>s(null),className:"text-muted-foreground hover:text-foreground text-lg leading-none px-1","aria-label":"Close panel",children:"×"})]}),p.jsxs("div",{className:"flex-1 min-h-0 overflow-y-auto overscroll-contain 
/**
 * Human-scoring page for one rubric dimension.
 *
 * Loads the rubric list and the next unscored item, renders the conversation
 * with one button per scale level, and posts the chosen score; the response's
 * `next` becomes the new current item. Digit keys act as shortcuts for the
 * scale levels.
 *
 * Keyboard handling is deliberately strict:
 *  - key presses with Ctrl/Meta/Alt held are ignored so browser and OS
 *    shortcuts (e.g. Ctrl+1 to switch tabs) never submit a score;
 *  - auto-repeat events from a held key are ignored;
 *  - only literal single-digit keys are accepted, because `Number(" ")` and
 *    `Number("")` both evaluate to 0 and would otherwise match a 0 level;
 *  - typing in form fields or contentEditable elements is left alone.
 *
 * @param {object} props
 * @param {string} props.rubricId - Rubric to score against.
 * @param {string} props.dimensionId - Dimension within that rubric.
 * @param {Function} props.request - `(url, init?) => Promise<json>` API helper.
 * @param {Function} props.navigate - Client-side navigation callback.
 * @returns The page element tree for the current load/queue state.
 */
function Fge({ rubricId, dimensionId, request, navigate }) {
  const [rubric, setRubric] = x.useState(null);
  const [dimension, setDimension] = x.useState(null);
  const [item, setItem] = x.useState(null);
  const [loading, setLoading] = x.useState(true);
  const [submitting, setSubmitting] = x.useState(false);
  const [error, setError] = x.useState(null);

  // Initial load (and reload whenever the rubric/dimension changes).
  x.useEffect(() => {
    let cancelled = false;
    setLoading(true);
    Promise.all([
      request("/api/human-scoring/rubrics"),
      request(
        `/api/human-scoring/next?rubric_id=${encodeURIComponent(rubricId)}&dimension_id=${encodeURIComponent(dimensionId)}`,
      ),
    ])
      .then(([rubricsPayload, nextPayload]) => {
        if (cancelled) return;
        const matchedRubric =
          rubricsPayload.rubrics.find((entry) => entry.rubricId === rubricId) ?? null;
        const matchedDimension =
          matchedRubric?.dimensions.find((entry) => entry.id === dimensionId) ?? null;
        setRubric(matchedRubric);
        setDimension(matchedDimension);
        setItem(nextPayload.item);
        setError(null);
        setLoading(false);
      })
      .catch((reason) => {
        if (cancelled) return;
        setError(reason instanceof Error ? reason.message : String(reason));
        setLoading(false);
      });
    return () => {
      cancelled = true;
    };
  }, [request, rubricId, dimensionId]);

  // Posts a score for the current item; no-op while nothing is loaded or a
  // submission is already in flight. Failures surface through `error`.
  const submitScore = x.useCallback(
    async (rawScore) => {
      if (!item || submitting) return;
      setSubmitting(true);
      setError(null);
      try {
        const response = await request(
          "/api/human-scoring/scores",
          Si("POST", {
            scenario_run_id: item.scenarioRunId,
            rubric_id: rubricId,
            dimension_id: dimensionId,
            raw_score: rawScore,
          }),
        );
        setItem(response.next);
      } catch (reason) {
        setError(reason instanceof Error ? reason.message : String(reason));
      } finally {
        setSubmitting(false);
      }
    },
    [request, item, submitting, rubricId, dimensionId],
  );

  // Digit-key shortcuts for the scale levels.
  x.useEffect(() => {
    if (!item || submitting || !dimension) return;
    const allowedValues = new Set(l1(dimension.scale).map((level) => level.value));

    function onKeyDown(event) {
      // Never hijack browser/OS shortcuts or key auto-repeat.
      if (event.ctrlKey || event.metaKey || event.altKey || event.repeat) return;
      const target = event.target;
      if (
        target instanceof HTMLElement &&
        (target.isContentEditable || /input|textarea|select/i.test(target.tagName))
      ) {
        return;
      }
      // Number(" ") === 0 and Number("") === 0, so require an actual digit key.
      if (!/^[0-9]$/.test(event.key)) return;
      const value = Number(event.key);
      if (!allowedValues.has(value)) return;
      event.preventDefault();
      void submitScore(value);
    }

    window.addEventListener("keydown", onKeyDown);
    return () => window.removeEventListener("keydown", onKeyDown);
  }, [item, dimension, submitting, submitScore]);

  if (loading) return p.jsx(Ci, { withMeta: true });
  if (error && !item) return p.jsx(yt, { message: error });
  if (!rubric || !dimension) {
    return p.jsxs(p.Fragment, {
      children: [
        p.jsx(yt, { message: "Unknown rubric or dimension." }),
        p.jsx(Le, {
          variant: "secondary",
          onClick: () => navigate("/score"),
          children: "Back to scoring",
        }),
      ],
    });
  }

  const eyebrow = `${rubric.rubricName} · ${dimension.name}`;

  if (!item) {
    return p.jsxs(p.Fragment, {
      children: [
        p.jsx(On, {
          eyebrow,
          title: "Queue empty",
          meta: "No more unscored chats for this dimension.",
        }),
        p.jsx("div", {
          className: "flex gap-2",
          children: p.jsx(Le, {
            onClick: () => navigate("/score"),
            children: "Back to scoring",
          }),
        }),
      ],
    });
  }

  const detail = zge(item);
  const levels = l1(dimension.scale);
  const objective = Dge(item);

  // Optional banner summarising the automated judge's score for this item.
  let judgeSummary = null;
  if (item.judgeDimensionRawScore !== null && item.judgeDimensionRawScore !== undefined) {
    judgeSummary = `Judge scored this dimension ${item.judgeDimensionRawScore}`;
    if (item.overallScore !== null && item.overallScore !== undefined) {
      judgeSummary += ` · overall ${item.overallScore.toFixed(2)}`;
    }
  }

  const runHref = `/runs/${encodeURIComponent(item.runId)}`;

  return p.jsxs(p.Fragment, {
    children: [
      p.jsx(On, {
        eyebrow,
        title: item.scenarioName,
        meta: p.jsxs("span", {
          children: [
            p.jsx("span", { className: "font-mono", children: item.scenarioId }),
            " · run ",
            p.jsx("a", { href: runHref, children: item.runId.slice(0, 8) }),
            " · ",
            p.jsx("a", {
              href: `${runHref}/scenarios/${item.ordinal}`,
              children: "detail",
            }),
            " · ",
            p.jsx("a", {
              href: "/score",
              onClick: (event) => {
                event.preventDefault();
                navigate("/score");
              },
              children: "back",
            }),
          ],
        }),
      }),
      p.jsxs("div", {
        className: "text-xs text-muted-foreground mb-3 font-mono",
        children: [
          p.jsx("strong", {
            className: "text-foreground text-base",
            children: item.remaining,
          }),
          " ",
          "remaining",
        ],
      }),
      judgeSummary
        ? p.jsx(rt, {
            className: "p-3 mb-4 border-l-4 border-l-muted-foreground/50",
            children: p.jsx("div", {
              className: "text-xs text-foreground",
              children: judgeSummary,
            }),
          })
        : null,
      error ? p.jsx(yt, { message: error }) : null,
      p.jsx(rt, {
        className: "p-4 mb-4",
        children: p.jsx(zS, { detail }),
      }),
      p.jsx("div", {
        className: "space-y-2 pb-24",
        children: levels.map((level) =>
          p.jsxs(
            "button",
            {
              type: "button",
              disabled: submitting,
              onClick: () => void submitScore(level.value),
              className:
                "w-full grid grid-cols-[56px_1fr] gap-3 items-center px-4 py-3 rounded-md border border-border bg-secondary/40 hover:border-primary hover:bg-secondary text-left disabled:opacity-60 disabled:cursor-progress transition-colors",
              children: [
                p.jsx("span", {
                  className: "text-2xl font-bold font-mono text-primary text-center",
                  children: level.value,
                }),
                p.jsx("span", {
                  className: "text-sm whitespace-pre-wrap",
                  children: level.label,
                }),
              ],
            },
            level.value,
          ),
        ),
      }),
      p.jsx($ge, {}),
      p.jsx(Uge, {
        objective,
        dimension,
        toolCalls: item.toolCalls ?? [],
        onSubmitScore: (value) => void submitScore(value),
        submitting,
      }),
    ],
  });
}
/**
 * Collapses a scenario record into one display status.
 *
 * An explicit "running" / "pending" status wins, then "error" /
 * "runtime_error" (both reported as "error"), then the boolean `passed`
 * flag. Anything else is reported as "pending".
 *
 * @param {{status?: string, passed?: boolean|null}} e - Scenario record.
 * @returns {"running"|"pending"|"error"|"pass"|"fail"}
 */
function qge(e) {
  const { status, passed } = e;
  if (status === "running" || status === "pending") return status;
  if (status === "error" || status === "runtime_error") return "error";
  if (passed === true) return "pass";
  if (passed === false) return "fail";
  return "pending";
}

/**
 * Turns an arbitrary error payload into display text.
 *
 * @param {*} e - Falsy value, string, object (optionally with a string
 *   `message`) or any other primitive.
 * @returns {string|null} `null` for falsy input; the string itself; an
 *   object's string `message` or else its JSON form; `String(e)` otherwise.
 */
function Vge(e) {
  if (!e) return null;
  switch (typeof e) {
    case "string":
      return e;
    case "object": {
      const message = e.message;
      return typeof message === "string" ? message : JSON.stringify(e);
    }
    default:
      return String(e);
  }
}

/**
 * Normalizes one judge dimension score record into fixed field types.
 *
 * Ids and names become strings (the name falls back to the id), `raw_score`
 * becomes a number, the optional numeric fields become a number or `null`,
 * `reasoning` defaults to "" and `evidence` becomes an array of strings.
 *
 * @param {object} e - Raw dimension score record.
 * @returns {object} Normalized record with the same snake_case keys.
 */
function Kge(e) {
  // Nullish stays null; everything else is coerced with Number().
  const numberOrNull = (value) => (value == null ? null : Number(value));
  const rawScore = typeof e.raw_score === "number" ? e.raw_score : Number(e.raw_score);
  const evidence = Array.isArray(e.evidence)
    ? e.evidence.map((entry) => String(entry))
    : [];

  return {
    dimension_id: String(e.dimension_id ?? ""),
    dimension_name: String(e.dimension_name ?? e.dimension_id ?? ""),
    raw_score: rawScore,
    scale_points: numberOrNull(e.scale_points),
    normalized_score: numberOrNull(e.normalized_score),
    weight: numberOrNull(e.weight),
    reasoning: typeof e.reasoning === "string" ? e.reasoning : "",
    evidence,
  };
}
/**
 * Converts a run record into the summary shape consumed by the run-detail
 * widgets: one row per scenario, per-ordinal details, and aggregate counters.
 *
 * Relies on `qge`, `Vge`, `Ip`, `f5` and `Hge`, defined elsewhere in this bundle.
 *
 * @param {object} e - Run record with `scenarios`, `aggregateCounts`,
 *   `startedAt` and `completedAt`.
 * @returns {object} `{total, elapsed, passed, failed, errored, running, done,
 *   all_done, scenarios, details, averages}`.
 */
function Yge(e) {
  const scenarios = e.scenarios.map((scenario) => ({
    scenario_id: scenario.scenarioId,
    scenario_name: scenario.scenarioName,
    status: qge(scenario),
    score: scenario.overallScore ?? null,
    error: Vge(scenario.error),
    started_at: Ip(scenario.startedAt),
    finished_at: Ip(scenario.completedAt),
  }));

  const details = {};
  for (const scenario of e.scenarios) details[scenario.ordinal] = f5(scenario);

  const running = scenarios.filter((row) => row.status === "running").length;
  const done = scenarios.filter(
    (row) => row.status !== "running" && row.status !== "pending",
  ).length;
  const counts = e.aggregateCounts;

  return {
    // Fall back to the row count when the aggregate total is missing or 0.
    total: counts.scenarioTotal || scenarios.length,
    elapsed: Hge(e.startedAt, e.completedAt),
    passed: counts.scenarioPassedCount,
    failed: counts.scenarioFailedCount,
    errored: counts.scenarioErroredCount,
    running,
    done,
    // NOTE(review): this is also true while scenarios are pending but none is
    // running yet; confirm that is what the consumers expect.
    all_done: !!e.completedAt || running === 0,
    scenarios,
    details,
    averages: [],
  };
}

/**
 * Minimal history-API router hook.
 *
 * @returns {{pathname: string, navigate: (to: string) => void}} The current
 *   `window.location.pathname` (kept in sync on `popstate`) and a `navigate`
 *   function that pushes a history entry and updates the pathname.
 */
function Gge() {
  const [pathname, setPathname] = x.useState(window.location.pathname);

  x.useEffect(() => {
    const syncFromLocation = () => setPathname(window.location.pathname);
    window.addEventListener("popstate", syncFromLocation);
    return () => window.removeEventListener("popstate", syncFromLocation);
  }, []);

  const navigate = x.useCallback((to) => {
    window.history.pushState({}, "", to);
    setPathname(window.location.pathname);
  }, []);

  return { pathname, navigate };
}

/**
 * Installs a document-level click handler that routes same-origin `<a>`
 * clicks through the client-side `navigate` callback instead of a full page
 * load.
 *
 * A click is left to the browser when any of these holds:
 *  - another handler already called `preventDefault()`;
 *  - it is not a plain primary-button click (Ctrl/Cmd/Shift/Alt held, or a
 *    non-primary button), so "open in new tab/window" keeps working;
 *  - the link has a `target` or a `download` attribute;
 *  - the href is not an absolute path, is protocol-relative ("//…"), or
 *    points at "/api/".
 *
 * @param {(href: string) => void} e - Navigation callback.
 */
function Wge(e) {
  x.useEffect(() => {
    const onDocumentClick = (event) => {
      if (event.defaultPrevented || !(event.target instanceof Element)) return;
      // Respect modified / non-primary clicks: the user asked for a new tab,
      // a new window or a download, not in-app navigation.
      if ((event.button ?? 0) !== 0) return;
      if (event.metaKey || event.ctrlKey || event.shiftKey || event.altKey) return;

      const anchor = event.target.closest("a");
      if (!anchor) return;
      if (anchor.target || anchor.hasAttribute("download")) return;

      const href = anchor.getAttribute("href");
      if (!href?.startsWith("/") || href.startsWith("//") || href.startsWith("/api/")) return;

      event.preventDefault();
      e(href);
    };
    document.addEventListener("click", onDocumentClick);
    return () => document.removeEventListener("click", onDocumentClick);
  }, [e]);
}

/**
 * Formats a timestamp for display in the user's locale.
 *
 * @param {*} e - Date string or other value accepted by `new Date()`.
 * @returns {*} "—" for falsy input, the locale string for a valid date, and
 *   the input unchanged when it cannot be parsed as a date.
 */
function IS(e) {
  if (!e) return "—";
  try {
    const parsed = new Date(e);
    // An unparseable value yields an "Invalid Date" object rather than an
    // exception, so check explicitly and show the raw value instead.
    return Number.isNaN(parsed.getTime()) ? e : parsed.toLocaleString();
  } catch {
    return e;
  }
}

/** Maximum number of runs that can be selected for comparison at once. */
const xc = 10;
S})},s=e.map(h=>h.runId).filter(h=>r.has(h)),u=s.length<2||s.length>xc,f=s.length===0?"Pick 2+ runs to compare":s.length===1?"Pick at least one more run":s.length>xc?`Maximum ${xc} runs at a time`:`${s.length} runs selected`,d=`/compare?run_ids=${s.join(",")}`,g=()=>{u||(t?t(d):(window.history.pushState({},"",d),window.dispatchEvent(new PopStateEvent("popstate"))))};return p.jsxs(p.Fragment,{children:[n?p.jsxs("div",{className:"flex items-center gap-3 mb-3",children:[p.jsx(Le,{variant:"primary",size:"sm",onClick:g,disabled:u,children:"Compare selected"}),p.jsx("span",{className:"text-xs text-muted-foreground",children:f}),s.length>0?p.jsx(Le,{variant:"ghost",size:"sm",onClick:()=>i(new Set),children:"Clear"}):null]}):null,p.jsx(rt,{className:"overflow-hidden",children:p.jsx("div",{className:"overflow-x-auto",children:p.jsxs("table",{className:"w-full text-sm",children:[p.jsx("thead",{className:"bg-secondary",children:p.jsxs("tr",{className:"text-left text-muted-foreground text-xs uppercase tracking-wider",children:[n?p.jsx("th",{className:"px-3 py-2 w-8"}):null,p.jsx("th",{className:"px-3 py-2",children:"Name"}),p.jsx("th",{className:"px-3 py-2",children:"Status"}),p.jsx("th",{className:"px-3 py-2",children:"Preset"}),p.jsx("th",{className:"px-3 py-2",children:"Started"}),p.jsx("th",{className:"px-3 py-2 text-right",children:"Pass / Total"})]})}),p.jsx("tbody",{className:"divide-y divide-border",children:e.map(h=>{const v=r.has(h.runId),b=!v&&s.length>=xc;return p.jsxs("tr",{className:"hover:bg-secondary",children:[n?p.jsx("td",{className:"px-3 py-2 align-middle",children:p.jsx(bi,{checked:v,onChange:S=>{b&&S||l(h.runId,S)}})}):null,p.jsx("td",{className:"px-3 py-2",children:p.jsx("a",{href:`/runs/${encodeURIComponent(h.runId)}`,className:"text-foreground hover:text-primary",children:h.label?p.jsx("span",{className:"font-medium",children:h.label}):p.jsxs("span",{className:"font-mono text-xs 
/**
 * Overview page: headline stats plus the five most recent runs.
 *
 * Fetches `/api/runs?limit=5` and `/api/suites` together. Renders an error
 * banner if the load fails, skeleton placeholders until both responses have
 * arrived, and otherwise the stat tiles and the runs table.
 *
 * @param {object} props
 * @param {Function} props.request - `(url) => Promise<json>` API helper.
 * @param {Function} props.navigate - Client-side navigation callback, passed
 *   through to the runs table.
 * @returns The page element tree.
 */
function Xge({ request, navigate }) {
  const [recentRuns, setRecentRuns] = x.useState(null);
  const [suiteIndex, setSuiteIndex] = x.useState(null);
  const [loadError, setLoadError] = x.useState(null);

  x.useEffect(() => {
    let disposed = false;
    Promise.all([request("/api/runs?limit=5"), request("/api/suites")])
      .then(([runsPayload, suitesPayload]) => {
        if (disposed) return;
        setRecentRuns(runsPayload);
        setSuiteIndex(suitesPayload);
        setLoadError(null);
      })
      .catch((reason) => {
        if (disposed) return;
        setLoadError(reason instanceof Error ? reason.message : String(reason));
      });
    return () => {
      disposed = true;
    };
  }, [request]);

  const latestRunsHeading = () =>
    p.jsx("div", {
      className:
        "text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",
      children: "Latest Runs",
    });

  if (loadError) return p.jsx(yt, { message: loadError });

  if (!recentRuns || !suiteIndex) {
    return p.jsxs(p.Fragment, {
      children: [
        p.jsx(Ci, {}),
        p.jsx(JR, {}),
        latestRunsHeading(),
        p.jsx(Zp, { rows: 5 }),
      ],
    });
  }

  // Only strict true / false count; runs without a verdict are in neither bucket.
  let passedCount = 0;
  let failedCount = 0;
  for (const run of recentRuns.runs) {
    if (run.passed === true) passedCount += 1;
    else if (run.passed === false) failedCount += 1;
  }

  return p.jsxs(p.Fragment, {
    children: [
      p.jsx(On, { eyebrow: "Overview", title: "AgentProbe" }),
      p.jsxs("div", {
        className: "grid grid-cols-2 sm:grid-cols-4 gap-3 mb-6",
        children: [
          p.jsx(yn, { label: "Total Runs", value: recentRuns.total }),
          p.jsx(yn, { label: "Recent Passed", tone: "success", value: passedCount }),
          p.jsx(yn, { label: "Recent Failed", tone: "danger", value: failedCount }),
          p.jsx(yn, { label: "Suites", tone: "accent", value: suiteIndex.suites.length }),
        ],
      }),
      latestRunsHeading(),
      p.jsx(d5, { runs: recentRuns.runs, navigate }),
    ],
  });
}

/** Page size used when listing runs. */
const DP = 50;
e(`/api/runs?limit=${DP}&offset=0`).then(E=>{j||(r(E.runs),l(E.total),u(E.next_cursor??null),h(!0),b(null))}).catch(E=>{j||b(E instanceof Error?E.message:String(E))}),()=>{j=!0}},[e]);const S=x.useCallback(()=>{!s||f||(d(!0),e(`/api/runs?limit=${DP}&offset=${encodeURIComponent(s)}`).then(j=>{r(E=>[...E,...j.runs]),l(j.total),u(j.next_cursor??null),b(null)}).catch(j=>{b(j instanceof Error?j.message:String(j))}).finally(()=>{d(!1)}))},[s,f,e]);return v&&!g?p.jsx(yt,{message:v}):g?p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"History",title:"Runs",meta:`${i} total`}),v?p.jsx(yt,{message:v}):null,p.jsx(d5,{runs:n,navigate:t}),s?p.jsx("div",{className:"mt-4 flex justify-center",children:p.jsx(Le,{variant:"secondary",onClick:S,disabled:f,children:f?"Loading...":`Load more (${n.length} of ${i})`})}):null]}):p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Zp,{rows:8})]})}function Zge({runId:e,request:t,navigate:n}){const[r,i]=x.useState(null),[l,s]=x.useState(null),[u,f]=x.useState(!1),[d,g]=x.useState(!1),[h,v]=x.useState(""),[b,S]=x.useState(""),[j,E]=x.useState(!1),[O,C]=x.useState(null),_=x.useRef(t),P=x.useRef(e),T=x.useRef(!0);_.current=t,P.current=e,x.useEffect(()=>(T.current=!0,()=>{T.current=!1}),[]);const k=x.useCallback(async()=>{const J=e;try{const G=await _.current(`/api/runs/${encodeURIComponent(J)}`);if(!T.current||P.current!==J)return;i(G.run),s(null)}catch(G){if(!T.current||P.current!==J)return;s(G instanceof Error?G.message:String(G))}},[e]),I=x.useRef(k);I.current=k,x.useEffect(()=>{i(null),s(null),k()},[k]),x.useEffect(()=>{const J=new EventSource(`/api/runs/${encodeURIComponent(e)}/events`),G=()=>{I.current()},Q=()=>{G(),J.close()};return 
J.addEventListener("snapshot",G),J.addEventListener("suite_started",G),J.addEventListener("scenario_started",G),J.addEventListener("scenario_finished",G),J.addEventListener("scenario_error",G),J.addEventListener("run_finished",Q),J.addEventListener("run_cancelled",Q),J.addEventListener("run_error",Q),()=>J.close()},[e]);const R=async()=>{f(!0),s(null);try{await t(`/api/runs/${encodeURIComponent(e)}/cancel`,{method:"POST"}),await k()}catch(J){s(J instanceof Error?J.message:String(J))}finally{f(!1)}},F=x.useMemo(()=>r?Yge(r):null,[r]),D=x.useCallback(J=>{!n||!r||n(`/runs/${encodeURIComponent(r.runId)}/scenarios/${J}`)},[n,r]),H=()=>{C(null);const J=r?.label??r?.preset??`Run ${r?.runId.slice(0,8)??""}`.trim();v(J),S(""),g(!0)},$=async()=>{if(!r)return;const J=h.trim();if(J.length===0){C("Name is required.");return}E(!0),C(null);try{const G=await t(`/api/runs/${encodeURIComponent(r.runId)}/save-as-preset`,{method:"POST",headers:{"content-type":"application/json"},body:JSON.stringify({name:J,description:b.trim()||null})});g(!1),n&&G?.preset?.id&&n(`/presets/${encodeURIComponent(G.preset.id)}`)}catch(G){C(G instanceof Error?G.message:String(G))}finally{E(!1)}};return l?p.jsx(yt,{message:l}):!r||!F?p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(JR,{}),p.jsx(Qr,{lines:4,className:"mb-4"}),p.jsx(Zp,{rows:6,selectable:!1})]}):p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:r.presetId?p.jsxs("span",{children:["Run from preset"," ",p.jsx("a",{href:`/presets/${encodeURIComponent(r.presetId)}`,className:"text-primary hover:underline",children:r.preset??r.presetId})]}):"Run",title:p.jsx("span",{className:"font-mono text-base text-muted-foreground break-all",children:r.runId}),meta:p.jsxs("span",{children:["Started ",IS(r.startedAt)," · trigger ",r.trigger??"—"]}),actions:p.jsxs(p.Fragment,{children:[r.status==="running"&&p.jsx(Le,{variant:"secondary",onClick:()=>void 
R(),disabled:u,children:u?"Cancelling…":"Cancel"}),p.jsx(Le,{variant:"secondary",onClick:H,disabled:!r.scenarios||r.scenarios.length===0,title:"Create a preset that reuses this run's endpoint, personas, rubric, and scenarios",children:"Save as preset"}),p.jsx("a",{href:`/api/runs/${encodeURIComponent(r.runId)}/report.html`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-secondary text-foreground border-border hover:bg-primary hover:border-border no-underline",children:"HTML report"})]})}),p.jsx(M1,{open:d,onClose:()=>{j||g(!1)},title:"Save as preset",description:"Captures this run's endpoint, personas, rubric, and the exact scenario selection so you can re-run it later.",size:"md",footer:p.jsxs("div",{className:"flex items-center justify-end gap-2",children:[p.jsx(Le,{variant:"ghost",onClick:()=>g(!1),disabled:j,children:"Cancel"}),p.jsx(Le,{variant:"primary",onClick:()=>void $(),disabled:j||h.trim().length===0,children:j?"Saving…":"Save preset"})]}),children:p.jsxs("div",{className:"flex flex-col gap-4 py-2",children:[O?p.jsx(yt,{message:O}):null,p.jsx(Xe,{label:"Name",htmlFor:"preset-from-run-name",children:p.jsx(Kt,{id:"preset-from-run-name",value:h,onChange:J=>v(J.target.value),placeholder:"e.g. 
/**
 * Detail page for a single scenario of a run.
 *
 * Fetches `/api/runs/:runId/scenarios/:ordinal`, then renders the
 * conversation and rubric side by side, plus an "Eval scores" card when
 * `t5(detail)` is truthy. While loading it shows skeletons; on failure it
 * shows the error message.
 *
 * @param {object} props
 * @param {string} props.runId - Run identifier.
 * @param {string|number} props.ordinal - Scenario ordinal within the run.
 * @param {Function} props.request - `(url) => Promise<json>` API helper.
 * @returns The page element tree.
 */
function Jge({ runId, ordinal, request }) {
  const [payload, setPayload] = x.useState(null);
  const [loadError, setLoadError] = x.useState(null);

  x.useEffect(() => {
    let stale = false;
    const url = `/api/runs/${encodeURIComponent(runId)}/scenarios/${encodeURIComponent(ordinal)}`;
    request(url)
      .then((response) => {
        if (stale) return;
        setPayload(response);
        setLoadError(null);
      })
      .catch((reason) => {
        if (stale) return;
        setLoadError(reason instanceof Error ? reason.message : String(reason));
      });
    return () => {
      stale = true;
    };
  }, [request, runId, ordinal]);

  if (loadError) return p.jsx(yt, { message: loadError });

  if (!payload) {
    return p.jsxs(p.Fragment, {
      children: [
        p.jsx(Ci, { withMeta: true }),
        p.jsx(Qr, { lines: 4, className: "mb-3" }),
        p.jsx(Qr, { lines: 6 }),
      ],
    });
  }

  const detail = f5(payload.scenario);
  const cardTitle = (text) =>
    p.jsx("div", {
      className:
        "text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-3",
      children: text,
    });

  const backLink = p.jsxs("a", {
    href: `/runs/${encodeURIComponent(payload.run.runId)}`,
    className: "text-primary hover:underline",
    children: [
      "← Back to run",
      " ",
      p.jsxs("span", {
        className: "font-mono",
        children: [payload.run.runId.slice(0, 12), "…"],
      }),
    ],
  });

  // The status badge expects a run-shaped object, so present this scenario
  // as a one-scenario run.
  const badgeRun = {
    ...payload.run,
    exitCode: null,
    preset: null,
    aggregateCounts: {
      scenarioTotal: 1,
      scenarioPassedCount: detail.passed ? 1 : 0,
      scenarioFailedCount: detail.passed ? 0 : 1,
      scenarioErroredCount: detail.status === "error" ? 1 : 0,
    },
  };

  return p.jsxs(p.Fragment, {
    children: [
      p.jsx(On, {
        eyebrow: backLink,
        title: detail.scenario_name,
        meta: p.jsxs("span", {
          children: ["Scenario #", ordinal, " · ", detail.scenario_id],
        }),
        actions: p.jsx(Qp, { run: badgeRun }),
      }),
      p.jsxs("div", {
        className:
          "grid grid-cols-1 lg:grid-cols-[minmax(0,1fr)_minmax(320px,0.65fr)] gap-4",
        children: [
          p.jsxs(rt, {
            className: "p-4",
            children: [cardTitle("Conversation"), p.jsx(zS, { detail })],
          }),
          p.jsxs(rt, {
            className: "p-4",
            children: [cardTitle("Rubric"), p.jsx(r5, { detail })],
          }),
        ],
      }),
      t5(detail) &&
        p.jsxs(rt, {
          className: "mt-4 p-4",
          children: [cardTitle("Eval scores"), p.jsx(e5, { detail })],
        }),
    ],
  });
}
text-muted-foreground break-all",children:u.relativePath}),p.jsx("td",{className:"px-3 py-2 text-right font-mono",children:u.objectCount})]},u.id))})]})})}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Scenarios"}),p.jsx(rt,{className:"overflow-hidden",children:p.jsx("div",{className:"overflow-x-auto",children:p.jsxs("table",{className:"w-full text-sm",children:[p.jsx("thead",{className:"bg-secondary",children:p.jsxs("tr",{className:"text-left text-muted-foreground text-xs uppercase tracking-wider",children:[p.jsx("th",{className:"px-3 py-2",children:"Scenario"}),p.jsx("th",{className:"px-3 py-2",children:"Name"}),p.jsx("th",{className:"px-3 py-2",children:"Suite"}),p.jsx("th",{className:"px-3 py-2",children:"Tags"}),p.jsx("th",{className:"px-3 py-2",children:"Rubric"})]})}),p.jsx("tbody",{className:"divide-y divide-border",children:r.scenarios.map(u=>p.jsxs("tr",{className:"hover:bg-secondary",children:[p.jsx("td",{className:"px-3 py-2 font-mono text-xs",children:u.id}),p.jsx("td",{className:"px-3 py-2",children:u.name}),p.jsx("td",{className:"px-3 py-2 text-muted-foreground",children:u.suiteId}),p.jsx("td",{className:"px-3 py-2",children:p.jsxs("div",{className:"flex flex-wrap gap-1",children:[u.tags.map(f=>p.jsx(Pt,{children:f},f)),u.tags.length===0?p.jsx("span",{className:"text-muted-foreground/70",children:"—"}):null]})}),p.jsx("td",{className:"px-3 py-2 text-muted-foreground",children:u.rubric??"—"})]},`${u.suiteId}:${u.id}`))})]})})})]})}function tve({request:e,navigate:t}){const[n,r]=x.useState(null),[i,l]=x.useState(null),[s,u]=x.useState(null),[f,d]=x.useState(""),[g,h]=x.useState(""),[v,b]=x.useState(""),[S,j]=x.useState(""),[E,O]=x.useState(new Set),[C,_]=x.useState(""),[P,T]=x.useState(!1),[k,I]=x.useState(2),[R,F]=x.useState(1),[D,H]=x.useState(!0),[$,J]=x.useState(!1),[G,Q]=x.useState(""),[B,Y]=x.useState(null),[ie,ae]=x.useState(!1),[M,L]=x.useState(null);x.useEffect(()=>{let 
W=!1;return Promise.all([e("/api/suites"),e("/api/scenarios"),e("/api/presets")]).then(([ge,ne,se])=>{W||(r(ge),l(ne),u(se),d(ge.suites.find(ye=>ye.schema==="endpoints")?.relativePath??""),h(ge.suites.find(ye=>ye.schema==="personas")?.relativePath??""),b(ge.suites.find(ye=>ye.schema==="rubrics")?.relativePath??""))}).catch(ge=>{W||Y(ge instanceof Error?ge.message:String(ge))}),()=>{W=!0}},[e]);const te=x.useMemo(()=>i?i.scenarios.filter(W=>E.has(`${W.sourcePath}::${W.id}`)).map(W=>({file:W.sourcePath,id:W.id})):[],[i,E]);if(B)return p.jsx(yt,{message:B});if(!n||!i||!s)return p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Qr,{lines:6,className:"mb-3"}),p.jsx(Qr,{lines:4})]});const z=n.suites.filter(W=>W.schema==="endpoints"),he=n.suites.filter(W=>W.schema==="personas"),ve=n.suites.filter(W=>W.schema==="rubrics"),pe=async W=>{W.preventDefault(),ae(!0),Y(null);try{const ge={enabled:P,limit:P?k:void 0},ne=S?await e(`/api/presets/${encodeURIComponent(S)}/runs`,Si("POST",{label:C||void 0,overrides:{parallel:ge,repeat:R,dry_run:D}})):await e("/api/runs",Si("POST",{endpoint:f,personas:g,rubric:v,selection:te,parallel:ge,repeat:R,dry_run:D,label:C||void 0,save_as_preset:$&&G.trim()?{name:G.trim()}:void 0}));t(`/runs/${encodeURIComponent(ne.run_id)}`)}catch(ge){Y(ge instanceof Error?ge.message:String(ge))}finally{ae(!1)}};return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Start",title:"Run builder",meta:S?"Launching from preset — overrides only":`${E.size} scenario${E.size===1?"":"s"} selected`,actions:p.jsx(Le,{onClick:W=>pe(W),disabled:ie,children:ie?"Starting…":"Start run"})}),B?p.jsx(yt,{message:B}):null,p.jsxs("form",{onSubmit:pe,className:"flex flex-col gap-4",children:[p.jsxs(rt,{className:"p-4 flex flex-col gap-3",children:[p.jsx(Xe,{label:"Preset",children:p.jsx(sr,{value:S||"__adhoc__",onValueChange:W=>j(W==="__adhoc__"?"":W),options:[{value:"__adhoc__",label:"Ad-hoc (build from 
scratch)"},...s.presets.map(W=>({value:W.id,label:W.name}))]})}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Endpoint",children:p.jsx(sr,{value:f,onValueChange:d,disabled:!!S,options:z.map(W=>({value:W.relativePath,label:W.relativePath})),emptyLabel:"No endpoint suites found"})}),p.jsx(Xe,{label:"Personas",children:p.jsx(sr,{value:g,onValueChange:h,disabled:!!S,options:he.map(W=>({value:W.relativePath,label:W.relativePath})),emptyLabel:"No persona suites found"})}),p.jsx(Xe,{label:"Rubric",children:p.jsx(sr,{value:v,onValueChange:b,disabled:!!S,options:ve.map(W=>({value:W.relativePath,label:W.relativePath})),emptyLabel:"No rubric suites found"})})]})]}),!S&&p.jsxs(rt,{className:"overflow-hidden",children:[p.jsxs("div",{className:"p-3 border-b border-border flex items-center justify-between",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold",children:"Scenarios"}),p.jsxs("div",{className:"text-xs text-muted-foreground mt-0.5",children:[i.scenarios.length," available"]})]}),p.jsx(Le,{variant:"secondary",size:"sm",onClick:()=>O(new Set(i.scenarios.map(W=>`${W.sourcePath}::${W.id}`))),children:"Select all"})]}),p.jsx("div",{className:"max-h-[420px] overflow-y-auto divide-y divide-border",children:i.scenarios.map(W=>{const ge=`${W.sourcePath}::${W.id}`,ne=E.has(ge);return p.jsxs("div",{className:`flex items-start gap-3 px-3 py-2.5 hover:bg-secondary ${ne?"bg-primary/5":""}`,children:[p.jsxs("label",{className:"flex items-start gap-3 flex-1 min-w-0 cursor-pointer",children:[p.jsx("input",{type:"checkbox",checked:ne,onChange:se=>{const ye=new Set(E);se.currentTarget.checked?ye.add(ge):ye.delete(ge),O(ye)},className:"size-4 mt-0.5 accent-primary shrink-0"}),p.jsxs("div",{className:"flex-1 min-w-0",children:[p.jsxs("div",{className:"flex items-baseline gap-2 flex-wrap",children:[p.jsx("span",{className:"text-sm font-medium 
text-foreground",children:W.name||W.id}),p.jsx("span",{className:"font-mono text-[11px] text-muted-foreground",children:W.id}),W.priority?p.jsx(Pt,{tone:"info",children:W.priority}):null]}),W.description?p.jsx("div",{className:"text-xs text-muted-foreground mt-0.5 line-clamp-2",children:W.description}):null,p.jsxs("div",{className:"flex items-center gap-1.5 mt-1 flex-wrap",children:[W.tags.slice(0,5).map(se=>p.jsx(Pt,{children:se},se)),p.jsx("span",{className:"font-mono text-[10px] text-muted-foreground/70",children:W.sourcePath})]})]})]}),p.jsx(Le,{type:"button",variant:"ghost",size:"sm",className:"shrink-0 self-start",onClick:()=>L({file:W.sourcePath,id:W.id,name:W.name,description:W.description,tags:W.tags,priority:W.priority}),children:"Details"})]},ge)})})]}),p.jsxs(rt,{className:"p-4 flex flex-col gap-3",children:[p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Label",hint:"Shown in the run list.",children:p.jsx(Kt,{value:C,onChange:W=>_(W.currentTarget.value),maxLength:200})}),p.jsx(Xe,{label:"Repeat",children:p.jsx(Kt,{type:"number",min:1,value:R,onChange:W=>F(Number(W.currentTarget.value))})}),p.jsx(Xe,{label:"Parallel limit",hint:"Max concurrent scenarios when parallel is on. 2-4 is typical; higher = faster but more LLM cost spikes.",children:p.jsx(Kt,{type:"number",min:1,value:k,onChange:W=>I(Number(W.currentTarget.value)),disabled:!P})})]}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3",children:[p.jsx(Xe,{label:"Dry run",hint:"Records run + scenario rows but skips the live adapter, judge, and scorers. Use to validate config without spending LLM tokens.",children:p.jsx(bi,{checked:D,onChange:H,label:"Enabled"})}),p.jsx(Xe,{label:"Parallel",hint:"Run multiple scenarios concurrently. 
Scenarios still complete in order, but several run at a time (set the limit above).",children:p.jsx(bi,{checked:P,onChange:T,label:"Enabled"})}),S?null:p.jsx(Xe,{label:"Save as preset",hint:"Save this scenario selection + settings as a reusable preset visible on the Presets page.",children:p.jsx(bi,{checked:$,onChange:J,label:"Enabled"})})]}),$&&!S?p.jsx(Xe,{label:"Preset name",children:p.jsx(Kt,{value:G,onChange:W=>Q(W.currentTarget.value),placeholder:"e.g. Smoke suite"})}):null]}),p.jsx("div",{className:"flex justify-end",children:p.jsx(Le,{type:"submit",disabled:ie,children:ie?"Starting…":"Start run"})})]}),p.jsx(u5,{open:M!=null,target:M,request:e,onClose:()=>L(null)})]})}function h5(e){return{presetId:e.id,presetName:e.name,defaults:{endpoint:e.endpoint,personas:e.personas,rubric:e.rubric,parallelEnabled:e.parallel.enabled,parallelLimit:e.parallel.limit,repeat:e.repeat,dryRun:e.dry_run}}}function p5(e){const t=e.toLowerCase();return t.includes("autogpt")?"autogpt":t.includes("openclaw")?"openclaw":t.includes("opencode")?"opencode":"custom"}function nve({request:e,navigate:t}){const[n,r]=x.useState(null),[i,l]=x.useState(null),[s,u]=x.useState(null),[f,d]=x.useState(null);return x.useEffect(()=>{let g=!1;return Promise.all([e("/api/presets"),e("/api/suites")]).then(([h,v])=>{g||(r(h),l(v))}).catch(h=>{g||u(h instanceof Error?h.message:String(h))}),()=>{g=!0}},[e]),s?p.jsx(yt,{message:s}):n?p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Presets",title:"Saved Configurations",meta:`${n.presets.length} preset${n.presets.length===1?"":"s"}`,actions:p.jsx("a",{href:"/start",className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-primary text-background border-primary hover:bg-primary/90 hover:border-primary no-underline",children:"New preset"})}),n.presets.length===0?p.jsx(Jp,{title:"No presets yet",description:"Build a run on the Start tab and save it as a preset to make it 
repeatable.",action:p.jsx("a",{href:"/start",className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-primary text-background border-primary hover:bg-primary/90 hover:border-primary no-underline",children:"Build your first preset"})}):p.jsx("div",{className:"grid grid-cols-1 md:grid-cols-2 xl:grid-cols-3 gap-3",children:n.presets.map(g=>{const h=p5(g.endpoint);return p.jsxs(rt,{className:"p-4 hover:border-border transition-colors flex flex-col",children:[p.jsxs("div",{className:"flex items-start justify-between gap-2 mb-2",children:[p.jsx("a",{href:`/presets/${encodeURIComponent(g.id)}`,className:"text-base font-semibold text-foreground hover:text-primary no-underline truncate",title:g.name,children:g.name}),p.jsx(Pt,{tone:h==="custom"?"default":"info",children:h})]}),g.description?p.jsx("div",{className:"text-sm text-muted-foreground mb-3 line-clamp-2",children:g.description}):null,p.jsxs("div",{className:"grid grid-cols-3 gap-2 mb-3 text-xs",children:[p.jsxs("div",{children:[p.jsx("div",{className:"text-muted-foreground/70 uppercase tracking-wider text-[10px]",children:"Scenarios"}),p.jsx("div",{className:"font-mono text-foreground",children:g.selection.length})]}),p.jsxs("div",{children:[p.jsx("div",{className:"text-muted-foreground/70 uppercase tracking-wider text-[10px]",children:"Repeat"}),p.jsx("div",{className:"font-mono text-foreground",children:g.repeat})]}),p.jsxs("div",{children:[p.jsx("div",{className:"text-muted-foreground/70 uppercase tracking-wider text-[10px]",children:"Parallel"}),p.jsx("div",{className:"font-mono text-foreground",children:g.parallel.enabled?`×${g.parallel.limit??"?"}`:"off"})]})]}),p.jsx("div",{className:"text-xs text-muted-foreground mb-3 flex items-center gap-2 min-h-[1.25rem]",children:g.last_run?p.jsxs(p.Fragment,{children:[p.jsx(Qp,{run:g.last_run}),p.jsx("span",{children:IS(g.last_run.startedAt)})]}):p.jsx("span",{className:"italic 
text-muted-foreground/70",children:"Never run"})}),p.jsxs("div",{className:"flex items-center gap-2 mt-auto pt-3 border-t border-border",children:[p.jsx(Le,{size:"sm",onClick:()=>d(h5(g)),children:"Launch run"}),p.jsx("a",{href:`/presets/${encodeURIComponent(g.id)}/edit`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-2.5 py-1 text-xs bg-secondary text-foreground border-border hover:bg-primary hover:border-border no-underline",children:"Edit"}),p.jsx("a",{href:`/presets/${encodeURIComponent(g.id)}`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-2.5 py-1 text-xs bg-transparent text-muted-foreground border-transparent hover:bg-secondary hover:text-foreground no-underline",children:"History"})]})]},g.id)})}),p.jsx(c5,{open:f!=null,options:f,request:e,suites:i,onClose:()=>d(null),onLaunched:g=>{d(null),t(`/runs/${encodeURIComponent(g)}`)}})]}):p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 lg:grid-cols-3 gap-3",children:[p.jsx(Qr,{lines:4}),p.jsx(Qr,{lines:4}),p.jsx(Qr,{lines:4})]})]})}function rve({presetId:e,request:t,navigate:n}){const[r,i]=x.useState(null),[l,s]=x.useState(null),[u,f]=x.useState(null),[d,g]=x.useState(null),[h,v]=x.useState(null);x.useEffect(()=>{let j=!1;return Promise.all([t(`/api/presets/${encodeURIComponent(e)}`),t(`/api/presets/${encodeURIComponent(e)}/runs`),t("/api/suites")]).then(([E,O,C])=>{j||(i(E),s(O),f(C))}).catch(E=>{j||g(E instanceof Error?E.message:String(E))}),()=>{j=!0}},[t,e]);const b=async()=>{if(confirm("Delete this preset? 
Past runs will remain in history."))try{await t(`/api/presets/${encodeURIComponent(e)}`,{method:"DELETE"}),n("/presets")}catch(j){g(j instanceof Error?j.message:String(j))}};if(d)return p.jsx(yt,{message:d});if(!r||!l)return p.jsxs(p.Fragment,{children:[p.jsx(Ci,{withMeta:!0}),p.jsx(Qr,{lines:4,className:"mb-3"}),p.jsx(Zp,{rows:5,selectable:!1})]});const S=p5(r.preset.endpoint);return p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Preset",title:r.preset.name,meta:r.preset.description??void 0,actions:p.jsxs(p.Fragment,{children:[p.jsx(Le,{onClick:()=>v(h5(r.preset)),children:"Launch run"}),p.jsx("a",{href:`/presets/${encodeURIComponent(e)}/edit`,className:"inline-flex items-center justify-center gap-1.5 rounded-md font-medium border transition-colors px-3 py-1.5 text-sm bg-secondary text-foreground border-border hover:bg-primary hover:border-border no-underline",children:"Edit"}),p.jsx(Le,{variant:"danger",onClick:()=>void b(),children:"Delete"})]})}),r.warnings.map(j=>p.jsx(yt,{message:j.message},`${j.file}:${j.id}`)),p.jsxs("div",{className:"grid grid-cols-2 sm:grid-cols-4 gap-3 mb-6",children:[p.jsx(yn,{label:"Scenarios",value:r.preset.selection.length}),p.jsx(yn,{label:"Repeat",value:r.preset.repeat}),p.jsx(yn,{label:"Parallel",value:r.preset.parallel.enabled?`×${r.preset.parallel.limit??"?"}`:"off"}),p.jsx(yn,{label:"Endpoint",tone:S==="custom"?"default":"accent",value:p.jsx("span",{className:"text-base font-mono",children:S})})]}),p.jsxs(rt,{className:"p-4 mb-6",children:[p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Configuration"}),p.jsxs("dl",{className:"grid grid-cols-1 sm:grid-cols-3 gap-3 text-sm",children:[p.jsxs("div",{children:[p.jsx("dt",{className:"text-muted-foreground/70 text-xs",children:"Endpoint"}),p.jsx("dd",{className:"font-mono text-foreground break-all",children:r.preset.endpoint})]}),p.jsxs("div",{children:[p.jsx("dt",{className:"text-muted-foreground/70 
text-xs",children:"Personas"}),p.jsx("dd",{className:"font-mono text-foreground break-all",children:r.preset.personas})]}),p.jsxs("div",{children:[p.jsx("dt",{className:"text-muted-foreground/70 text-xs",children:"Rubric"}),p.jsx("dd",{className:"font-mono text-foreground break-all",children:r.preset.rubric})]})]})]}),p.jsx("div",{className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold mb-2",children:"Run history"}),p.jsx(Tge,{runs:l.runs,navigate:n,presetName:r.preset.name}),p.jsx(c5,{open:h!=null,options:h,request:t,suites:u,onClose:()=>v(null),onLaunched:j=>{v(null),n(`/runs/${encodeURIComponent(j)}`)}})]})}function ive(e){return e?e.configured?e.source==="db"?"stored on server":e.source==="env"?"from environment variable":"configured":"not set":"loading…"}function ave({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState(""),[l,s]=x.useState(!1),[u,f]=x.useState(null),[d,g]=x.useState(null),h=x.useCallback(async()=>{try{const j=await e("/api/settings/secrets/open_router_api_key");n(j.open_router_api_key),f(null)}catch(j){f(j instanceof Error?j.message:String(j))}},[e]);x.useEffect(()=>{let j=!1;return(async()=>j||await h())(),()=>{j=!0}},[h]);const v=async j=>{j.preventDefault();const E=r.trim();if(E){s(!0),g(null);try{const O=await e("/api/settings/secrets/open_router_api_key",Si("PUT",{value:E}));n(O.open_router_api_key),i(""),f(null),g("Saved.")}catch(O){f(O instanceof Error?O.message:String(O))}finally{s(!1)}}},b=async()=>{s(!0),g(null);try{const j=await e("/api/settings/secrets/open_router_api_key",Si("DELETE"));n(j.open_router_api_key),i(""),f(null),g("Cleared.")}catch(j){f(j instanceof Error?j.message:String(j))}finally{s(!1)}},S=t?.source==="db";return p.jsx(rt,{className:"p-4 mb-4",children:p.jsxs("form",{className:"flex flex-col gap-2",onSubmit:v,children:[p.jsx("label",{htmlFor:"open-router-api-key",className:"text-xs uppercase tracking-wider text-muted-foreground font-semibold",children:"OpenRouter API 
key"}),p.jsxs("div",{className:"flex items-center gap-2",children:[p.jsx(Kt,{id:"open-router-api-key",type:"password",value:r,onChange:j=>i(j.currentTarget.value),placeholder:"sk-or-...",autoComplete:"off"}),p.jsx(Le,{type:"submit",disabled:l||!r.trim(),children:"Save"}),S?p.jsx(Le,{type:"button",variant:"ghost",disabled:l,onClick:()=>{b()},children:"Clear"}):null]}),p.jsxs("div",{className:"text-xs text-muted-foreground",children:["Status: ",ive(t)]}),d?p.jsx("div",{className:"text-xs text-success",children:d}):null,u?p.jsx("div",{className:"text-xs text-destructive",children:u}):null]})})}function lve({request:e}){const[t,n]=x.useState(null),[r,i]=x.useState(null),[l,s]=x.useState(null);return x.useEffect(()=>{let u=!1;return Promise.all([fetch("/healthz").then(f=>f.json()),fetch("/readyz").then(f=>f.json())]).then(([f,d])=>{u||(n(f),i(d),s(null))}).catch(f=>{u||s(f instanceof Error?f.message:String(f))}),()=>{u=!0}},[]),p.jsxs(p.Fragment,{children:[p.jsx(On,{eyebrow:"Settings",title:"Server"}),l?p.jsx(yt,{message:l}):null,p.jsxs("div",{className:"grid grid-cols-2 sm:grid-cols-4 gap-3 mb-6",children:[p.jsx(yn,{label:"Health",tone:t?.status==="ok"?"success":"default",value:p.jsx("span",{className:"text-base font-mono",children:t?.status??"—"})}),p.jsx(yn,{label:"Readiness",tone:r?.status==="ready"?"success":"default",value:p.jsx("span",{className:"text-base font-mono",children:r?.status??"—"})}),p.jsx(yn,{label:"Version",value:p.jsx("span",{className:"text-base font-mono",children:t?.version??"—"})}),p.jsx(yn,{label:"Database",value:p.jsx("span",{className:"text-base font-mono",children:r?.db_url?"sqlite":"—"})})]}),p.jsx(ave,{request:e})]})}function ove(){const{pathname:e,navigate:t}=Gge(),n=K6();Wge(t);const r=(()=>{if(e==="/"||e==="/index.html")return p.jsx(Xge,{request:n,navigate:t});if(e==="/runs")return p.jsx(Qge,{request:n,navigate:t});if(e==="/start")return p.jsx(tve,{request:n,navigate:t});if(e==="/presets")return 
p.jsx(nve,{request:n,navigate:t});if(e==="/suites")return p.jsx(eve,{request:n});if(e==="/endpoints")return p.jsx(wge,{request:n});if(e==="/settings")return p.jsx(lve,{request:n});if(e==="/compare")return p.jsx(Zse,{});if(e==="/score")return p.jsx(Rge,{request:n,navigate:t});const l=e.match(/^\/score\/([^/]+)\/([^/]+)$/);if(l)return p.jsx(Fge,{rubricId:decodeURIComponent(l[1]??""),dimensionId:decodeURIComponent(l[2]??""),request:n,navigate:t});const s=e.match(/^\/runs\/([^/]+)\/scenarios\/([0-9]+)$/);if(s)return p.jsx(Jge,{runId:decodeURIComponent(s[1]??""),ordinal:s[2]??"0",request:n});const u=e.match(/^\/runs\/([^/]+)$/);if(u)return p.jsx(Zge,{runId:decodeURIComponent(u[1]??""),request:n,navigate:t});const f=e.match(/^\/presets\/([^/]+)\/edit$/);if(f)return p.jsx(Cge,{presetId:decodeURIComponent(f[1]??""),request:n,navigate:t});const d=e.match(/^\/presets\/([^/]+)$/);return d?p.jsx(rve,{presetId:decodeURIComponent(d[1]??""),request:n,navigate:t}):p.jsx(yt,{message:"Page not found."})})(),i=[{href:"/",label:"Overview",isActive:l=>l==="/"||l==="/index.html"},{href:"/start",label:"Start",isActive:l=>l==="/start"},{href:"/runs",label:"Runs",isActive:l=>l==="/runs"||l.startsWith("/runs/")},{href:"/score",label:"Score",isActive:l=>l==="/score"||l.startsWith("/score/")},{href:"/presets",label:"Presets",isActive:l=>l==="/presets"||l.startsWith("/presets/")},{href:"/suites",label:"Suites",isActive:l=>l.startsWith("/suites")},{href:"/endpoints",label:"Endpoints",isActive:l=>l.startsWith("/endpoints")},{href:"/settings",label:"Settings",isActive:l=>l==="/settings"}];return p.jsxs("div",{className:"min-h-screen bg-background",children:[p.jsx("header",{className:"sticky top-0 z-30 border-b border-border bg-background/85 backdrop-blur supports-[backdrop-filter]:bg-background/65",children:p.jsxs("div",{className:"mx-auto max-w-[1280px] px-6 h-14 flex items-center justify-between gap-6",children:[p.jsxs("a",{href:"/",className:"flex items-center gap-2.5 no-underline 
text-foreground",children:[p.jsx("span",{className:"inline-block size-2 rounded-full bg-primary shadow-[0_0_0_4px_hsl(var(--primary)/0.12)]"}),p.jsx("span",{className:"text-sm font-semibold tracking-tight",children:"AgentProbe"})]}),p.jsxs("div",{className:"flex items-center gap-1",children:[p.jsx("nav",{className:"hidden md:flex items-center gap-0.5",children:i.map(l=>{const s=l.isActive(e);return p.jsxs("a",{href:l.href,className:`relative px-3 h-14 inline-flex items-center text-sm transition-colors no-underline ${s?"text-foreground font-medium":"text-muted-foreground hover:text-foreground"}`,children:[l.label,s?p.jsx("span",{className:"absolute bottom-[-1px] left-3 right-3 h-px bg-primary"}):null]},l.href)})}),p.jsx("nav",{className:"md:hidden flex items-center gap-1 overflow-x-auto",children:i.map(l=>{const s=l.isActive(e);return p.jsx("a",{href:l.href,className:`px-2.5 h-8 inline-flex items-center rounded-md text-xs transition-colors no-underline ${s?"bg-secondary text-foreground":"text-muted-foreground hover:text-foreground"}`,children:l.label},l.href)})}),p.jsx("div",{className:"ml-2 pl-2 border-l border-border",children:p.jsx(yge,{})})]})]})}),p.jsx("main",{className:"mx-auto max-w-[1280px] px-6 py-8",children:r})]})}function sve(){const{data:e,error:t}=xge(),[n,r]=x.useState(null);if(t&&!e)return p.jsxs("div",{style:{padding:48,textAlign:"center",color:"var(--muted)"},children:[p.jsx("div",{style:{fontSize:16,marginBottom:8},children:"Waiting for run to start..."}),p.jsx("div",{style:{fontSize:12},children:t})]});if(!e)return p.jsx("div",{style:{padding:48,textAlign:"center",color:"var(--muted)"},children:"Loading..."});const i=n!=null?e.details[n]??null:null;return p.jsxs(p.Fragment,{children:[p.jsxs("div",{className:"header",children:[p.jsx("h1",{children:"AgentProbe Live 
Dashboard"}),p.jsxs("span",{className:"live-badge",children:[p.jsx("span",{className:e.all_done?"done-dot":"live-dot"}),e.all_done?"COMPLETE":"LIVE"]})]}),p.jsx(l5,{data:e}),p.jsx(i5,{data:e}),p.jsx(a5,{data:e,onSelect:r}),p.jsx(YP,{averages:e.averages,onSelectRun:r}),p.jsxs("div",{className:"footer",children:["AgentProbe Dashboard · ",e.done,"/",e.total," scenarios"]}),i&&p.jsx(cge,{detail:i,onClose:()=>r(null)})]})}function uve(){const[e,t]=x.useState("detecting"),[n,r]=x.useState(typeof window<"u"?window.location.pathname:"/");return x.useEffect(()=>{const i=()=>r(window.location.pathname);return window.addEventListener("popstate",i),()=>window.removeEventListener("popstate",i)},[]),x.useEffect(()=>{let i=!1;if(window.__AGENTPROBE_LIVE__){t("live");return}return fetch("/api/session",{headers:{accept:"application/json"}}).finally(()=>{i||t("server")}),()=>{i=!0}},[]),e==="detecting"?p.jsx(rf,{label:"Starting dashboard…"}):e==="live"?p.jsx(sve,{}):p.jsx(ove,{})}function cve(){return p.jsx(R$,{children:p.jsx(uve,{})})}const m5=document.getElementById("root");if(!m5)throw new Error("Missing #root element");B6.createRoot(m5).render(p.jsx(x.StrictMode,{children:p.jsx(cve,{})})); +
diff --git a/dashboard/package.json b/dashboard/package.json index 5ae9db8..385bf7f 100644 --- a/dashboard/package.json +++ b/dashboard/package.json @@ -20,14 +20,16 @@ "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-react": "^1.11.0", - "react": "^19.1.0", - "react-dom": "^19.1.0", "react-markdown": "^10.1.0", "recharts": "^3.8.1", "remark-gfm": "^4.0.1", "tailwind-merge": "^3.5.0", "tw-animate-css": "^1.4.0" }, + "peerDependencies": { + "react": "^19.1.0", + "react-dom": "^19.1.0" + }, "devDependencies": { "@tailwindcss/vite": "^4.2.4", "@types/react": "^19.1.6", diff --git a/dashboard/src/App.tsx b/dashboard/src/App.tsx index ad48dd7..c134415 100644 --- a/dashboard/src/App.tsx +++ b/dashboard/src/App.tsx @@ -31,6 +31,7 @@ import { AveragesTable } from "./components/AveragesTable.tsx"; import { CompareView } from "./components/CompareView.tsx"; import { ConversationView } from "./components/ConversationView.tsx"; import { DetailPanel } from "./components/DetailPanel.tsx"; +import { EvalScoresView, hasEvalScores } from "./components/EvalScoresView.tsx"; import { ProgressBar } from "./components/ProgressBar.tsx"; import { RubricView } from "./components/RubricView.tsx"; import { ScenarioTable } from "./components/ScenarioTable.tsx"; @@ -186,6 +187,14 @@ function scenarioDetail(scenario: ServerScenario): ScenarioDetail { judge_dimension_scores: (scenario.judgeDimensionScores ?? []).map( normalizeDimension, ), + retrieval_scores: (scenario.retrievalScores ?? + []) as unknown as ScenarioDetail["retrieval_scores"], + demotion_scores: (scenario.demotionScores ?? + []) as unknown as ScenarioDetail["demotion_scores"], + procedure_scores: (scenario.procedureScores ?? + []) as unknown as ScenarioDetail["procedure_scores"], + dedup_scores: (scenario.dedupScores ?? 
+ []) as unknown as ScenarioDetail["dedup_scores"], expectations: scenario.expectations, error: scenario.error, counts: scenario.counts @@ -992,6 +1001,14 @@ function ScenarioDetailView({ + {hasEvalScores(detail) && ( + +
+ Eval scores +
+ +
+ )} ); } @@ -1463,7 +1480,10 @@ export function StartRunView({ onChange={(e) => setRepeat(Number(e.currentTarget.value))} /> - + -
- - + + + + - {!presetId ? ( + hint="Run multiple scenarios concurrently. Scenarios still complete in order, but several run at a time (set the limit above)." + > + + {!presetId ? ( + + + ) : null}
{saveAsPreset && !presetId ? ( diff --git a/dashboard/src/api/types.ts b/dashboard/src/api/types.ts index 42aa988..1970720 100644 --- a/dashboard/src/api/types.ts +++ b/dashboard/src/api/types.ts @@ -46,6 +46,10 @@ export type ServerScenario = { targetEvents?: Array>; checkpoints?: Array>; judgeDimensionScores?: Array>; + retrievalScores?: Array>; + demotionScores?: Array>; + procedureScores?: Array>; + dedupScores?: Array>; expectations?: unknown; error?: unknown; counts?: { diff --git a/dashboard/src/components/DetailPanel.tsx b/dashboard/src/components/DetailPanel.tsx index b28d8d7..937251c 100644 --- a/dashboard/src/components/DetailPanel.tsx +++ b/dashboard/src/components/DetailPanel.tsx @@ -2,6 +2,7 @@ import { useState } from "react"; import { scorePct } from "../helpers.ts"; import type { ScenarioDetail } from "../types.ts"; import { ConversationView } from "./ConversationView.tsx"; +import { EvalScoresView, hasEvalScores } from "./EvalScoresView.tsx"; import { RubricView } from "./RubricView.tsx"; interface Props { @@ -9,8 +10,11 @@ interface Props { onClose: () => void; } +type TabKey = "conversation" | "rubric" | "evals"; + export function DetailPanel({ detail, onClose }: Props) { - const [tab, setTab] = useState<"conversation" | "rubric">("conversation"); + const evalsVisible = hasEvalScores(detail); + const [tab, setTab] = useState("conversation"); const isRunning = detail.status === "running"; const scoreLabel = @@ -109,14 +113,21 @@ export function DetailPanel({ detail, onClose }: Props) { > Rubric + {evalsVisible && ( + + )}
- {tab === "conversation" ? ( - - ) : ( - - )} + {tab === "conversation" && } + {tab === "rubric" && } + {tab === "evals" && }
diff --git a/dashboard/src/components/EvalScoresView.tsx b/dashboard/src/components/EvalScoresView.tsx new file mode 100644 index 0000000..301e6dd --- /dev/null +++ b/dashboard/src/components/EvalScoresView.tsx @@ -0,0 +1,395 @@ +import { + CheckCircle2, + ListOrdered, + Network, + Sparkles, + Target, + XCircle, +} from "lucide-react"; +import type { ReactNode } from "react"; +import { cn } from "../lib/utils.ts"; +import type { + DedupMetricScore, + DemotionMetricScore, + ProcedureMetricScore, + RetrievalMetricScore, + ScenarioDetail, +} from "../types.ts"; + +interface Props { + detail: ScenarioDetail; +} + +type AnyMetric = + | RetrievalMetricScore + | DemotionMetricScore + | ProcedureMetricScore + | DedupMetricScore; + +function formatNumber(value: number): string { + if (!Number.isFinite(value)) return "n/a"; + return value.toFixed(3).replace(/0+$/, "").replace(/\.$/, ""); +} + +function pct(value: number): string { + if (!Number.isFinite(value)) return "n/a"; + return `${Math.round(value * 100)}%`; +} + +function MetricBar({ value }: { value: number }) { + const clamped = Math.max(0, Math.min(1, value)); + return ( +
+
+
+ ); +} + +function PassPill({ passed }: { passed: boolean }) { + return passed ? ( + + + Pass + + ) : ( + + + Fail + + ); +} + +function SectionLabel({ + children, + count, +}: { + children: ReactNode; + count?: number | string; +}) { + return ( +
+
+ {children} +
+ {count != null && ( +
+ {count} +
+ )} +
+
+ ); +} + +function MetricCard({ m }: { m: AnyMetric }) { + return ( +
+
+ {m.metric} +
+
+ +
+
+ {formatNumber(m.value)} +
+
+ ×{formatNumber(m.weight)} +
+
+ ); +} + +function ScorerHeader({ + icon, + title, + subtitle, + weightedScore, + passThreshold, + passed, + source, +}: { + icon: ReactNode; + title: string; + subtitle?: string; + weightedScore: number; + passThreshold: number; + passed: boolean; + source: string; +}) { + return ( +
+
+
+ {icon} +
+
+
{title}
+ {subtitle && ( +
{subtitle}
+ )} +
+
+
+
+
+ {pct(weightedScore)} + + {" "} + / {pct(passThreshold)} + +
+
+ source: {source} +
+
+ +
+
+ ); +} + +function aggregate(metrics: AnyMetric[]) { + if (metrics.length === 0) { + return { + weightedScore: 0, + passThreshold: 0, + passed: false, + source: "missing", + }; + } + const first = metrics[0]; + return { + weightedScore: first.weighted_score, + passThreshold: first.pass_threshold, + passed: first.passed, + source: first.source, + }; +} + +function RetrievalSection({ metrics }: { metrics: RetrievalMetricScore[] }) { + if (metrics.length === 0) return null; + const agg = aggregate(metrics); + const first = metrics[0]; + return ( +
+ } + title="Retrieval ranking" + subtitle={`k=${first?.k ?? "n/a"}, ${first?.hit_count ?? 0}/${first?.total_relevant ?? 0} hits, ${first?.forbidden_hits ?? 0} forbidden`} + weightedScore={agg.weightedScore} + passThreshold={agg.passThreshold} + passed={agg.passed} + source={agg.source} + /> +
+ {metrics.map((m, i) => ( + + ))} +
+
+ ); +} + +function DemotionSection({ metrics }: { metrics: DemotionMetricScore[] }) { + if (metrics.length === 0) return null; + const agg = aggregate(metrics); + const first = metrics[0]; + const subtitleBits: string[] = []; + if (first) { + subtitleBits.push( + `timestamp violations: ${first.timestamp_violation_count}`, + ); + if (first.cascade_bounded === true) subtitleBits.push("cascade: bounded"); + else if (first.cascade_bounded === false) + subtitleBits.push("cascade: RUNAWAY"); + } + return ( +
+ } + title="Demotion correctness" + subtitle={subtitleBits.join(" · ")} + weightedScore={agg.weightedScore} + passThreshold={agg.passThreshold} + passed={agg.passed} + source={agg.source} + /> +
+ {metrics.map((m, i) => ( + + ))} +
+
+ ); +} + +function ProcedureSection({ metrics }: { metrics: ProcedureMetricScore[] }) { + if (metrics.length === 0) return null; + const agg = aggregate(metrics); + const first = metrics[0]; + const predicted = Array.isArray(first?.predicted) + ? (first.predicted as string[]) + : []; + const golden = Array.isArray(first?.golden) ? (first.golden as string[]) : []; + return ( +
+ } + title="Procedure extraction" + subtitle={`predicted ${predicted.length} steps · golden ${golden.length} steps`} + weightedScore={agg.weightedScore} + passThreshold={agg.passThreshold} + passed={agg.passed} + source={agg.source} + /> +
+ {metrics.map((m, i) => ( + + ))} +
+ {(predicted.length > 0 || golden.length > 0) && ( +
+ + +
+ )} +
+ ); +} + +function StepList({ title, items }: { title: string; items: string[] }) { + return ( +
+
+ {title} +
+ {items.length === 0 ? ( +
(empty)
+ ) : ( +
    + {items.map((step, i) => ( +
  1. + {i + 1}. + {step} +
  2. + ))} +
+ )} +
+ ); +} + +function DedupSection({ metrics }: { metrics: DedupMetricScore[] }) { + if (metrics.length === 0) return null; + const agg = aggregate(metrics); + const first = metrics[0]; + const predicted = Array.isArray(first?.predicted) + ? (first.predicted as string[][]) + : []; + const golden = Array.isArray(first?.golden) + ? (first.golden as string[][]) + : []; + return ( +
+ } + title="Deduplication" + subtitle={`items: ${first?.item_count ?? 0} · predicted ${predicted.length} clusters · golden ${golden.length} clusters`} + weightedScore={agg.weightedScore} + passThreshold={agg.passThreshold} + passed={agg.passed} + source={agg.source} + /> +
+ {metrics.map((m, i) => ( + + ))} +
+ {(predicted.length > 0 || golden.length > 0) && ( +
+ + +
+ )} +
+ ); +} + +function ClusterList({ + title, + clusters, +}: { + title: string; + clusters: string[][]; +}) { + return ( +
+
+ {title} +
+ {clusters.length === 0 ? ( +
(empty)
+ ) : ( +
+ {clusters.map((cluster, i) => ( +
+ {cluster.join(", ")} +
+ ))} +
+ )} +
+ ); +} + +export function EvalScoresView({ detail }: Props) { + const retrieval = detail.retrieval_scores ?? []; + const demotion = detail.demotion_scores ?? []; + const procedure = detail.procedure_scores ?? []; + const dedup = detail.dedup_scores ?? []; + const totalCount = + retrieval.length + demotion.length + procedure.length + dedup.length; + + if (totalCount === 0) { + return ( +
+ +
+ No quantitative eval scores +
+

+ This scenario didn't declare a retrieval, demotion, procedure, or + dedup block. Add one to its YAML to get IR-style metrics here. +

+
+ ); + } + + return ( +
+ Eval scores + + + + +
+ ); +} + +export function hasEvalScores(detail: ScenarioDetail): boolean { + return ( + (detail.retrieval_scores?.length ?? 0) > 0 || + (detail.demotion_scores?.length ?? 0) > 0 || + (detail.procedure_scores?.length ?? 0) > 0 || + (detail.dedup_scores?.length ?? 0) > 0 + ); +} diff --git a/dashboard/src/components/copilot/Markdown.tsx b/dashboard/src/components/copilot/Markdown.tsx index 38280f4..9c560bb 100644 --- a/dashboard/src/components/copilot/Markdown.tsx +++ b/dashboard/src/components/copilot/Markdown.tsx @@ -1,4 +1,4 @@ -import { type ComponentProps, memo } from "react"; +import { type ComponentType, type JSX, memo } from "react"; import ReactMarkdown from "react-markdown"; import remarkGfm from "remark-gfm"; import { cn } from "../../lib/utils.ts"; @@ -8,7 +8,17 @@ export type MarkdownProps = { className?: string; }; -const components: ComponentProps["components"] = { +// Re-type react-markdown's `components` prop against the project's React 19 +// JSX namespace. Upstream react-markdown ships its own bundled React types, +// which collide with the project's @types/react@19. The component handlers +// themselves are unchanged — we just remap the JSX intrinsic table. +type MarkdownComponents = { + [Key in keyof JSX.IntrinsicElements]?: ComponentType< + JSX.IntrinsicElements[Key] + >; +}; + +const components: MarkdownComponents = { p: ({ className, ...props }) => (

["components"] = { function MarkdownInner({ children, className }: MarkdownProps) { return (

- + {children}
diff --git a/dashboard/src/types.ts b/dashboard/src/types.ts index ab94f17..c119abe 100644 --- a/dashboard/src/types.ts +++ b/dashboard/src/types.ts @@ -55,6 +55,61 @@ export interface DimensionScore { evidence?: string[]; } +export interface RetrievalMetricScore { + metric: string; + value: number; + weight: number; + k: number; + weighted_score: number; + pass_threshold: number; + passed: boolean; + total_relevant: number; + total_returned: number; + hit_count: number; + forbidden_hits: number; + source: string; + returned: unknown; +} + +export interface DemotionMetricScore { + metric: string; + value: number; + weight: number; + weighted_score: number; + pass_threshold: number; + passed: boolean; + timestamp_violation_count: number; + cascade_bounded: boolean | null; + source: string; + observed: unknown; + expected: unknown; +} + +export interface ProcedureMetricScore { + metric: string; + value: number; + weight: number; + weighted_score: number; + pass_threshold: number; + passed: boolean; + source: string; + predicted: unknown; + golden: unknown; +} + +export interface DedupMetricScore { + metric: string; + value: number; + weight: number; + weighted_score: number; + pass_threshold: number; + passed: boolean; + item_count: number; + source: string; + predicted: unknown; + golden: unknown; +} + export interface ScenarioDetail { scenario_id: string; scenario_name: string; @@ -76,6 +131,10 @@ export interface ScenarioDetail { target_events?: Array>; checkpoints?: Checkpoint[]; judge_dimension_scores?: DimensionScore[]; + retrieval_scores?: RetrievalMetricScore[]; + demotion_scores?: DemotionMetricScore[]; + procedure_scores?: ProcedureMetricScore[]; + dedup_scores?: DedupMetricScore[]; expectations?: unknown; error?: unknown; counts?: { diff --git a/dashboard/src/views/PresetEditorView.tsx b/dashboard/src/views/PresetEditorView.tsx index 3c9a2a0..ba81591 100644 --- a/dashboard/src/views/PresetEditorView.tsx +++ b/dashboard/src/views/PresetEditorView.tsx @@ 
-311,7 +311,10 @@ export function PresetEditorView({ onChange={(e) => setRepeat(Number(e.currentTarget.value))} /> - +
- +
- +
- + - +
diff --git a/data/dream-validation.yaml b/data/dream-validation.yaml new file mode 100644 index 0000000..24dd9f0 --- /dev/null +++ b/data/dream-validation.yaml @@ -0,0 +1,436 @@ +version: "1.0" +id: "dream-validation-v1" +name: "Dream-System Validation Pack (P-1 -> P2)" + +defaults: + max_turns: 6 + timeout_seconds: 30 + category: "Dream" + persona: smb-founder + rubric: multi-session-memory + user_name: "Jordan Rivera" + copilot_mode: "fast" + +# Dream-system validation pack. +# +# This pack pairs the existing conversational rubric with three new +# quantitative scorers — `demotion`, `procedure`, `dedup` — that exercise +# the dream-system roadmap items from `dream/TODO.md`: +# +# - P-1.3 retract-vs-soft-delete split (Snodgrass bi-temporal) +# - P0.3a stale-fact deprecation +# - P0.3b scoped cascading expiry (single-hop discipline) +# - P1 procedure synthesis + Save-as-Skill +# - P2 memory dedup / near-duplicate merge +# +# Each scenario carries one of the new scorer blocks plus a fixture +# under `data/fixtures/dream/` so the scorer math is exercised offline. +# When AutoGPT's dream pass starts emitting structured payloads inline +# on the chat-stream response, swap `source.fixture` for +# `source.raw_exchange_key` and the same scenarios drive live runs. + +scenarios: + + # ========================================================================= + # P-1.3 — retract sets only expired_at (Snodgrass system retraction) + # ========================================================================= + - id: dream-demotion-retract-discipline + name: "Demotion: user-initiated forget sets expired_at only (not invalid_at)" + description: | + Validates `_retract_edges` from P-1.3: a user-initiated forget is a + system retraction (only `expired_at`), not a world change + (`invalid_at` must remain null). Conflating the two is the Snodgrass + bi-temporal bug the audit flagged. 
+ tags: [dream, demotion, snodgrass, p_minus_1] + priority: critical + rubric: memory-abstention + + context: + system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations." + + turns: + - role: user + use_exact_message: true + content: "Forget that I mentioned the Q2 budget." + + expectations: + expected_behavior: | + The forget operation flips `expired_at` on the budget edge but + leaves `invalid_at` null — that fact didn't stop being true, the + system just stopped tracking it. + expected_outcome: resolved + failure_modes: + - snodgrass_violation: Both expired_at and invalid_at were set + - missed_demotion: The budget edge was not touched at all + - wrong_edge: A different edge was retracted + + demotion: + expected_demotions: ["edge-budget-q2"] + expected_retracts: ["edge-budget-q2"] + pass_threshold: 0.6 + weight: + set_precision: 1.0 + set_recall: 1.0 + set_f1: 1.0 + timestamp_discipline: 3.0 + cascade_bounded: 0.0 + cascade_direct_f1: 0.0 + source: + fixture: fixtures/dream/demotion-snodgrass-retract.json + + # ========================================================================= + # P-1.3 negative — retract that *also* set invalid_at (the bug) + # ========================================================================= + - id: dream-demotion-snodgrass-violation + name: "Demotion: a retract that sets invalid_at is a hard fail" + description: | + Negative-test sibling: the fixture deliberately violates the + Snodgrass discipline (both timestamps set). The scorer must flag + this and force a fail regardless of the set-membership score. + tags: [dream, demotion, snodgrass, negative, p_minus_1] + priority: critical + rubric: memory-abstention + + turns: + - role: user + use_exact_message: true + content: "Forget that I mentioned the Q2 budget." + + expectations: + expected_behavior: | + This scenario exists to prove the scorer flags a Snodgrass + violation. 
The fixture is intentionally wrong; the scenario is + expected to FAIL with `timestampViolationCount > 0`. + expected_outcome: resolved + failure_modes: + - false_positive: Scorer marks this as passing despite the violation + - missed_violation: timestampViolationCount stays at 0 + + demotion: + expected_demotions: ["edge-budget-q2"] + expected_retracts: ["edge-budget-q2"] + pass_threshold: 0.6 + weight: + set_precision: 1.0 + set_recall: 1.0 + set_f1: 1.0 + timestamp_discipline: 3.0 + cascade_bounded: 0.0 + cascade_direct_f1: 0.0 + source: + fixture: fixtures/dream/demotion-snodgrass-violation.json + + # ========================================================================= + # P0.3a — stale fact demotion + # ========================================================================= + - id: dream-demotion-stale-fact + name: "Demotion: stale pricing flagged by dream pass is correctly demoted" + description: | + Validates P0.3a stale-fact deprecation. The dream pass identified + that old pricing was stale; the demotion sets the expected edge + to status='superseded' with expired_at-only timestamps. + tags: [dream, demotion, stale, p0_3a] + priority: high + rubric: memory-temporal + + turns: + - role: user + use_exact_message: true + content: "What's our current pricing?" + + expectations: + expected_behavior: | + The dream pass flipped the old pricing edge to superseded. The + edge is gone from active search results but lives in the audit + trail. 
+ expected_outcome: resolved + failure_modes: + - missed: Stale edge was not demoted + - over_demoted: Active edges were demoted alongside + + demotion: + expected_demotions: ["edge-pricing-old"] + pass_threshold: 0.6 + source: + fixture: fixtures/dream/demotion-stale-fact.json + + # ========================================================================= + # P0.3b — cascading expiry (bounded to 1-hop, no runaway) + # ========================================================================= + - id: dream-demotion-cascade-bounded + name: "Demotion: invalidating an entity touches direct neighbors only" + description: | + Validates P0.3b cascading expiry. When the NorthStar lead is + invalidated, the cascade touches only the entity's direct + attachments (Marcus, CTO role) and not adjacent infrastructure + (HubSpot CRM). This is the single-hop discipline the spec + explicitly calls out. + tags: [dream, demotion, cascade, p0_3b] + priority: critical + rubric: memory-hygiene + + turns: + - role: user + use_exact_message: true + content: "The NorthStar deal fell through. Drop them from our system." + + expectations: + expected_behavior: | + Marcus and the CTO role edge are retracted. HubSpot, fiscal year, + and invoicing rule edges stay active. The cascade is bounded. 
+ expected_outcome: resolved + failure_modes: + - runaway_cascade: HubSpot (or other 2+ hop edges) were also touched + - under_cascade: Only one of the two direct-neighbor edges was touched + + demotion: + expected_demotions: ["edge-northstar-marcus", "edge-northstar-cto"] + cascade: + expected_direct_neighbors: + - "edge-northstar-marcus" + - "edge-northstar-cto" + tangential_edges: + - "edge-hubspot-crm" + - "edge-fiscal-year" + - "edge-invoicing-1st" + pass_threshold: 0.6 + source: + fixture: fixtures/dream/demotion-cascade-bounded.json + + # ========================================================================= + # P0.3b negative — runaway cascade (the bug to prevent) + # ========================================================================= + - id: dream-demotion-cascade-runaway + name: "Demotion: cascade that touched a 2-hop edge is a hard fail" + description: | + Negative test: fixture shows the cascade touching HubSpot (which is + 2+ hops away from NorthStar via the user). Per p0-spec.md §4 this + is the runaway-demotion bug that single-hop discipline exists to + prevent. The scorer MUST flag this as a cascade violation in its own + right (cascadeBounded=false), not merely register it as a drop in + set_precision. + tags: [dream, demotion, cascade, negative, p0_3b] + priority: critical + rubric: memory-hygiene + + turns: + - role: user + use_exact_message: true + content: "The NorthStar deal fell through. Drop them from our system." + + expectations: + expected_behavior: | + The fixture intentionally shows a runaway cascade. The scenario + is expected to FAIL with cascadeBounded=false. 
+ expected_outcome: resolved + failure_modes: + - false_positive: Scorer marks this as passing despite the runaway + + demotion: + expected_demotions: ["edge-northstar-marcus", "edge-northstar-cto"] + cascade: + expected_direct_neighbors: + - "edge-northstar-marcus" + - "edge-northstar-cto" + tangential_edges: + - "edge-hubspot-crm" + - "edge-fiscal-year" + - "edge-invoicing-1st" + pass_threshold: 0.6 + source: + fixture: fixtures/dream/demotion-cascade-runaway.json + + # ========================================================================= + # P1 — procedure synthesis: weekly report workflow + # ========================================================================= + - id: dream-procedure-weekly-report + name: "Procedure: dream pass extracts the weekly-report workflow" + description: | + Validates P1 procedure synthesis. After three weeks of the user + doing the same weekly-report sequence, the dream pass should + extract a ProcedureMemory with the four canonical steps in order + plus two parameters (recipient list, week window). + tags: [dream, procedure, p1] + priority: high + rubric: multi-session-memory + + sessions: + - id: s1-week-1 + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Pull this week's metrics from analytics, summarize per channel, then draft an email to the leadership list. Send when ready." + + - id: s2-week-2 + time_offset: "168h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Same weekly report. Pull this week's numbers, channel breakdown, email to leadership." + + - id: s3-week-3 + time_offset: "336h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Run the weekly report again." + + expectations: + expected_behavior: | + Dream pass extracts a ProcedureMemory with the four ordered steps + and the two recurring parameters. 
Step order matters: pull + before summarize before draft before send. + expected_outcome: resolved + failure_modes: + - missing_step: Procedure has < 4 steps + - wrong_order: Steps present but in the wrong order + - missing_parameters: recipient_list or week_window not captured + + procedure: + golden_steps: + - "pull metrics from analytics" + - "summarize per channel" + - "draft email to stakeholders" + - "send" + golden_parameters: + - "recipient_list" + - "week_window" + pass_threshold: 0.7 + weight: + step_coverage: 1.0 + step_order: 2.0 + parameter_coverage: 1.0 + source: + fixture: fixtures/dream/procedure-weekly-report.json + + # ========================================================================= + # P1 — procedure synthesis: client onboarding + # ========================================================================= + - id: dream-procedure-client-onboarding + name: "Procedure: dream pass extracts the client-onboarding workflow" + description: | + Second P1 procedure scenario: a different repeated workflow (client + onboarding) with its own step set and parameters. Validates the + procedure scorer against a non-degenerate-similarity case. + tags: [dream, procedure, p1] + priority: high + rubric: multi-session-memory + + sessions: + - id: s1-client-a + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "New client onboarding: create their folder in the workspace, draft welcome email, schedule kickoff call, share the onboarding doc." + + - id: s2-client-b + time_offset: "72h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Onboarding another new client. Same steps." + + expectations: + expected_behavior: | + Procedure has all four steps in order with the two parameters + (client name, kickoff date). 
+ expected_outcome: resolved + failure_modes: + - missing_step: < 4 steps captured + - wrong_order: Steps present but reordered + - missing_parameters: client_name or kickoff_date not captured + + procedure: + golden_steps: + - "create folder in workspace" + - "draft welcome email" + - "schedule kickoff call" + - "share onboarding doc" + golden_parameters: + - "client_name" + - "kickoff_date" + pass_threshold: 0.7 + source: + fixture: fixtures/dream/procedure-onboarding.json + + # ========================================================================= + # P2 — dedup: clean near-duplicate merge + # ========================================================================= + - id: dream-dedup-near-duplicates + name: "Dedup: near-duplicate Sarah-billing facts are merged" + description: | + Validates P2 memory dedup. Two near-duplicate facts about Sarah's + billing role ("Sarah moved to billing" vs "Sarah is on the billing + team") should be merged into one cluster; HubSpot and fiscal-year + facts stay as singletons. + tags: [dream, dedup, p2] + priority: high + rubric: multi-session-memory + + turns: + - role: user + use_exact_message: true + content: "Show me what you know." + + expectations: + expected_behavior: | + Dedup pass clusters the two Sarah-billing facts together. ARI + approaches 1.0; pairwise F1 = 1.0. 
+ expected_outcome: resolved + failure_modes: + - under_merge: Each Sarah-billing fact stayed in its own cluster + - over_merge: Sarah-billing got merged with an unrelated fact + + dedup: + golden_clusters: + - ["fact-sarah-billing-1", "fact-sarah-billing-2"] + - ["fact-hubspot-crm-1"] + - ["fact-fiscal-year-1"] + pass_threshold: 0.7 + source: + fixture: fixtures/dream/dedup-near-duplicates.json + + # ========================================================================= + # P2 negative — dedup that over-merges (false positive) + # ========================================================================= + - id: dream-dedup-false-positive + name: "Dedup: false-positive merge (Sarah-billing + Sarah-manager) fails" + description: | + Negative test: fixture over-merges by sweeping a Sarah-manager + fact into the Sarah-billing cluster. ARI drops; pairwise precision + drops; the scenario is expected to FAIL. + tags: [dream, dedup, negative, p2] + priority: high + rubric: multi-session-memory + + turns: + - role: user + use_exact_message: true + content: "Show me what you know." + + expectations: + expected_behavior: | + Fixture intentionally has an over-merge. Scenario is expected to + FAIL on the dedup score. 
+ expected_outcome: resolved + failure_modes: + - false_positive: Scorer marked the over-merge as passing + + dedup: + golden_clusters: + - ["fact-sarah-billing-1", "fact-sarah-billing-2"] + - ["fact-sarah-manager-1"] + - ["fact-hubspot-crm-1"] + pass_threshold: 0.7 + source: + fixture: fixtures/dream/dedup-false-positive.json diff --git a/data/fixtures/dream/dedup-false-positive.json b/data/fixtures/dream/dedup-false-positive.json new file mode 100644 index 0000000..16a52b9 --- /dev/null +++ b/data/fixtures/dream/dedup-false-positive.json @@ -0,0 +1,10 @@ +{ + "clusters": [ + [ + "fact-sarah-billing-1", + "fact-sarah-billing-2", + "fact-sarah-manager-1" + ], + ["fact-hubspot-crm-1"] + ] +} diff --git a/data/fixtures/dream/dedup-near-duplicates.json b/data/fixtures/dream/dedup-near-duplicates.json new file mode 100644 index 0000000..1a154c4 --- /dev/null +++ b/data/fixtures/dream/dedup-near-duplicates.json @@ -0,0 +1,7 @@ +{ + "clusters": [ + ["fact-sarah-billing-1", "fact-sarah-billing-2"], + ["fact-hubspot-crm-1"], + ["fact-fiscal-year-1"] + ] +} diff --git a/data/fixtures/dream/demotion-cascade-bounded.json b/data/fixtures/dream/demotion-cascade-bounded.json new file mode 100644 index 0000000..5cfc44b --- /dev/null +++ b/data/fixtures/dream/demotion-cascade-bounded.json @@ -0,0 +1,8 @@ +{ + "observed": ["edge-northstar-marcus", "edge-northstar-cto"], + "cascade_touched": ["edge-northstar-marcus", "edge-northstar-cto"], + "retract_actions": [ + { "uuid": "edge-northstar-marcus", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" }, + { "uuid": "edge-northstar-cto", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" } + ] +} diff --git a/data/fixtures/dream/demotion-cascade-runaway.json b/data/fixtures/dream/demotion-cascade-runaway.json new file mode 100644 index 0000000..5f0c0e5 --- /dev/null +++ b/data/fixtures/dream/demotion-cascade-runaway.json @@ -0,0 +1,13 @@ +{ + "observed": ["edge-northstar-marcus", 
"edge-northstar-cto", "edge-hubspot-crm"], + "cascade_touched": [ + "edge-northstar-marcus", + "edge-northstar-cto", + "edge-hubspot-crm" + ], + "retract_actions": [ + { "uuid": "edge-northstar-marcus", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" }, + { "uuid": "edge-northstar-cto", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" }, + { "uuid": "edge-hubspot-crm", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" } + ] +} diff --git a/data/fixtures/dream/demotion-snodgrass-retract.json b/data/fixtures/dream/demotion-snodgrass-retract.json new file mode 100644 index 0000000..7debae7 --- /dev/null +++ b/data/fixtures/dream/demotion-snodgrass-retract.json @@ -0,0 +1,6 @@ +{ + "observed": ["edge-budget-q2"], + "retract_actions": [ + { "uuid": "edge-budget-q2", "expired_at_set": true, "invalid_at_set": false, "status": "superseded" } + ] +} diff --git a/data/fixtures/dream/demotion-snodgrass-violation.json b/data/fixtures/dream/demotion-snodgrass-violation.json new file mode 100644 index 0000000..241f69b --- /dev/null +++ b/data/fixtures/dream/demotion-snodgrass-violation.json @@ -0,0 +1,11 @@ +{ + "observed": ["edge-budget-q2"], + "retract_actions": [ + { + "uuid": "edge-budget-q2", + "expired_at_set": true, + "invalid_at_set": true, + "status": "superseded" + } + ] +} diff --git a/data/fixtures/dream/demotion-stale-fact.json b/data/fixtures/dream/demotion-stale-fact.json new file mode 100644 index 0000000..dcd2abe --- /dev/null +++ b/data/fixtures/dream/demotion-stale-fact.json @@ -0,0 +1,11 @@ +{ + "observed": ["edge-pricing-old"], + "retract_actions": [ + { + "uuid": "edge-pricing-old", + "expired_at_set": true, + "invalid_at_set": false, + "status": "superseded" + } + ] +} diff --git a/data/fixtures/dream/procedure-onboarding.json b/data/fixtures/dream/procedure-onboarding.json new file mode 100644 index 0000000..e2d0bce --- /dev/null +++ b/data/fixtures/dream/procedure-onboarding.json @@ 
-0,0 +1,9 @@ +{ + "steps": [ + "create folder in workspace", + "draft welcome email", + "schedule kickoff call", + "share onboarding doc" + ], + "parameters": ["client_name", "kickoff_date"] +} diff --git a/data/fixtures/dream/procedure-weekly-report.json b/data/fixtures/dream/procedure-weekly-report.json new file mode 100644 index 0000000..f829f0c --- /dev/null +++ b/data/fixtures/dream/procedure-weekly-report.json @@ -0,0 +1,9 @@ +{ + "steps": [ + "pull metrics from analytics", + "summarize per channel", + "draft email to stakeholders", + "send" + ], + "parameters": ["recipient_list", "week_window"] +} diff --git a/data/fixtures/retrieval/cascade-after-northstar-invalidated.json b/data/fixtures/retrieval/cascade-after-northstar-invalidated.json new file mode 100644 index 0000000..d6c0073 --- /dev/null +++ b/data/fixtures/retrieval/cascade-after-northstar-invalidated.json @@ -0,0 +1,17 @@ +[ + { + "id": "internal-process-invoicing", + "label": "Standing rule: invoices on the 1st", + "entity": "policy" + }, + { + "id": "hubspot-config", + "label": "HubSpot is the CRM", + "entity": "tool" + }, + { + "id": "fiscal-year", + "label": "Fiscal year ends in March", + "entity": "policy" + } +] diff --git a/data/fixtures/retrieval/forget-budget-after.json b/data/fixtures/retrieval/forget-budget-after.json new file mode 100644 index 0000000..0b1a6f3 --- /dev/null +++ b/data/fixtures/retrieval/forget-budget-after.json @@ -0,0 +1,14 @@ +[ + { + "id": "fact-marketing-channel-plan", + "label": "Marketing channel plan: balance paid and organic" + }, + { + "id": "fact-q2-sales-cycle", + "label": "Q2 sales cycle averages six weeks" + }, + { + "id": "fact-finance-runway", + "label": "Runway through Q3 funded from existing revenue" + } +] diff --git a/data/fixtures/retrieval/scope-project-atlas.json b/data/fixtures/retrieval/scope-project-atlas.json new file mode 100644 index 0000000..6bbba1f --- /dev/null +++ b/data/fixtures/retrieval/scope-project-atlas.json @@ -0,0 +1,17 @@ +[ + { 
+ "id": "atlas-spec-doc", + "label": "Atlas project specification: v2 metrics overhaul", + "scope": "project:atlas" + }, + { + "id": "atlas-status", + "label": "Atlas status: green, on track for Q3", + "scope": "project:atlas" + }, + { + "id": "atlas-owner", + "label": "Atlas owner: Sarah", + "scope": "project:atlas" + } +] diff --git a/data/fixtures/retrieval/stale-fact-supersession.json b/data/fixtures/retrieval/stale-fact-supersession.json new file mode 100644 index 0000000..7f9afd8 --- /dev/null +++ b/data/fixtures/retrieval/stale-fact-supersession.json @@ -0,0 +1,20 @@ +[ + { + "id": "fact-pricing-current", + "label": "Pricing: $79/seat starter, $129/seat pro (current)", + "status": "active", + "valid_at": "2026-04-01" + }, + { + "id": "fact-roadmap-current", + "label": "Roadmap: shipping the metrics overhaul this quarter", + "status": "active", + "valid_at": "2026-04-15" + }, + { + "id": "fact-pricing-old", + "label": "Pricing: $49/seat flat (old, superseded)", + "status": "superseded", + "valid_at": "2025-09-01" + } +] diff --git a/data/fixtures/retrieval/warm-context-sarah.json b/data/fixtures/retrieval/warm-context-sarah.json new file mode 100644 index 0000000..75f1d81 --- /dev/null +++ b/data/fixtures/retrieval/warm-context-sarah.json @@ -0,0 +1,27 @@ +[ + { + "id": "fact-1", + "label": "Sarah's email: sarah@acme.co", + "scope": "personal" + }, + { + "id": "fact-2", + "label": "Atlas project status: green, shipping Q3", + "scope": "project:atlas" + }, + { + "id": "fact-3", + "label": "HubSpot is the CRM", + "scope": "personal" + }, + { + "id": "fact-4", + "label": "Marcus Lee, marcus@northstar.io, CTO", + "scope": "personal" + }, + { + "id": "fact-5", + "label": "Standing rule: invoices on the 1st", + "scope": "personal" + } +] diff --git a/data/retrieval-memory.yaml b/data/retrieval-memory.yaml new file mode 100644 index 0000000..854976c --- /dev/null +++ b/data/retrieval-memory.yaml @@ -0,0 +1,398 @@ +version: "1.0" +id: "retrieval-memory-v1" +name: 
"Retrieval-scored Memory Evaluation" + +defaults: + max_turns: 6 + timeout_seconds: 30 + category: "Memory" + persona: smb-founder + rubric: multi-session-memory + user_name: "Jordan Rivera" + copilot_mode: "fast" + +# Retrieval-scored memory pack. +# +# This pack pairs the existing conversational rubric with a quantitative +# ranking scorer that grades the top-k of a returned memory list against a +# curated golden set. Each scenario carries a `retrieval:` block that +# declares: +# - `golden`: the items the memory system should surface +# - `forbidden` (optional): items that MUST NOT be in the top-k; a forbidden +# hit forces a fail regardless of weighted score (used for forget, +# scope-filter, and demotion-correctness probes) +# - `k`: rank cutoff +# - `weight`: per-metric weights for the weighted aggregate +# - `pass_threshold`: pass cutoff on the weighted aggregate +# - `match`: comparison policy (substring | exact | regex) +# - `source`: where to read the returned items at scoring time +# * `raw_exchange_key`: read from `last_reply.rawExchange[]` +# when the adapter returns retrieval payloads inline (production +# AutoGPT-backed runs) +# * `fixture`: load a JSON file relative to this YAML (offline mode, +# exercising the scorer math against a known graph snapshot) +# +# Scenarios use fixtures here so they can be run against the scorer math +# without a live AutoGPT backend. Swap `source.fixture` for `source.raw_exchange_key` +# when running against a backend that emits retrieval payloads inline. + +scenarios: + + # ========================================================================= + # 1. 
Forget-on-request (ranking sibling of mem-negative-forget-on-request) + # ========================================================================= + - id: mem-retrieval-forget-on-request + name: "Retrieval: explicit forget command removes the fact from top-k" + description: | + Ranking-scored sibling of `mem-negative-forget-on-request` in + multi-session-memory.yaml (line 1710). Where the parent scenario uses + the LLM-judge to assert that the assistant doesn't return $50K, this + scenario asserts directly that the forbidden item is not in the + top-k of the retrieval response. + tags: [multi_session, memory, negative, retrieval] + priority: high + rubric: memory-abstention + + context: + system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations." + + sessions: + - id: s1-teach + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Quick planning note: our budget for the Q2 marketing push is $50K. Help me sketch out how to allocate it across channels." + + - id: s2-forget + time_offset: "48h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Actually, scratch what I said about the Q2 marketing budget. I don't want that number floating around in your memory. Forget it." + + - id: s3-probe + time_offset: "120h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "What's our Q2 marketing budget?" + + expectations: + expected_behavior: | + Agent's retrieval response in S3 must not include the $50K figure or + the Q2 marketing budget fact. Other adjacent marketing-related + memories may still surface. 
+ expected_outcome: resolved + failure_modes: + - forget_ignored: $50K appears in the top-k + - forget_acknowledged_but_leaked: $50K appears in the top-k even though the assistant text says it was forgotten (proves forget didn't clear storage -- worst failure) + - fabrication: A different number appears in the top-k + tester_note: | + Pair this with the existing `mem-negative-forget-on-request` judge + scenario in multi-session-memory.yaml. The judge checks the agent's + text reply; this scenario checks the retrieval payload directly. + + retrieval: + golden: + - "Marketing channel plan" + - "Q2 sales cycle" + forbidden: + - "$50K" + - "Q2 marketing budget" + k: 5 + pass_threshold: 0.2 + match: substring + weight: + precision_at_k: 0.5 + recall_at_k: 0.5 + mrr: 1.0 + ndcg_at_k: 1.0 + source: + fixture: fixtures/retrieval/forget-budget-after.json + + # ========================================================================= + # 2. Warm-context relevance — given a query, does the right set surface? + # ========================================================================= + - id: mem-retrieval-warm-context-sarah + name: "Retrieval: warm-context relevance for a typical knowledge query" + description: | + Given a working session that establishes Sarah, the Atlas project, the + CRM choice, and several incidental facts, a follow-up query about + Sarah should surface her email and the Atlas project status near the + top of the returned set. Tangential facts (invoicing rule, contact + info) may co-occur lower; this is graded by NDCG@k. + tags: [multi_session, memory, retention, retrieval] + priority: high + rubric: multi-session-memory + + context: + system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations." + + sessions: + - id: s1-seed + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Quick housekeeping: my assistant is Sarah -- CC her on client stuff. 
Our CRM is HubSpot, our standing rule is invoices on the 1st, and Atlas is our flagship project (status: green, shipping Q3). New contact: Marcus Lee <marcus@northstar.io>, NorthStar, CTO." + + - id: s2-probe + time_offset: "48h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "What do you have on Sarah and the Atlas project?" + + expectations: + expected_behavior: | + Sarah's email and Atlas project status appear in the top-k of the + retrieval response. Adjacent identity facts may co-surface, but the + two gold items should dominate the early ranks. + expected_outcome: resolved + failure_modes: + - partial_recall: Only one of the two gold facts surfaces + - missing: Neither gold fact surfaces (retention failure) + - poor_ranking: Gold facts present but buried below tangential ones (low NDCG) + + retrieval: + golden: + - "Sarah's email" + - "Atlas project status" + k: 5 + pass_threshold: 0.5 + match: substring + weight: + precision_at_k: 1.0 + recall_at_k: 1.0 + mrr: 1.0 + ndcg_at_k: 1.0 + source: + fixture: fixtures/retrieval/warm-context-sarah.json + + # ========================================================================= + # 3. Stale-fact demotion — superseded fact must not appear in top-k + # ========================================================================= + - id: mem-retrieval-stale-fact-demotion + name: "Retrieval: superseded pricing must not appear in top-k" + description: | + Pricing is updated mid-flight; the old pricing should be demoted + (marked `superseded`) and excluded from retrieval. The new pricing + and the current roadmap should surface. This is the dream-pass + "stale-fact deprecation" probe expressed as a ranking assertion. + tags: [multi_session, memory, temporal, retrieval] + priority: high + rubric: memory-temporal + + context: + system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations." 
+ + sessions: + - id: s1-old-pricing + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Quick reference for proposals: our pricing is $49/seat flat. Use that when drafting deals." + + - id: s2-pricing-update + time_offset: "240h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Pricing change: we're moving to tiered pricing. $79/seat starter, $129/seat pro. Use the new tiers from here forward." + + - id: s3-probe + time_offset: "480h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "What's our current pricing for new deals?" + + expectations: + expected_behavior: | + Retrieval surfaces only the new tiered pricing. The old $49/seat + figure should be demoted (superseded) and absent from the top-k. + expected_outcome: resolved + failure_modes: + - stale_returned: Old $49/seat figure appears in top-k + - both_returned: Both old and new appear (no demotion happened) + - missing: Neither appears (retention regression) + + retrieval: + golden: + - "$79/seat starter, $129/seat pro" + forbidden: + - "$49/seat" + k: 3 + pass_threshold: 0.5 + match: substring + weight: + precision_at_k: 1.0 + recall_at_k: 1.0 + mrr: 1.0 + ndcg_at_k: 1.0 + source: + fixture: fixtures/retrieval/stale-fact-supersession.json + + # ========================================================================= + # 4. Scope filtering — project:atlas query must not surface project:other + # ========================================================================= + - id: mem-retrieval-scope-filter-project + name: "Retrieval: scope-filtered query stays inside its scope" + description: | + A query scoped to project:atlas should surface only Atlas memories; + memories scoped to other projects (or personal scope) must not + appear in the top-k. Validates typed-edge filtering in the dream + memory graph. 
+ tags: [multi_session, memory, retrieval, scope] + priority: high + rubric: memory-crossdomain + + context: + system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations." + injected_data: + query_scope: "project:atlas" + + sessions: + - id: s1-mixed-context + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Working notes: Atlas project is green (v2 metrics overhaul). Sarah owns it. By the way, the Beacon project is on pause and the standard invoicing rule is the 1st of the month." + + - id: s2-scoped-probe + time_offset: "72h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Tell me about the Atlas project specifically -- only Atlas, not anything else." + + expectations: + expected_behavior: | + Retrieval returns only Atlas-scoped facts. Beacon, invoicing rule, + and other unrelated facts must not appear in the top-k. + expected_outcome: resolved + failure_modes: + - scope_leak_other_project: Beacon or another project's memory appears + - scope_leak_personal: Personal-scope memory appears in a project-scoped query + - missing: No Atlas facts surface + + retrieval: + golden: + - "Atlas project specification" + - "Atlas status" + - "Atlas owner" + forbidden: + - "Beacon" + - "invoices on the 1st" + k: 3 + pass_threshold: 0.5 + match: substring + weight: + precision_at_k: 1.0 + recall_at_k: 1.0 + mrr: 1.0 + ndcg_at_k: 1.0 + source: + fixture: fixtures/retrieval/scope-project-atlas.json + + # ========================================================================= + # 5. 
Cascading expiry — invalidated entity removes its facts, not the + # tangentially-related ones + # ========================================================================= + - id: mem-retrieval-cascading-expiry + name: "Retrieval: invalidating an entity removes its facts but not adjacent ones" + description: | + The user retracts the NorthStar lead (entity invalidated). Retrieval + after the retract must NOT surface Marcus Lee or any NorthStar fact, + but adjacent operational memory (CRM choice, fiscal year, standing + invoicing rule) should remain. Validates that the cascade is bounded + to the entity's own facts. + tags: [multi_session, memory, retrieval, cascading] + priority: medium + rubric: memory-hygiene + + context: + system_prompt: "You are AutoGPT, an autonomous AI agent helping SMB founders automate operations." + + sessions: + - id: s1-seed-northstar + time_offset: "0h" + reset: none + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Add a new contact: Marcus Lee <marcus@northstar.io>, NorthStar, CTO. Also: HubSpot is our CRM, our fiscal year ends in March, and invoices go out on the 1st." + + - id: s2-retract-entity + time_offset: "72h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Actually, scratch NorthStar entirely. The deal fell through, the lead's dead, please retract everything we have on them or Marcus." + + - id: s3-probe-adjacent + time_offset: "168h" + reset: fresh_agent + max_turns: 1 + turns: + - role: user + use_exact_message: true + content: "Refresh me on our operational basics -- CRM, fiscal year, billing cadence." + + expectations: + expected_behavior: | + Retrieval in S3 surfaces operational adjacent facts (HubSpot, + fiscal year, invoicing rule). NorthStar and Marcus Lee facts are + absent. The cascade should not have removed adjacent unrelated + memory. 
+ expected_outcome: resolved + failure_modes: + - cascade_too_wide: Adjacent facts (HubSpot, fiscal year, invoicing) missing -- cascade swept too much + - cascade_too_narrow: Marcus Lee or NorthStar still appears -- cascade didn't reach attached facts + - retain_after_retract: NorthStar entity still listed as active + + retrieval: + golden: + - "HubSpot is the CRM" + - "Fiscal year ends in March" + - "invoices on the 1st" + forbidden: + - "Marcus Lee" + - "NorthStar" + - "marcus@northstar.io" + k: 5 + pass_threshold: 0.6 + match: substring + weight: + precision_at_k: 1.0 + recall_at_k: 1.0 + mrr: 1.0 + ndcg_at_k: 1.0 + source: + fixture: fixtures/retrieval/cascade-after-northstar-invalidated.json diff --git a/docs/QUALITY_SCORE.md b/docs/QUALITY_SCORE.md index 6f85cd8..952c3c5 100644 --- a/docs/QUALITY_SCORE.md +++ b/docs/QUALITY_SCORE.md @@ -1,6 +1,6 @@ # Quality Score -Last updated: 2026-05-08 +Last updated: 2026-05-13 ## Health summary diff --git a/docs/generated/workspace-inventory.md b/docs/generated/workspace-inventory.md index 865161b..2cc904b 100644 --- a/docs/generated/workspace-inventory.md +++ b/docs/generated/workspace-inventory.md @@ -1,6 +1,6 @@ # Workspace Inventory -Generated: 2026-05-08T14:16:42.633Z +Generated: 2026-05-13T19:37:36.614Z ```text AGENTS.md @@ -66,6 +66,7 @@ data/ data/adversarial-scenarios.yaml data/autogpt-endpoint.yaml data/baseline-scenarios.yaml + data/dream-validation.yaml data/fixture-manifest.json data/fixtures/ data/fixtures/ad_spend_3mo.csv @@ -127,6 +128,16 @@ data/fixtures/ data/fixtures/dm_corpus.json data/fixtures/document_types.yaml data/fixtures/domain_to_industry_mapping.json +data/fixtures/dream/ + data/fixtures/dream/dedup-false-positive.json + data/fixtures/dream/dedup-near-duplicates.json + data/fixtures/dream/demotion-cascade-bounded.json + data/fixtures/dream/demotion-cascade-runaway.json + data/fixtures/dream/demotion-snodgrass-retract.json + data/fixtures/dream/demotion-snodgrass-violation.json + 
data/fixtures/dream/demotion-stale-fact.json + data/fixtures/dream/procedure-onboarding.json + data/fixtures/dream/procedure-weekly-report.json data/fixtures/editor_brief_template.md data/fixtures/employee_details.json data/fixtures/erp_ledger_2026-03.csv @@ -243,6 +254,12 @@ data/fixtures/ data/fixtures/research_questions.md data/fixtures/resume.pdf data/fixtures/retention_by_product.csv +data/fixtures/retrieval/ + data/fixtures/retrieval/cascade-after-northstar-invalidated.json + data/fixtures/retrieval/forget-budget-after.json + data/fixtures/retrieval/scope-project-atlas.json + data/fixtures/retrieval/stale-fact-supersession.json + data/fixtures/retrieval/warm-context-sarah.json data/fixtures/reviews_android.json data/fixtures/reviews_google_yelp.json data/fixtures/reviews_ios.json @@ -310,6 +327,7 @@ data/fixtures/snapshots/ data/openclaw-endpoints.yaml data/opencode-endpoints.yaml data/personas.yaml + data/retrieval-memory.yaml data/rubric.yaml data/scenarios.yaml docker-compose.yml @@ -395,8 +413,24 @@ src/cli/ src/cli/main.ts src/domains/ src/domains/evaluation/ + src/domains/evaluation/clustering.test.ts + src/domains/evaluation/clustering.ts + src/domains/evaluation/dedup-scorer.test.ts + src/domains/evaluation/dedup-scorer.ts + src/domains/evaluation/demotion-match.test.ts + src/domains/evaluation/demotion-match.ts + src/domains/evaluation/demotion-scorer.test.ts + src/domains/evaluation/demotion-scorer.ts src/domains/evaluation/judge.ts src/domains/evaluation/ports.ts + src/domains/evaluation/procedure-match.test.ts + src/domains/evaluation/procedure-match.ts + src/domains/evaluation/procedure-scorer.test.ts + src/domains/evaluation/procedure-scorer.ts + src/domains/evaluation/ranking.test.ts + src/domains/evaluation/ranking.ts + src/domains/evaluation/retrieval-scorer.test.ts + src/domains/evaluation/retrieval-scorer.ts src/domains/evaluation/run-suite.ts src/domains/evaluation/simulator.ts src/domains/reporting/ @@ -476,6 +510,7 @@ 
tests/unit/architecture/ tests/unit/dashboard/ tests/unit/dashboard/compare-view.test.tsx tests/unit/db.test.ts + tests/unit/dream-validation.test.ts tests/unit/endpoint-config.test.ts tests/unit/endpoint-overrides-controller.test.ts tests/unit/judge.test.ts @@ -493,6 +528,7 @@ tests/unit/persistence/ tests/unit/persistence/repository-contract.test.ts tests/unit/persistence/url.test.ts tests/unit/report.test.ts + tests/unit/retrieval-memory.test.ts tests/unit/runner.test.ts tests/unit/server/ tests/unit/server/comparison.test.ts diff --git a/docs/product-specs/current-state.md b/docs/product-specs/current-state.md index 71ee1a4..dcf3f6e 100644 --- a/docs/product-specs/current-state.md +++ b/docs/product-specs/current-state.md @@ -35,6 +35,8 @@ Last validated against `platform.md`: 2026-04-17 - [x] Database URL credentials stay redacted in operator-visible output - [x] Docker Compose readiness waits for server readiness - [x] Human scoring drains an unscored backlog one chat at a time +- [x] Ranking-scored scenarios grade retrieval relevance against a curated golden set +- [x] Dream-system scenarios validate demotion, procedure, and dedup behavior ## Notes diff --git a/docs/product-specs/e2e-checklist.md b/docs/product-specs/e2e-checklist.md index b09881d..5540498 100644 --- a/docs/product-specs/e2e-checklist.md +++ b/docs/product-specs/e2e-checklist.md @@ -32,3 +32,5 @@ Derived from `platform.md`. Every scenario should have a coverage owner. 
| Database URL credentials stay redacted in operator-visible output | `tests/unit/persistence/url.test.ts` + `tests/unit/server/config.test.ts` | ✅ covered | | Docker Compose readiness waits for server readiness | `docker-compose.yml` + `docs/playbooks/agent-probe-server.md` + `docker compose config` | ✅ covered | | Human scoring drains an unscored backlog one chat at a time | `tests/integration/server/human-scoring.test.ts` + `tests/unit/persistence/human-scoring.test.ts` | ✅ covered | +| Ranking-scored scenarios grade retrieval relevance against a curated golden set | `src/domains/evaluation/ranking.test.ts` + `src/domains/evaluation/retrieval-scorer.test.ts` + `tests/unit/retrieval-memory.test.ts` + `tests/unit/runner.test.ts` | ✅ covered | +| Dream-system scenarios validate demotion, procedure, and dedup behavior | `src/domains/evaluation/clustering.test.ts` + `src/domains/evaluation/demotion-match.test.ts` + `src/domains/evaluation/procedure-match.test.ts` + `src/domains/evaluation/{demotion,procedure,dedup}-scorer.test.ts` + `tests/unit/dream-validation.test.ts` | ✅ covered | diff --git a/docs/product-specs/platform.md b/docs/product-specs/platform.md index 85711c1..aec8c4d 100644 --- a/docs/product-specs/platform.md +++ b/docs/product-specs/platform.md @@ -298,6 +298,41 @@ The queue ignores scenario_runs whose status is not `completed`, and rerunning the click on a scenario already scored for the dimension is an upsert (no new row). 
+### Ranking-scored scenarios grade retrieval relevance against a curated golden set + +**Given** a scenario YAML that declares a `retrieval:` block with `golden` +(required), optional `forbidden`, `k`, `match`, `pass_threshold`, per-metric +`weight`, and a `source` (either `raw_exchange_key` or `fixture`) +**When** AgentProbe runs the scenario and the adapter returns a retrieval +payload (either inline on the last reply's `rawExchange[<raw_exchange_key>]` or via a +JSON fixture relative to the scenario YAML) +**Then** the runner computes precision@k, recall@k, MRR, and NDCG@k on the +returned list against the golden set, aggregates them into a weighted +average that is compared against `pass_threshold`, and forces a scenario fail when any +`forbidden` item appears in the top-k. Per-metric and aggregate scores are +persisted to `retrieval_scores` keyed by `scenario_run_id` for replay, and +the rendered run report surfaces them alongside the LLM-judge dimensions. + +### Dream-system scenarios validate demotion, procedure, and dedup behavior + +**Given** a scenario YAML that declares one of `demotion:`, `procedure:`, or +`dedup:` (each a sibling of the existing `retrieval:` block) with its own +golden expectation, weight, threshold, and `source` (fixture or +`raw_exchange_key`) +**When** AgentProbe runs the scenario and the adapter returns the +corresponding payload (observed demotions / extracted procedure / predicted +clusters) inline on the last reply's `rawExchange` or via a JSON fixture +**Then** the runner computes the appropriate metric set — set +precision/recall/F1 + Snodgrass timestamp discipline + single-hop cascade +bound for demotion; step-coverage F1 + LCS-normalized order similarity + +parameter Jaccard for procedure; pairwise P/R/F1 + Adjusted Rand Index for +dedup — aggregates under per-metric weights, and forces a fail on hard +violations (Snodgrass conflict, runaway cascade, over- or under-merge below +threshold). 
Per-metric and aggregate scores are persisted to +`demotion_scores`, `procedure_scores`, and `dedup_scores` tables keyed by +`scenario_run_id`, and the rendered run report surfaces them alongside the +LLM-judge and retrieval dimensions. + ### Database URL credentials stay redacted in operator-visible output **Given** an operator configures persistence with a database URL that contains diff --git a/scripts/seed-eval-scores.ts b/scripts/seed-eval-scores.ts new file mode 100644 index 0000000..0f72610 --- /dev/null +++ b/scripts/seed-eval-scores.ts @@ -0,0 +1,186 @@ +/** + * One-off seeder that writes a run + scenario_runs + retrieval/demotion/ + * procedure/dedup scores into the SQLite DB so the dashboard EvalScoresView + * has real data to render. Intended for local-dev demo / smoke testing. + * + * Usage: + * AGENTPROBE_DB_URL="sqlite:///$(pwd)/data/.agentprobe/runs.sqlite3" \ + * bun run scripts/seed-eval-scores.ts + */ + +import { randomUUID } from "node:crypto"; + +import { scoreScenarioDedup } from "../src/domains/evaluation/dedup-scorer.ts"; +import { scoreScenarioDemotion } from "../src/domains/evaluation/demotion-scorer.ts"; +import { scoreScenarioProcedure } from "../src/domains/evaluation/procedure-scorer.ts"; +import { scoreRetrieval } from "../src/domains/evaluation/retrieval-scorer.ts"; +import { + parseRubricsYaml, + parseScenarioYaml, +} from "../src/domains/validation/load-suite.ts"; +import { SqliteRunRecorder } from "../src/providers/persistence/sqlite-run-history.ts"; + +const dbUrl = + Bun.env.AGENTPROBE_DB_URL ?? 
+ `sqlite:///${process.cwd()}/data/.agentprobe/runs.sqlite3`; + +console.log(`Seeding eval scores into ${dbUrl}`); + +const rubrics = parseRubricsYaml(`${process.cwd()}/data/rubric.yaml`).rubrics; + +const dreamScenarios = parseScenarioYaml( + `${process.cwd()}/data/dream-validation.yaml`, +); +const retrievalScenarios = parseScenarioYaml( + `${process.cwd()}/data/retrieval-memory.yaml`, +); + +const recorder = new SqliteRunRecorder(dbUrl); + +const runId = await recorder.recordRunStarted({ + endpoint: "data/autogpt-endpoint.yaml", + scenarios: "data/dream-validation.yaml + data/retrieval-memory.yaml", + personas: "data/personas.yaml", + rubric: "data/rubric.yaml", + label: "eval-scores demo seed", + notes: "seeded by scripts/seed-eval-scores.ts to populate the dashboard", + trigger: "manual", +}); +console.log(`Run id: ${runId}`); + +const allScenarios = [ + ...dreamScenarios.scenarios, + ...retrievalScenarios.scenarios, +]; +const personaSnapshot = { + id: "smb-founder", + name: "SMB Founder", +}; +let ordinal = 1; + +for (const scenario of allScenarios) { + const rubric = rubrics.find((r) => r.id === scenario.rubric); + if (!rubric) { + console.warn(`Skipping ${scenario.id}: no rubric resolved`); + continue; + } + + const scenarioRunId = await recorder.recordScenarioStarted({ + scenario, + persona: { + id: personaSnapshot.id, + name: personaSnapshot.name, + demographics: { + role: "founder", + techLiteracy: "high", + domainExpertise: "intermediate", + languageStyle: "terse", + }, + personality: { + patience: 3, + assertiveness: 4, + detailOrientation: 4, + cooperativeness: 4, + emotionalIntensity: 2, + }, + behavior: { + openingStyle: "direct", + followUpStyle: "concise", + escalationTriggers: [], + topicDrift: "low", + clarificationCompliance: "high", + }, + systemPrompt: "You are an SMB founder.", + }, + rubric, + ordinal, + userId: randomUUID(), + }); + + await recorder.recordJudgeResult(scenarioRunId, { + rubric, + score: { + dimensions: Object.fromEntries( 
+ rubric.dimensions.map((dim) => [ + dim.id, + { + reasoning: "Synthetic seed data.", + evidence: ["seed"], + score: dim.scale.points ?? 1, + }, + ]), + ), + overallNotes: "Synthetic seed", + passed: true, + }, + overallScore: 1.0, + }); + + const evalContext = { + scenariosPath: + `${process.cwd()}/data/` + + (scenario.dedup || scenario.demotion || scenario.procedure + ? "dream-validation.yaml" + : "retrieval-memory.yaml"), + }; + + let allPassed = true; + const retrieval = scoreRetrieval(scenario, evalContext); + if (retrieval) { + await recorder.recordRetrievalResult(scenarioRunId, { + scenario, + score: retrieval, + }); + allPassed = allPassed && retrieval.passed; + } + const demotion = scoreScenarioDemotion(scenario, evalContext); + if (demotion) { + await recorder.recordDemotionResult(scenarioRunId, { + scenario, + score: demotion, + }); + allPassed = allPassed && demotion.passed; + } + const procedure = scoreScenarioProcedure(scenario, evalContext); + if (procedure) { + await recorder.recordProcedureResult(scenarioRunId, { + scenario, + score: procedure, + }); + allPassed = allPassed && procedure.passed; + } + const dedup = scoreScenarioDedup(scenario, evalContext); + if (dedup) { + await recorder.recordDedupResult(scenarioRunId, { + scenario, + score: dedup, + }); + allPassed = allPassed && dedup.passed; + } + + await recorder.recordScenarioFinished(scenarioRunId, { + result: { + scenarioId: scenario.id, + scenarioName: scenario.name, + personaId: "smb-founder", + rubricId: rubric.id, + passed: allPassed, + overallScore: 1.0, + transcript: [], + checkpoints: [], + }, + }); + ordinal += 1; + console.log( + ` [${ordinal - 1}/${allScenarios.length}] ${scenario.id}: passed=${allPassed}`, + ); +} + +await recorder.recordRunFinished({ + runId, + passed: true, + exitCode: 0, + results: [], +}); + +console.log(`Seeded ${ordinal - 1} scenarios in run ${runId}`); diff --git a/src/domains/evaluation/clustering.test.ts b/src/domains/evaluation/clustering.test.ts new 
file mode 100644 index 0000000..a2353ff --- /dev/null +++ b/src/domains/evaluation/clustering.test.ts @@ -0,0 +1,158 @@ +import { describe, expect, test } from "bun:test"; + +import { + adjustedRandIndex, + pairwiseAgreement, + pairwiseScores, + scoreClustering, +} from "./clustering.ts"; + +describe("pairwiseAgreement", () => { + test("perfect agreement counts every same-cluster pair as TP", () => { + const result = pairwiseAgreement([["a", "b", "c"]], [["a", "b", "c"]]); + expect(result.truePositives).toBe(3); + expect(result.falsePositives).toBe(0); + expect(result.falseNegatives).toBe(0); + expect(result.trueNegatives).toBe(0); + }); + + test("over-merging counts as false positives", () => { + // Predicted merges {a, b}; golden keeps them separate. + const result = pairwiseAgreement([["a", "b"]], [["a"], ["b"]]); + expect(result.truePositives).toBe(0); + expect(result.falsePositives).toBe(1); + expect(result.falseNegatives).toBe(0); + expect(result.trueNegatives).toBe(0); + }); + + test("under-merging counts as false negatives", () => { + // Predicted keeps them separate; golden merges them. + const result = pairwiseAgreement([["a"], ["b"]], [["a", "b"]]); + expect(result.truePositives).toBe(0); + expect(result.falsePositives).toBe(0); + expect(result.falseNegatives).toBe(1); + expect(result.trueNegatives).toBe(0); + }); +}); + +describe("pairwiseScores", () => { + test("perfect partition yields F1 of 1.0", () => { + const result = pairwiseScores( + [ + ["a", "b"], + ["c", "d"], + ], + [ + ["a", "b"], + ["c", "d"], + ], + ); + expect(result.precision).toBeCloseTo(1.0, 6); + expect(result.recall).toBeCloseTo(1.0, 6); + expect(result.f1).toBeCloseTo(1.0, 6); + }); + + test("over-merging drops precision more than recall", () => { + // Golden: {a, b}, {c, d}. Predicted: {a, b, c, d} (over-merge). 
+ // Pairs: (a,b), (a,c), (a,d), (b,c), (b,d), (c,d) = 6 same-cluster predicted + // Of those, (a,b) and (c,d) are also same in golden -> TP=2, FP=4, FN=0 + const result = pairwiseScores( + [["a", "b", "c", "d"]], + [ + ["a", "b"], + ["c", "d"], + ], + ); + expect(result.precision).toBeCloseTo(2 / 6, 6); + expect(result.recall).toBeCloseTo(1.0, 6); + }); + + test("under-merging drops recall more than precision", () => { + // Golden: {a, b, c, d}. Predicted: {a, b}, {c, d}. + const result = pairwiseScores( + [ + ["a", "b"], + ["c", "d"], + ], + [["a", "b", "c", "d"]], + ); + expect(result.precision).toBeCloseTo(1.0, 6); + expect(result.recall).toBeCloseTo(2 / 6, 6); + }); +}); + +describe("adjustedRandIndex", () => { + test("perfect agreement yields ARI = 1", () => { + expect( + adjustedRandIndex( + [ + ["a", "b"], + ["c", "d"], + ], + [ + ["a", "b"], + ["c", "d"], + ], + ), + ).toBeCloseTo(1.0, 6); + }); + + test("complete disagreement on two pairs", () => { + // Golden: {a, b}, {c, d}. Predicted: {a, c}, {b, d}. + // Hubert-Arabie: index = 0 (no shared same-cluster pairs), + // sumPredChoose = C(2,2)+C(2,2) = 2, sumGoldChoose = 2, + // expected = 2*2/C(4,2) = 2/3, maxIndex = 2, + // ARI = (0 - 2/3) / (2 - 2/3) = -0.5. 
+ expect( + adjustedRandIndex( + [ + ["a", "c"], + ["b", "d"], + ], + [ + ["a", "b"], + ["c", "d"], + ], + ), + ).toBeCloseTo(-0.5, 6); + }); + + test("ARI is symmetric in its inputs", () => { + const left = [ + ["a", "b", "c"], + ["d", "e"], + ]; + const right = [ + ["a", "b"], + ["c", "d", "e"], + ]; + expect(adjustedRandIndex(left, right)).toBeCloseTo( + adjustedRandIndex(right, left), + 6, + ); + }); + + test("single-item input yields ARI = 1", () => { + expect(adjustedRandIndex([["a"]], [["a"]])).toBe(1); + }); +}); + +describe("scoreClustering", () => { + test("aggregates precision/recall/F1/ARI in one call", () => { + const result = scoreClustering( + [ + ["a", "b"], + ["c", "d"], + ], + [ + ["a", "b"], + ["c", "d"], + ], + ); + expect(result.precision).toBeCloseTo(1.0, 6); + expect(result.recall).toBeCloseTo(1.0, 6); + expect(result.f1).toBeCloseTo(1.0, 6); + expect(result.ari).toBeCloseTo(1.0, 6); + expect(result.itemCount).toBe(4); + }); +}); diff --git a/src/domains/evaluation/clustering.ts b/src/domains/evaluation/clustering.ts new file mode 100644 index 0000000..8859f93 --- /dev/null +++ b/src/domains/evaluation/clustering.ts @@ -0,0 +1,239 @@ +/** + * Pure clustering / partition metrics for the dedup scorer. + * + * Given a `predicted` partition (list of clusters of item IDs) and a `golden` + * partition (the ground-truth clusters), score how well they agree. Used to + * grade memory-dedup passes: did the dedup pass cluster near-duplicates + * correctly? + * + * Metrics: + * - pairwise precision/recall/F1 over the same-cluster relation + * - Adjusted Rand Index (Hubert & Arabie 1985) — chance-corrected agreement + * + * All functions operate on item IDs (strings). Items present in the predicted + * partition but absent from the golden one (or vice versa) are treated as + * singletons in the missing side, so the math degrades gracefully under + * partial coverage. No I/O. 
+ */ + +export type Cluster = readonly string[]; +export type Partition = readonly Cluster[]; + +/** + * Collect every distinct item across both partitions. Each item appears at + * most once even when the input clusters contain duplicates. + */ +function collectItems(left: Partition, right: Partition): string[] { + const seen = new Set(); + for (const cluster of [...left, ...right]) { + for (const item of cluster) { + seen.add(item); + } + } + return [...seen].sort(); +} + +/** + * Map each item to a numeric cluster id under the partition. Items present in + * `items` but not assigned a cluster in `partition` are emitted as + * singleton clusters (each gets its own unique id) so the math degrades + * gracefully under partial coverage. + */ +function assignClusterIds( + partition: Partition, + items: readonly string[], +): Map { + const assignment = new Map(); + partition.forEach((cluster, index) => { + for (const item of cluster) { + if (!assignment.has(item)) { + assignment.set(item, index); + } + } + }); + let nextSingletonId = partition.length; + for (const item of items) { + if (!assignment.has(item)) { + assignment.set(item, nextSingletonId); + nextSingletonId += 1; + } + } + return assignment; +} + +export type PairwiseAgreement = { + truePositives: number; + falsePositives: number; + falseNegatives: number; + trueNegatives: number; +}; + +/** + * Build the 2x2 contingency over unordered item pairs: + * TP = same cluster in both + * FP = same in predicted, different in golden + * FN = different in predicted, same in golden + * TN = different in both + */ +export function pairwiseAgreement( + predicted: Partition, + golden: Partition, +): PairwiseAgreement { + const items = collectItems(predicted, golden); + const pred = assignClusterIds(predicted, items); + const gold = assignClusterIds(golden, items); + + let tp = 0; + let fp = 0; + let fn = 0; + let tn = 0; + for (let i = 0; i < items.length; i += 1) { + for (let j = i + 1; j < items.length; j += 1) { + 
const left = items[i] ?? ""; + const right = items[j] ?? ""; + const sameInPred = pred.get(left) === pred.get(right); + const sameInGold = gold.get(left) === gold.get(right); + if (sameInPred && sameInGold) { + tp += 1; + } else if (sameInPred && !sameInGold) { + fp += 1; + } else if (!sameInPred && sameInGold) { + fn += 1; + } else { + tn += 1; + } + } + } + return { + truePositives: tp, + falsePositives: fp, + falseNegatives: fn, + trueNegatives: tn, + }; +} + +export type PairwiseScores = { + precision: number; + recall: number; + f1: number; +}; + +/** + * Pairwise precision/recall/F1 over the same-cluster relation. Returns 1 for + * a metric when its denominator is 0 (the partition has no positive + * judgments to score). This matches the convention used by `pytrec_eval` and + * `scikit-learn.metrics.cluster.pair_confusion_matrix`. + */ +export function pairwiseScores( + predicted: Partition, + golden: Partition, +): PairwiseScores { + const { truePositives, falsePositives, falseNegatives } = pairwiseAgreement( + predicted, + golden, + ); + const precision = + truePositives + falsePositives === 0 + ? 1 + : truePositives / (truePositives + falsePositives); + const recall = + truePositives + falseNegatives === 0 + ? 1 + : truePositives / (truePositives + falseNegatives); + const f1 = + precision + recall === 0 + ? 0 + : (2 * precision * recall) / (precision + recall); + return { precision, recall, f1 }; +} + +function choose2(n: number): number { + return n < 2 ? 0 : (n * (n - 1)) / 2; +} + +/** + * Adjusted Rand Index. Range is [-0.5, 1]; 0 means agreement + * at chance level, 1 means perfect agreement, negative means worse than + * chance. Hubert & Arabie 1985. When both partitions have a single + * cluster (or all singletons), ARI is defined as 1. 
+ */ +export function adjustedRandIndex( + predicted: Partition, + golden: Partition, +): number { + const items = collectItems(predicted, golden); + if (items.length < 2) { + return 1; + } + const pred = assignClusterIds(predicted, items); + const gold = assignClusterIds(golden, items); + + const predIds = [...new Set(pred.values())]; + const goldIds = [...new Set(gold.values())]; + + // Contingency matrix counts[i][j] = items in pred cluster i and gold cluster j. + const counts = new Map>(); + for (const item of items) { + const p = pred.get(item) ?? -1; + const g = gold.get(item) ?? -1; + let row = counts.get(p); + if (!row) { + row = new Map(); + counts.set(p, row); + } + row.set(g, (row.get(g) ?? 0) + 1); + } + + const predSizes = predIds.map((id) => + items.reduce((sum, item) => (pred.get(item) === id ? sum + 1 : sum), 0), + ); + const goldSizes = goldIds.map((id) => + items.reduce((sum, item) => (gold.get(item) === id ? sum + 1 : sum), 0), + ); + + let index = 0; + for (const row of counts.values()) { + for (const value of row.values()) { + index += choose2(value); + } + } + + const sumPredChoose = predSizes.reduce((sum, size) => sum + choose2(size), 0); + const sumGoldChoose = goldSizes.reduce((sum, size) => sum + choose2(size), 0); + const total = choose2(items.length); + if (total === 0) { + return 1; + } + const expected = (sumPredChoose * sumGoldChoose) / total; + const maxIndex = (sumPredChoose + sumGoldChoose) / 2; + if (maxIndex === expected) { + return 1; + } + return (index - expected) / (maxIndex - expected); +} + +export type ClusterScore = { + precision: number; + recall: number; + f1: number; + ari: number; + pairCounts: PairwiseAgreement; + itemCount: number; +}; + +export function scoreClustering( + predicted: Partition, + golden: Partition, +): ClusterScore { + const items = collectItems(predicted, golden); + const { precision, recall, f1 } = pairwiseScores(predicted, golden); + const ari = adjustedRandIndex(predicted, golden); + return { + 
precision, + recall, + f1, + ari, + pairCounts: pairwiseAgreement(predicted, golden), + itemCount: items.length, + }; +} diff --git a/src/domains/evaluation/dedup-scorer.test.ts b/src/domains/evaluation/dedup-scorer.test.ts new file mode 100644 index 0000000..33f7427 --- /dev/null +++ b/src/domains/evaluation/dedup-scorer.test.ts @@ -0,0 +1,129 @@ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import type { + AdapterReply, + DedupConfig, + Scenario, +} from "../../shared/types/contracts.ts"; +import { + coerceDedupPayload, + resolveDedupPayload, + scoreScenarioDedup, +} from "./dedup-scorer.ts"; + +function makeTempDir(prefix: string): string { + return mkdtempSync(join(tmpdir(), `agentprobe-${prefix}-`)); +} + +function buildConfig(overrides: Partial = {}): DedupConfig { + return { + goldenClusters: overrides.goldenClusters ?? [ + ["a", "b"], + ["c", "d"], + ], + weights: overrides.weights ?? { precision: 1, recall: 1, f1: 1, ari: 1 }, + passThreshold: overrides.passThreshold ?? 
0.6, + source: overrides.source, + }; +} + +function buildScenario(config: DedupConfig | undefined): Scenario { + return { + id: "dedup-test", + name: "Dedup test", + tags: [], + turns: [], + sessions: [], + expectations: { + mustInclude: [], + mustNotInclude: [], + expectedTools: [], + failureModes: [], + }, + dedup: config, + }; +} + +function buildReply(payload: unknown): AdapterReply { + return { + assistantText: "...", + toolCalls: [], + rawExchange: { dedup: payload } as unknown as AdapterReply["rawExchange"], + latencyMs: 0, + usage: {}, + }; +} + +describe("coerceDedupPayload", () => { + test("accepts a bare list of clusters", () => { + expect(coerceDedupPayload([["a", "b"], ["c"]]).clusters).toEqual([ + ["a", "b"], + ["c"], + ]); + }); + + test("accepts {clusters: [[...]]}", () => { + expect(coerceDedupPayload({ clusters: [["a"], ["b"]] }).clusters).toEqual([ + ["a"], + ["b"], + ]); + }); + + test("ignores non-string members and empty clusters", () => { + expect( + coerceDedupPayload([["a", 1, null, "b"], [], ["c"]]).clusters, + ).toEqual([["a", "b"], ["c"]]); + }); +}); + +describe("resolveDedupPayload", () => { + test("loads fixture", () => { + const dir = makeTempDir("dedup-fixture"); + writeFileSync( + join(dir, "dd.json"), + JSON.stringify({ clusters: [["a", "b"], ["c"]] }), + "utf8", + ); + const config = buildConfig({ source: { fixture: "dd.json" } }); + const result = resolveDedupPayload(config, { + scenariosPath: join(dir, "scenarios.yaml"), + }); + expect(result.source).toBe("fixture"); + expect(result.payload.clusters).toEqual([["a", "b"], ["c"]]); + }); +}); + +describe("scoreScenarioDedup", () => { + test("returns undefined when no dedup block on scenario", () => { + expect(scoreScenarioDedup(buildScenario(undefined), {})).toBeUndefined(); + }); + + test("perfect match passes with all metrics 1.0 (ARI mapped to 1)", () => { + const scenario = buildScenario(buildConfig()); + const reply = buildReply({ + clusters: [ + ["a", "b"], + ["c", "d"], + 
], + }); + const result = scoreScenarioDedup(scenario, { lastAdapterReply: reply }); + expect(result?.passed).toBe(true); + expect(result?.weightedScore).toBeCloseTo(1.0, 6); + }); + + test("complete disagreement drops the weighted score below threshold", () => { + const scenario = buildScenario(buildConfig()); + // Golden: {a, b}, {c, d}. Predicted: {a, c}, {b, d} — pairwise F1=0, ARI=-0.5 + const reply = buildReply({ + clusters: [ + ["a", "c"], + ["b", "d"], + ], + }); + const result = scoreScenarioDedup(scenario, { lastAdapterReply: reply }); + expect(result?.passed).toBe(false); + }); +}); diff --git a/src/domains/evaluation/dedup-scorer.ts b/src/domains/evaluation/dedup-scorer.ts new file mode 100644 index 0000000..7720a22 --- /dev/null +++ b/src/domains/evaluation/dedup-scorer.ts @@ -0,0 +1,174 @@ +import { existsSync, readFileSync, statSync } from "node:fs"; +import { dirname, isAbsolute, resolve } from "node:path"; + +import type { + AdapterReply, + DedupConfig, + DedupMetricScore, + DedupScore, + EvalSource, + JsonValue, + Scenario, +} from "../../shared/types/contracts.ts"; +import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts"; +import { logWarn } from "../../shared/utils/logging.ts"; +import { scoreClustering } from "./clustering.ts"; + +const DEFAULT_RAW_EXCHANGE_KEY = "dedup"; + +export type DedupPayload = { + clusters?: string[][]; +}; + +function resolveFixturePath( + scenariosPath: string | undefined, + fixture: string, +): string { + if (isAbsolute(fixture)) { + return fixture; + } + if (!scenariosPath) { + return resolve(fixture); + } + let base: string; + try { + base = + existsSync(scenariosPath) && statSync(scenariosPath).isDirectory() + ? 
scenariosPath + : dirname(scenariosPath); + } catch { + base = dirname(scenariosPath); + } + return resolve(base, fixture); +} + +export function coerceDedupPayload(payload: unknown): DedupPayload { + if (!payload) { + return {}; + } + // Accept `{clusters: [[...], [...]]}` or a bare `[[...], [...]]`. + if (Array.isArray(payload)) { + return { clusters: coerceClusters(payload) }; + } + if (typeof payload === "object") { + const record = payload as Record; + if (Array.isArray(record.clusters)) { + return { clusters: coerceClusters(record.clusters) }; + } + } + return {}; +} + +function coerceClusters(values: unknown[]): string[][] { + const clusters: string[][] = []; + for (const cluster of values) { + if (!Array.isArray(cluster)) { + continue; + } + const items = cluster.flatMap((item) => + typeof item === "string" ? [item] : [], + ); + if (items.length > 0) { + clusters.push(items); + } + } + return clusters; +} + +export type DedupSourceContext = { + scenariosPath?: string; + lastAdapterReply?: AdapterReply; +}; + +export type ResolvedDedup = { + payload: DedupPayload; + source: EvalSource; +}; + +export function resolveDedupPayload( + config: DedupConfig, + context: DedupSourceContext, +): ResolvedDedup { + const fixture = config.source?.fixture; + if (fixture) { + const resolved = resolveFixturePath(context.scenariosPath, fixture); + if (!existsSync(resolved)) { + throw new AgentProbeRuntimeError(`Dedup fixture not found: ${resolved}`); + } + return { + payload: coerceDedupPayload(JSON.parse(readFileSync(resolved, "utf8"))), + source: "fixture", + }; + } + const key = config.source?.rawExchangeKey ?? 
DEFAULT_RAW_EXCHANGE_KEY; + const rawExchange = context.lastAdapterReply?.rawExchange; + if (rawExchange && typeof rawExchange === "object") { + const candidate = (rawExchange as Record)[key]; + if (candidate !== undefined) { + return { + payload: coerceDedupPayload(candidate), + source: "raw_exchange", + }; + } + } + return { payload: {}, source: "missing" }; +} + +export function scoreScenarioDedup( + scenario: Scenario, + context: DedupSourceContext, +): DedupScore | undefined { + const config = scenario.dedup; + if (!config) { + return undefined; + } + + let resolution: ResolvedDedup; + try { + resolution = resolveDedupPayload(config, context); + } catch (error) { + logWarn( + `Dedup scorer failed to resolve payload for ${scenario.id}: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + resolution = { payload: {}, source: "missing" }; + } + + const predicted = resolution.payload.clusters ?? []; + const result = scoreClustering(predicted, config.goldenClusters); + + const metrics: DedupMetricScore[] = [ + { + metric: "precision", + value: result.precision, + weight: config.weights.precision, + }, + { metric: "recall", value: result.recall, weight: config.weights.recall }, + { metric: "f1", value: result.f1, weight: config.weights.f1 }, + // Map ARI from [-1, 1] to [0, 1] so it composes with the others. + { metric: "ari", value: (result.ari + 1) / 2, weight: config.weights.ari }, + ]; + const totalWeight = metrics.reduce( + (sum, m) => sum + Math.max(0, m.weight), + 0, + ); + const weightedScore = + totalWeight === 0 + ? 0 + : metrics.reduce( + (sum, m) => (m.weight > 0 ? 
sum + m.value * m.weight : sum), + 0, + ) / totalWeight; + + return { + metrics, + weightedScore, + passThreshold: config.passThreshold, + passed: weightedScore >= config.passThreshold, + predictedClusters: predicted.map((c) => [...c]), + goldenClusters: config.goldenClusters.map((c) => [...c]), + itemCount: result.itemCount, + source: resolution.source, + }; +} diff --git a/src/domains/evaluation/demotion-match.test.ts b/src/domains/evaluation/demotion-match.test.ts new file mode 100644 index 0000000..dea55d1 --- /dev/null +++ b/src/domains/evaluation/demotion-match.test.ts @@ -0,0 +1,154 @@ +import { describe, expect, test } from "bun:test"; + +import { + assertCascadeBounded, + assertExpectedSet, + assertTimestampDiscipline, + scoreDemotion, +} from "./demotion-match.ts"; + +describe("assertExpectedSet", () => { + test("perfect match yields F1 1.0", () => { + const result = assertExpectedSet(["a", "b"], ["a", "b"]); + expect(result.f1).toBeCloseTo(1.0, 6); + expect(result.falsePositives).toEqual([]); + expect(result.falseNegatives).toEqual([]); + }); + + test("touched the wrong edge yields FP and precision drop", () => { + const result = assertExpectedSet(["a", "wrong"], ["a", "b"]); + expect(result.falsePositives).toEqual(["wrong"]); + expect(result.falseNegatives).toEqual(["b"]); + expect(result.precision).toBeCloseTo(0.5, 6); + expect(result.recall).toBeCloseTo(0.5, 6); + }); + + test("missed an expected edge yields FN and recall drop", () => { + const result = assertExpectedSet(["a"], ["a", "b"]); + expect(result.recall).toBeCloseTo(0.5, 6); + expect(result.precision).toBeCloseTo(1.0, 6); + }); + + test("nothing expected and nothing touched is perfect", () => { + const result = assertExpectedSet([], []); + expect(result.f1).toBeCloseTo(1.0, 6); + }); +}); + +describe("assertTimestampDiscipline", () => { + test("a clean retract (expired_at only) has no violation", () => { + const violations = assertTimestampDiscipline( + [ + { + uuid: "edge1", + expiredAtSet: 
true, + invalidAtSet: false, + }, + ], + [], + ); + expect(violations).toEqual([]); + }); + + test("a retract that also set invalid_at is flagged", () => { + const violations = assertTimestampDiscipline( + [ + { + uuid: "edge1", + expiredAtSet: true, + invalidAtSet: true, + }, + ], + [], + ); + expect(violations).toHaveLength(1); + expect(violations[0]?.expectation).toBe("retract_only_expired"); + }); + + test("a soft_delete that set only one timestamp is flagged", () => { + const violations = assertTimestampDiscipline( + [], + [ + { + uuid: "edge2", + expiredAtSet: true, + invalidAtSet: false, + }, + ], + ); + expect(violations).toHaveLength(1); + expect(violations[0]?.expectation).toBe("soft_delete_both"); + }); +}); + +describe("assertCascadeBounded", () => { + test("touching only direct neighbors is bounded", () => { + const result = assertCascadeBounded( + ["e_ab", "e_bc"], + ["e_ab", "e_bc"], + ["e_cd"], + ); + expect(result.bounded).toBe(true); + expect(result.touchedTangentialNeighbors).toEqual([]); + expect(result.directNeighborF1).toBeCloseTo(1.0, 6); + }); + + test("touching a 2-hop edge is a runaway-demotion failure", () => { + // Graph A -> B -> C -> D. Invalidate B. Direct: (A,B), (B,C). Tangential: (C,D). 
+ const result = assertCascadeBounded( + ["e_ab", "e_bc", "e_cd"], + ["e_ab", "e_bc"], + ["e_cd"], + ); + expect(result.bounded).toBe(false); + expect(result.touchedTangentialNeighbors).toEqual(["e_cd"]); + }); + + test("missing a direct neighbor lowers directNeighborF1 but stays bounded", () => { + const result = assertCascadeBounded(["e_ab"], ["e_ab", "e_bc"], ["e_cd"]); + expect(result.bounded).toBe(true); + expect(result.missedDirectNeighbors).toEqual(["e_bc"]); + expect(result.directNeighborF1).toBeLessThan(1.0); + }); +}); + +describe("scoreDemotion", () => { + test("perfect demotion of the expected set passes", () => { + const result = scoreDemotion({ + observedDemotions: ["e1", "e2"], + expectedDemotions: ["e1", "e2"], + retractActions: [ + { uuid: "e1", expiredAtSet: true, invalidAtSet: false }, + { uuid: "e2", expiredAtSet: true, invalidAtSet: false }, + ], + }); + expect(result.weightedScore).toBeCloseTo(1.0, 6); + expect(result.passed).toBe(true); + }); + + test("a timestamp violation is a hard fail regardless of set match", () => { + const result = scoreDemotion({ + observedDemotions: ["e1"], + expectedDemotions: ["e1"], + retractActions: [ + { uuid: "e1", expiredAtSet: true, invalidAtSet: true }, // wrong + ], + }); + expect(result.timestampViolations).toHaveLength(1); + expect(result.passed).toBe(false); + }); + + test("a runaway cascade is a hard fail", () => { + const result = scoreDemotion({ + observedDemotions: ["e_ab", "e_bc", "e_cd"], + expectedDemotions: ["e_ab", "e_bc"], + cascade: { + touched: ["e_ab", "e_bc", "e_cd"], + expectedDirectNeighbors: ["e_ab", "e_bc"], + tangentialEdges: ["e_cd"], + }, + }); + expect(result.cascade?.bounded).toBe(false); + expect(result.passed).toBe(false); + }); +}); diff --git a/src/domains/evaluation/demotion-match.ts b/src/domains/evaluation/demotion-match.ts new file mode 100644 index 0000000..26b4ab3 --- /dev/null +++ b/src/domains/evaluation/demotion-match.ts @@ -0,0 +1,323 @@ +/** + * Demotion-correctness 
primitives for the demotion-precision scorer. + * + * These score the *structural* half of demotion correctness — which edges + * the dream pass actually touched vs which it was supposed to touch. They + * cover: + * + * - **P-1.3 retract-vs-soft-delete:** did `_retract_edges` set only + * `expired_at`, and did `_soft_delete_edges` set both? Scored by + * `assertTimestampDiscipline`. + * - **P0.3a stale-fact deprecation:** did the pass demote items that were + * genuinely stale and leave fresh items alone? Scored by + * `assertExpectedSet`. + * - **P0.3b scoped cascading expiry:** did the cascade touch the entity's + * direct neighbors and nothing 2+ hops away? Scored by + * `assertCascadeBounded`. + * + * The LLM-judged half (was the demotion semantically warranted?) goes + * through the existing `judgeResponse` pipeline; this module is only the + * deterministic structural check. + * + * All functions are pure. + */ + +export type DemotionAction = { + /** UUID of the edge or memory that was demoted. */ + uuid: string; + /** Optional human label for reports. */ + label?: string; + /** `expired_at` / `invalid_at` flags set by the operation. */ + expiredAtSet: boolean; + invalidAtSet: boolean; + /** New status property, if any. */ + status?: string; +}; + +export type SetCheckResult = { + /** Items the dream pass correctly touched. */ + truePositives: string[]; + /** Items it touched but shouldn't have. */ + falsePositives: string[]; + /** Items it missed. */ + falseNegatives: string[]; + precision: number; + recall: number; + f1: number; +}; + +function normalize(value: string): string { + return value.trim(); +} + +function dedup(values: readonly string[]): string[] { + return [...new Set(values.map(normalize))]; +} + +/** + * Set-level precision/recall over which UUIDs were touched vs the + * `expected` set. 
The denominators degrade gracefully: + * - empty expected + empty observed = perfect score + * - empty expected + nonempty observed = precision 0, recall 1 + * - nonempty expected + empty observed = precision 0, recall 0 + */ +export function assertExpectedSet( + observed: readonly string[], + expected: readonly string[], +): SetCheckResult { + const observed_ = dedup(observed); + const expected_ = dedup(expected); + const expectedSet = new Set(expected_); + const observedSet = new Set(observed_); + const tp = observed_.filter((id) => expectedSet.has(id)); + const fp = observed_.filter((id) => !expectedSet.has(id)); + const fn = expected_.filter((id) => !observedSet.has(id)); + const precision = + observed_.length === 0 + ? expected_.length === 0 + ? 1 + : 0 + : tp.length / observed_.length; + const recall = expected_.length === 0 ? 1 : tp.length / expected_.length; + const f1 = + precision + recall === 0 + ? 0 + : (2 * precision * recall) / (precision + recall); + return { + truePositives: tp, + falsePositives: fp, + falseNegatives: fn, + precision, + recall, + f1, + }; +} + +export type TimestampViolation = { + uuid: string; + expectation: "retract_only_expired" | "soft_delete_both"; + observed: { expiredAtSet: boolean; invalidAtSet: boolean }; + message: string; +}; + +/** + * Verify the Snodgrass bi-temporal discipline for a list of demotions. + * + * `retract` actions must set only `expired_at` (transaction-time + * retraction). `soft_delete` actions must set BOTH `expired_at` and + * `invalid_at` (the world changed AND we recorded it). Returns a list of + * violations; empty list means the discipline held. 
+ */ +export function assertTimestampDiscipline( + retractActions: readonly DemotionAction[], + softDeleteActions: readonly DemotionAction[], +): TimestampViolation[] { + const violations: TimestampViolation[] = []; + for (const action of retractActions) { + if (!action.expiredAtSet || action.invalidAtSet) { + violations.push({ + uuid: action.uuid, + expectation: "retract_only_expired", + observed: { + expiredAtSet: action.expiredAtSet, + invalidAtSet: action.invalidAtSet, + }, + message: `retract must set expired_at only; got expired_at=${action.expiredAtSet}, invalid_at=${action.invalidAtSet}`, + }); + } + } + for (const action of softDeleteActions) { + if (!action.expiredAtSet || !action.invalidAtSet) { + violations.push({ + uuid: action.uuid, + expectation: "soft_delete_both", + observed: { + expiredAtSet: action.expiredAtSet, + invalidAtSet: action.invalidAtSet, + }, + message: `soft_delete must set both expired_at and invalid_at; got expired_at=${action.expiredAtSet}, invalid_at=${action.invalidAtSet}`, + }); + } + } + return violations; +} + +export type CascadeCheckResult = { + /** Edges the cascade touched that should have been touched (1-hop). */ + touchedDirectNeighbors: string[]; + /** Edges 1-hop away that the cascade was supposed to touch but didn't. */ + missedDirectNeighbors: string[]; + /** Edges 2+ hops away that the cascade touched (RUNAWAY DEMOTION — failure). */ + touchedTangentialNeighbors: string[]; + /** True when no tangential edges were touched. The single-hop discipline rule. */ + bounded: boolean; + /** Set-level F1 over the expected-direct set. */ + directNeighborF1: number; +}; + +/** + * P0.3b single-hop cascade check. + * + * `expectedDirectNeighbors` is the set of edges that should be demoted when + * the entity is invalidated (its direct attachments). `tangentialEdges` is + * the set of 2+ hop edges that must NOT be touched. `touched` is the actual + * list of edges the cascade demoted. 
+ */ +export function assertCascadeBounded( + touched: readonly string[], + expectedDirectNeighbors: readonly string[], + tangentialEdges: readonly string[], +): CascadeCheckResult { + const touched_ = dedup(touched); + const expected_ = dedup(expectedDirectNeighbors); + const tangential_ = dedup(tangentialEdges); + const expectedSet = new Set(expected_); + const tangentialSet = new Set(tangential_); + const touchedSet = new Set(touched_); + + const touchedDirect = touched_.filter((id) => expectedSet.has(id)); + const missedDirect = expected_.filter((id) => !touchedSet.has(id)); + const touchedTangential = touched_.filter((id) => tangentialSet.has(id)); + + const setResult = assertExpectedSet( + touched_.filter((id) => expectedSet.has(id) || tangentialSet.has(id)), + expected_, + ); + + return { + touchedDirectNeighbors: touchedDirect, + missedDirectNeighbors: missedDirect, + touchedTangentialNeighbors: touchedTangential, + bounded: touchedTangential.length === 0, + directNeighborF1: setResult.f1, + }; +} + +export type DemotionMetricKey = + | "set_precision" + | "set_recall" + | "set_f1" + | "timestamp_discipline" + | "cascade_bounded" + | "cascade_direct_f1"; + +export type DemotionMetricScore = { + metric: DemotionMetricKey; + value: number; + weight: number; +}; + +export type DemotionMatchInput = { + observedDemotions: readonly string[]; + expectedDemotions: readonly string[]; + retractActions?: readonly DemotionAction[]; + softDeleteActions?: readonly DemotionAction[]; + cascade?: { + touched: readonly string[]; + expectedDirectNeighbors: readonly string[]; + tangentialEdges: readonly string[]; + }; + weights?: Partial>; + passThreshold?: number; +}; + +export type DemotionMatchResult = { + metrics: DemotionMetricScore[]; + weightedScore: number; + passThreshold: number; + passed: boolean; + set: SetCheckResult; + timestampViolations: TimestampViolation[]; + cascade?: CascadeCheckResult; +}; + +const DEFAULT_DEMOTION_WEIGHTS: Required> = { + set_precision: 1, 
+ set_recall: 1, + set_f1: 1, + timestamp_discipline: 1, + cascade_bounded: 1, + cascade_direct_f1: 1, +}; + +const DEFAULT_DEMOTION_THRESHOLD = 0.6; + +/** + * Aggregate the structural side of demotion correctness. The LLM-judged + * "was this demotion warranted?" half is scored separately via the + * existing `judgeResponse` path; this returns deterministic metrics that + * can be asserted in CI without an LLM call. + */ +export function scoreDemotion(input: DemotionMatchInput): DemotionMatchResult { + const set = assertExpectedSet( + input.observedDemotions, + input.expectedDemotions, + ); + const violations = assertTimestampDiscipline( + input.retractActions ?? [], + input.softDeleteActions ?? [], + ); + const cascade = input.cascade + ? assertCascadeBounded( + input.cascade.touched, + input.cascade.expectedDirectNeighbors, + input.cascade.tangentialEdges, + ) + : undefined; + + const timestampScore = violations.length === 0 ? 1 : 0; + + const weights = { ...DEFAULT_DEMOTION_WEIGHTS, ...(input.weights ?? {}) }; + const metrics: DemotionMetricScore[] = [ + { + metric: "set_precision", + value: set.precision, + weight: weights.set_precision, + }, + { metric: "set_recall", value: set.recall, weight: weights.set_recall }, + { metric: "set_f1", value: set.f1, weight: weights.set_f1 }, + { + metric: "timestamp_discipline", + value: timestampScore, + weight: weights.timestamp_discipline, + }, + ]; + if (cascade) { + metrics.push( + { + metric: "cascade_bounded", + value: cascade.bounded ? 1 : 0, + weight: weights.cascade_bounded, + }, + { + metric: "cascade_direct_f1", + value: cascade.directNeighborF1, + weight: weights.cascade_direct_f1, + }, + ); + } + + const totalWeight = metrics.reduce( + (sum, m) => sum + Math.max(0, m.weight), + 0, + ); + const weightedScore = + totalWeight === 0 + ? 0 + : metrics.reduce( + (sum, m) => (m.weight > 0 ? sum + m.value * m.weight : sum), + 0, + ) / totalWeight; + const passThreshold = input.passThreshold ?? 
DEFAULT_DEMOTION_THRESHOLD; + const hardFail = + violations.length > 0 || (cascade !== undefined && !cascade.bounded); + return { + metrics, + weightedScore, + passThreshold, + passed: !hardFail && weightedScore >= passThreshold, + set, + timestampViolations: violations, + cascade, + }; +} diff --git a/src/domains/evaluation/demotion-scorer.test.ts b/src/domains/evaluation/demotion-scorer.test.ts new file mode 100644 index 0000000..d16cf46 --- /dev/null +++ b/src/domains/evaluation/demotion-scorer.test.ts @@ -0,0 +1,159 @@ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import type { + AdapterReply, + DemotionConfig, + Scenario, +} from "../../shared/types/contracts.ts"; +import { + coerceDemotionPayload, + resolveDemotionPayload, + scoreScenarioDemotion, +} from "./demotion-scorer.ts"; + +function makeTempDir(prefix: string): string { + return mkdtempSync(join(tmpdir(), `agentprobe-${prefix}-`)); +} + +function buildConfig(overrides: Partial = {}): DemotionConfig { + return { + expectedDemotions: overrides.expectedDemotions ?? ["e1", "e2"], + expectedRetracts: overrides.expectedRetracts, + cascade: overrides.cascade, + weights: overrides.weights ?? { + set_precision: 1, + set_recall: 1, + set_f1: 1, + timestamp_discipline: 1, + cascade_bounded: 1, + cascade_direct_f1: 1, + }, + passThreshold: overrides.passThreshold ?? 
0.6, + source: overrides.source, + }; +} + +function buildScenario(config: DemotionConfig | undefined): Scenario { + return { + id: "demotion-test", + name: "Demotion test", + tags: [], + turns: [], + sessions: [], + expectations: { + mustInclude: [], + mustNotInclude: [], + expectedTools: [], + failureModes: [], + }, + demotion: config, + }; +} + +function buildReply(payload: unknown): AdapterReply { + return { + assistantText: "...", + toolCalls: [], + rawExchange: { + demotions: payload, + } as unknown as AdapterReply["rawExchange"], + latencyMs: 0, + usage: {}, + }; +} + +describe("coerceDemotionPayload", () => { + test("extracts observed UUIDs from `observed` and `demotions` keys", () => { + expect(coerceDemotionPayload({ observed: ["a", "b"] }).observed).toEqual([ + "a", + "b", + ]); + expect(coerceDemotionPayload({ demotions: ["c"] }).observed).toEqual(["c"]); + }); + + test("extracts cascade and action records", () => { + const payload = coerceDemotionPayload({ + observed: ["a"], + cascade_touched: ["e1", "e2"], + retract_actions: [ + { uuid: "e1", expired_at_set: true, invalid_at_set: false }, + ], + }); + expect(payload.cascadeTouched).toEqual(["e1", "e2"]); + expect(payload.retractActions?.[0]?.uuid).toBe("e1"); + }); +}); + +describe("resolveDemotionPayload", () => { + test("loads fixture relative to scenarios path", () => { + const dir = makeTempDir("demotion-fixture"); + const fp = join(dir, "demo.json"); + writeFileSync( + fp, + JSON.stringify({ observed: ["e1", "e2"], cascade_touched: ["e1"] }), + "utf8", + ); + const config = buildConfig({ source: { fixture: "demo.json" } }); + const result = resolveDemotionPayload(config, { + scenariosPath: join(dir, "scenarios.yaml"), + }); + expect(result.source).toBe("fixture"); + expect(result.payload.observed).toEqual(["e1", "e2"]); + expect(result.payload.cascadeTouched).toEqual(["e1"]); + }); + + test("reads from rawExchange when no fixture configured", () => { + const result = 
resolveDemotionPayload(buildConfig(), { + lastAdapterReply: buildReply({ observed: ["e1"] }), + }); + expect(result.source).toBe("raw_exchange"); + expect(result.payload.observed).toEqual(["e1"]); + }); + + test("missing source returns empty payload", () => { + expect(resolveDemotionPayload(buildConfig(), {}).source).toBe("missing"); + }); +}); + +describe("scoreScenarioDemotion", () => { + test("returns undefined when no demotion block on the scenario", () => { + expect(scoreScenarioDemotion(buildScenario(undefined), {})).toBeUndefined(); + }); + + test("perfect demotion passes", () => { + const scenario = buildScenario(buildConfig()); + const reply = buildReply({ observed: ["e1", "e2"] }); + const result = scoreScenarioDemotion(scenario, { lastAdapterReply: reply }); + expect(result?.passed).toBe(true); + expect(result?.weightedScore).toBeGreaterThan(0.6); + }); + + test("runaway cascade flips passed to false", () => { + const scenario = buildScenario( + buildConfig({ + expectedDemotions: ["e_ab", "e_bc"], + cascade: { + expectedDirectNeighbors: ["e_ab", "e_bc"], + tangentialEdges: ["e_cd"], + }, + }), + ); + const reply = buildReply({ + observed: ["e_ab", "e_bc", "e_cd"], + cascade_touched: ["e_ab", "e_bc", "e_cd"], + }); + const result = scoreScenarioDemotion(scenario, { lastAdapterReply: reply }); + expect(result?.cascadeBounded).toBe(false); + expect(result?.passed).toBe(false); + }); + + test("missing source produces a failing score with source=missing", () => { + const scenario = buildScenario(buildConfig()); + const result = scoreScenarioDemotion(scenario, {}); + expect(result?.source).toBe("missing"); + expect(result?.passed).toBe(false); + }); +}); diff --git a/src/domains/evaluation/demotion-scorer.ts b/src/domains/evaluation/demotion-scorer.ts new file mode 100644 index 0000000..e1ffb69 --- /dev/null +++ b/src/domains/evaluation/demotion-scorer.ts @@ -0,0 +1,191 @@ +import { existsSync, readFileSync, statSync } from "node:fs"; +import { dirname, 
isAbsolute, resolve } from "node:path"; + +import type { + AdapterReply, + DemotionAction, + DemotionConfig, + DemotionScore, + EvalSource, + JsonValue, + Scenario, +} from "../../shared/types/contracts.ts"; +import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts"; +import { logWarn } from "../../shared/utils/logging.ts"; +import { scoreDemotion } from "./demotion-match.ts"; + +const DEFAULT_RAW_EXCHANGE_KEY = "demotions"; + +export type DemotionPayload = { + /** Observed demotion UUIDs. */ + observed?: string[]; + /** Optional raw retract / soft-delete action records for Snodgrass check. */ + retractActions?: DemotionAction[]; + softDeleteActions?: DemotionAction[]; + /** Observed cascade edge UUIDs. */ + cascadeTouched?: string[]; +}; + +function resolveFixturePath( + scenariosPath: string | undefined, + fixture: string, +): string { + if (isAbsolute(fixture)) { + return fixture; + } + if (!scenariosPath) { + return resolve(fixture); + } + let base: string; + try { + base = + existsSync(scenariosPath) && statSync(scenariosPath).isDirectory() + ? scenariosPath + : dirname(scenariosPath); + } catch { + base = dirname(scenariosPath); + } + return resolve(base, fixture); +} + +export function coerceDemotionPayload(payload: unknown): DemotionPayload { + if (!payload || typeof payload !== "object" || Array.isArray(payload)) { + return {}; + } + const record = payload as Record; + const observed = Array.isArray(record.observed) + ? record.observed.filter((id): id is string => typeof id === "string") + : Array.isArray(record.demotions) + ? record.demotions.filter((id): id is string => typeof id === "string") + : undefined; + const cascadeTouched = Array.isArray(record.cascade_touched) + ? record.cascade_touched.filter( + (id): id is string => typeof id === "string", + ) + : undefined; + const retractActions = Array.isArray(record.retract_actions) + ? 
record.retract_actions.flatMap(coerceAction) + : undefined; + const softDeleteActions = Array.isArray(record.soft_delete_actions) + ? record.soft_delete_actions.flatMap(coerceAction) + : undefined; + return { observed, cascadeTouched, retractActions, softDeleteActions }; +} + +function coerceAction(value: unknown): DemotionAction[] { + if (!value || typeof value !== "object" || Array.isArray(value)) { + return []; + } + const obj = value as Record; + const uuid = typeof obj.uuid === "string" ? obj.uuid : undefined; + if (!uuid) { + return []; + } + return [ + { + uuid, + label: typeof obj.label === "string" ? obj.label : undefined, + expiredAtSet: obj.expired_at_set === true || obj.expiredAtSet === true, + invalidAtSet: obj.invalid_at_set === true || obj.invalidAtSet === true, + status: typeof obj.status === "string" ? obj.status : undefined, + }, + ]; +} + +export type DemotionSourceContext = { + scenariosPath?: string; + lastAdapterReply?: AdapterReply; +}; + +export type ResolvedDemotion = { + payload: DemotionPayload; + source: EvalSource; +}; + +export function resolveDemotionPayload( + config: DemotionConfig, + context: DemotionSourceContext, +): ResolvedDemotion { + const fixture = config.source?.fixture; + if (fixture) { + const resolved = resolveFixturePath(context.scenariosPath, fixture); + if (!existsSync(resolved)) { + throw new AgentProbeRuntimeError( + `Demotion fixture not found: ${resolved}`, + ); + } + return { + payload: coerceDemotionPayload( + JSON.parse(readFileSync(resolved, "utf8")), + ), + source: "fixture", + }; + } + const key = config.source?.rawExchangeKey ?? 
DEFAULT_RAW_EXCHANGE_KEY; + const rawExchange = context.lastAdapterReply?.rawExchange; + if (rawExchange && typeof rawExchange === "object") { + const candidate = (rawExchange as Record)[key]; + if (candidate !== undefined) { + return { + payload: coerceDemotionPayload(candidate), + source: "raw_exchange", + }; + } + } + return { payload: {}, source: "missing" }; +} + +export function scoreScenarioDemotion( + scenario: Scenario, + context: DemotionSourceContext, +): DemotionScore | undefined { + const config = scenario.demotion; + if (!config) { + return undefined; + } + + let resolution: ResolvedDemotion; + try { + resolution = resolveDemotionPayload(config, context); + } catch (error) { + logWarn( + `Demotion scorer failed to resolve payload for ${scenario.id}: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + resolution = { payload: {}, source: "missing" }; + } + + const observed = resolution.payload.observed ?? []; + const cascadeTouched = resolution.payload.cascadeTouched ?? []; + const cascadeConfig = config.cascade; + const cascade = cascadeConfig + ? { + touched: cascadeTouched.length > 0 ? 
cascadeTouched : observed, + expectedDirectNeighbors: cascadeConfig.expectedDirectNeighbors, + tangentialEdges: cascadeConfig.tangentialEdges, + } + : undefined; + + const match = scoreDemotion({ + observedDemotions: observed, + expectedDemotions: config.expectedDemotions, + retractActions: resolution.payload.retractActions, + softDeleteActions: resolution.payload.softDeleteActions, + cascade, + weights: config.weights, + passThreshold: config.passThreshold, + }); + + return { + metrics: match.metrics, + weightedScore: match.weightedScore, + passThreshold: config.passThreshold, + passed: match.passed, + observed, + expected: [...config.expectedDemotions], + cascadeBounded: match.cascade?.bounded, + timestampViolationCount: match.timestampViolations.length, + source: resolution.source, + }; +} diff --git a/src/domains/evaluation/procedure-match.test.ts b/src/domains/evaluation/procedure-match.test.ts new file mode 100644 index 0000000..fcb5e7c --- /dev/null +++ b/src/domains/evaluation/procedure-match.test.ts @@ -0,0 +1,160 @@ +import { describe, expect, test } from "bun:test"; + +import { + longestCommonSubsequenceLength, + orderSimilarity, + parameterCoverage, + scoreProcedure, + stepCoverage, +} from "./procedure-match.ts"; + +describe("stepCoverage", () => { + test("perfect coverage yields F1 of 1.0", () => { + const result = stepCoverage(["a", "b", "c"], ["a", "b", "c"]); + expect(result.f1).toBeCloseTo(1.0, 6); + expect(result.matchedSteps).toEqual(["a", "b", "c"]); + expect(result.missingSteps).toEqual([]); + expect(result.extraSteps).toEqual([]); + }); + + test("missing one step drops recall", () => { + const result = stepCoverage(["a", "b"], ["a", "b", "c"]); + expect(result.precision).toBeCloseTo(1.0, 6); + expect(result.recall).toBeCloseTo(2 / 3, 6); + expect(result.missingSteps).toEqual(["c"]); + }); + + test("extra step drops precision", () => { + const result = stepCoverage(["a", "b", "junk"], ["a", "b"]); + expect(result.precision).toBeCloseTo(2 / 3, 
6); + expect(result.recall).toBeCloseTo(1.0, 6); + expect(result.extraSteps).toEqual(["junk"]); + }); + + test("normalization is case-insensitive and whitespace-trimmed", () => { + const result = stepCoverage( + [" Open Ticket ", "ASSIGN"], + ["open ticket", "assign"], + ); + expect(result.f1).toBeCloseTo(1.0, 6); + }); +}); + +describe("longestCommonSubsequenceLength", () => { + test("identical sequences yield length |seq|", () => { + expect( + longestCommonSubsequenceLength(["a", "b", "c"], ["a", "b", "c"]), + ).toBe(3); + }); + + test("disjoint sequences yield 0", () => { + expect(longestCommonSubsequenceLength(["a", "b"], ["c", "d"])).toBe(0); + }); + + test("classic ABCBDAB / BDCAB example yields 4", () => { + // LCS of "abcbdab" and "bdcab" is "bcab" (length 4) — a canonical CLRS case. + expect( + longestCommonSubsequenceLength( + ["a", "b", "c", "b", "d", "a", "b"], + ["b", "d", "c", "a", "b"], + ), + ).toBe(4); + }); +}); + +describe("orderSimilarity", () => { + test("identical order is 1.0", () => { + expect(orderSimilarity(["a", "b", "c"], ["a", "b", "c"])).toBeCloseTo( + 1.0, + 6, + ); + }); + + test("reversed order with shared elements drops below 1.0", () => { + // LCS of [a,b,c] and [c,b,a] is 1 (b alone, or a/c alone); max length 3 => 1/3. 
+ expect(orderSimilarity(["a", "b", "c"], ["c", "b", "a"])).toBeCloseTo( + 1 / 3, + 6, + ); + }); + + test("empty inputs degrade to 1", () => { + expect(orderSimilarity([], [])).toBe(1); + }); +}); + +describe("parameterCoverage", () => { + test("identical sets yield Jaccard 1.0", () => { + const result = parameterCoverage( + ["ticket_id", "assignee"], + ["ticket_id", "assignee"], + ); + expect(result.jaccard).toBeCloseTo(1.0, 6); + expect(result.missing).toEqual([]); + expect(result.extra).toEqual([]); + }); + + test("missing one and extra one penalize symmetrically", () => { + const result = parameterCoverage( + ["ticket_id", "junk"], + ["ticket_id", "assignee"], + ); + // matched=1, union={ticket_id, junk, assignee}=3 -> 1/3 + expect(result.jaccard).toBeCloseTo(1 / 3, 6); + }); +}); + +describe("scoreProcedure", () => { + test("perfect match passes with weightedScore 1.0", () => { + const result = scoreProcedure({ + predictedSteps: ["open ticket", "assign", "close"], + goldenSteps: ["open ticket", "assign", "close"], + predictedParameters: ["ticket_id"], + goldenParameters: ["ticket_id"], + }); + expect(result.weightedScore).toBeCloseTo(1.0, 6); + expect(result.passed).toBe(true); + }); + + test("missing one of three steps still passes at default threshold", () => { + const result = scoreProcedure({ + predictedSteps: ["open ticket", "assign"], + goldenSteps: ["open ticket", "assign", "close"], + }); + // step_coverage F1 = 2*(1.0 * 2/3)/(1.0 + 2/3) = 0.8 + // order similarity = LCS([open ticket, assign], [open ticket, assign, close]) / max(2,3) = 2/3 + // parameter_coverage (both empty) = 1 + // weighted avg = (0.8 + 2/3 + 1) / 3 = ~0.822 + expect(result.weightedScore).toBeGreaterThan(0.6); + expect(result.passed).toBe(true); + }); + + test("reversed order drops weighted score below threshold", () => { + const result = scoreProcedure({ + predictedSteps: ["close", "assign", "open ticket"], + goldenSteps: ["open ticket", "assign", "close"], + }); + // 
step_coverage F1 = 1.0; order similarity = 1/3; parameter = 1 + // weighted = (1.0 + 1/3 + 1) / 3 = ~0.778 — actually passes at 0.6 + expect(result.weightedScore).toBeGreaterThan(0); + // But if step_order is weighted heavily it should fail: + const heavy = scoreProcedure({ + predictedSteps: ["close", "assign", "open ticket"], + goldenSteps: ["open ticket", "assign", "close"], + weights: { step_coverage: 1, step_order: 5, parameter_coverage: 0 }, + }); + // (1.0 * 1 + 1/3 * 5 + 0) / 6 = ~0.444 + expect(heavy.weightedScore).toBeLessThan(0.6); + expect(heavy.passed).toBe(false); + }); + + test("zero weights collapse cleanly", () => { + const result = scoreProcedure({ + predictedSteps: ["a"], + goldenSteps: ["a"], + weights: { step_coverage: 0, step_order: 0, parameter_coverage: 0 }, + }); + expect(result.weightedScore).toBe(0); + expect(result.passed).toBe(false); + }); +}); diff --git a/src/domains/evaluation/procedure-match.ts b/src/domains/evaluation/procedure-match.ts new file mode 100644 index 0000000..c8869c3 --- /dev/null +++ b/src/domains/evaluation/procedure-match.ts @@ -0,0 +1,247 @@ +/** + * Procedure-structure matching primitives for the procedure-extraction scorer. + * + * Given a `golden` procedure (ordered list of step IDs / labels, optional + * parameter set) and a `predicted` procedure produced by the dream-pass + * procedure-synthesis pipeline (`ProcedureMemory`), score how well they + * match on three axes: + * + * 1. Step coverage — precision / recall / F1 over the set of step labels + * 2. Step order — LCS length / max(|a|, |b|) similarity over the + * two step sequences (not a Levenshtein distance) + * 3. Parameter coverage — Jaccard over named parameters + * + * No I/O. All math is pure and pinned by known-answer tests. + */ + +import { precisionAtK, recallAtK } from "./ranking.ts"; + +/** Normalize a step or parameter label for matching. 
*/ +function normalize(value: string): string { + return value.trim().toLowerCase(); +} + +function unique(values: readonly string[]): string[] { + return [...new Set(values.map(normalize))]; +} + +export type StepCoverage = { + precision: number; + recall: number; + f1: number; + matchedSteps: string[]; + missingSteps: string[]; + extraSteps: string[]; +}; + +/** + * Set-level coverage of predicted steps vs golden steps. Uses normalized + * exact equality (case-insensitive, whitespace-trimmed) — substring matching + * would be too lax for procedure step labels. + */ +export function stepCoverage( + predicted: readonly string[], + golden: readonly string[], +): StepCoverage { + const predicted_ = unique(predicted); + const golden_ = unique(golden); + const goldenSet = new Set(golden_); + const predictedSet = new Set(predicted_); + + const matched = predicted_.filter((step) => goldenSet.has(step)); + const missing = golden_.filter((step) => !predictedSet.has(step)); + const extra = predicted_.filter((step) => !goldenSet.has(step)); + + const precision = + predicted_.length === 0 + ? golden_.length === 0 + ? 1 + : 0 + : matched.length / predicted_.length; + const recall = golden_.length === 0 ? 1 : matched.length / golden_.length; + const f1 = + precision + recall === 0 + ? 0 + : (2 * precision * recall) / (precision + recall); + return { + precision, + recall, + f1, + matchedSteps: matched, + missingSteps: missing, + extraSteps: extra, + }; +} + +/** + * Length of the longest common subsequence between `a` and `b`. O(|a| * |b|) + * time and space — fine for procedures of <100 steps; we don't need the + * Hirschberg refinement. 
+ */ +export function longestCommonSubsequenceLength( + a: readonly string[], + b: readonly string[], +): number { + const an = a.map(normalize); + const bn = b.map(normalize); + const rows = an.length + 1; + const cols = bn.length + 1; + const dp = new Array(rows * cols).fill(0); + const at = (i: number, j: number): number => dp[i * cols + j] ?? 0; + const set = (i: number, j: number, value: number): void => { + dp[i * cols + j] = value; + }; + for (let i = 1; i < rows; i += 1) { + for (let j = 1; j < cols; j += 1) { + if (an[i - 1] === bn[j - 1]) { + set(i, j, at(i - 1, j - 1) + 1); + } else { + set(i, j, Math.max(at(i - 1, j), at(i, j - 1))); + } + } + } + return at(an.length, bn.length); +} + +/** + * Order similarity in [0, 1]. Computed as `LCS / max(|a|, |b|)`. Two + * identical sequences yield 1; two with no shared elements yield 0. + * + * This is intentionally different from raw Levenshtein. Procedures are + * order-sensitive but tolerate insertions/deletions; LCS-normalized + * similarity matches what the dream-pass extractor is trying to recover. 
+ */ +export function orderSimilarity( + predicted: readonly string[], + golden: readonly string[], +): number { + if (predicted.length === 0 && golden.length === 0) { + return 1; + } + const denom = Math.max(predicted.length, golden.length); + if (denom === 0) { + return 1; + } + const lcs = longestCommonSubsequenceLength(predicted, golden); + return lcs / denom; +} + +export type ParameterCoverage = { + jaccard: number; + matched: string[]; + missing: string[]; + extra: string[]; +}; + +export function parameterCoverage( + predicted: readonly string[], + golden: readonly string[], +): ParameterCoverage { + const predicted_ = unique(predicted); + const golden_ = unique(golden); + const goldenSet = new Set(golden_); + const predictedSet = new Set(predicted_); + const matched = predicted_.filter((p) => goldenSet.has(p)); + const missing = golden_.filter((p) => !predictedSet.has(p)); + const extra = predicted_.filter((p) => !goldenSet.has(p)); + const unionSize = new Set([...predicted_, ...golden_]).size; + const jaccard = unionSize === 0 ? 
1 : matched.length / unionSize; + return { jaccard, matched, missing, extra }; +} + +export type ProcedureMatchInput = { + predictedSteps: readonly string[]; + goldenSteps: readonly string[]; + predictedParameters?: readonly string[]; + goldenParameters?: readonly string[]; + weights?: { + step_coverage?: number; + step_order?: number; + parameter_coverage?: number; + }; + passThreshold?: number; +}; + +export type ProcedureMetricKey = + | "step_coverage" + | "step_order" + | "parameter_coverage"; + +export type ProcedureMetricScore = { + metric: ProcedureMetricKey; + value: number; + weight: number; +}; + +export type ProcedureScoreResult = { + metrics: ProcedureMetricScore[]; + weightedScore: number; + passThreshold: number; + passed: boolean; + step: StepCoverage; + order: number; + parameters: ParameterCoverage; +}; + +const DEFAULT_WEIGHTS = { + step_coverage: 1, + step_order: 1, + parameter_coverage: 1, +}; + +const DEFAULT_PASS_THRESHOLD = 0.6; + +/** + * Score a single (predicted, golden) procedure pair. Use this when the + * extractor produces one procedure per query. + */ +export function scoreProcedure( + input: ProcedureMatchInput, +): ProcedureScoreResult { + const step = stepCoverage(input.predictedSteps, input.goldenSteps); + const order = orderSimilarity(input.predictedSteps, input.goldenSteps); + const parameters = parameterCoverage( + input.predictedParameters ?? [], + input.goldenParameters ?? [], + ); + const weights = { ...DEFAULT_WEIGHTS, ...(input.weights ?? {}) }; + + const metrics: ProcedureMetricScore[] = [ + { metric: "step_coverage", value: step.f1, weight: weights.step_coverage }, + { metric: "step_order", value: order, weight: weights.step_order }, + { + metric: "parameter_coverage", + value: parameters.jaccard, + weight: weights.parameter_coverage, + }, + ]; + + const totalWeight = metrics.reduce( + (sum, item) => sum + Math.max(0, item.weight), + 0, + ); + const weightedScore = + totalWeight === 0 + ? 
0 + : metrics.reduce( + (sum, item) => + item.weight > 0 ? sum + item.value * item.weight : sum, + 0, + ) / totalWeight; + const passThreshold = input.passThreshold ?? DEFAULT_PASS_THRESHOLD; + return { + metrics, + weightedScore, + passThreshold, + passed: weightedScore >= passThreshold, + step, + order, + parameters, + }; +} + +/** + * Re-export the ranking primitives so callers that want to compose a + * procedure score with retrieval-style metrics have one entry point. + */ +export { precisionAtK, recallAtK }; diff --git a/src/domains/evaluation/procedure-scorer.test.ts b/src/domains/evaluation/procedure-scorer.test.ts new file mode 100644 index 0000000..0298a73 --- /dev/null +++ b/src/domains/evaluation/procedure-scorer.test.ts @@ -0,0 +1,136 @@ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import type { + AdapterReply, + ProcedureConfig, + Scenario, +} from "../../shared/types/contracts.ts"; +import { + coerceProcedurePayload, + resolveProcedurePayload, + scoreScenarioProcedure, +} from "./procedure-scorer.ts"; + +function makeTempDir(prefix: string): string { + return mkdtempSync(join(tmpdir(), `agentprobe-${prefix}-`)); +} + +function buildConfig( + overrides: Partial = {}, +): ProcedureConfig { + return { + goldenSteps: overrides.goldenSteps ?? ["open ticket", "assign", "close"], + goldenParameters: overrides.goldenParameters, + weights: overrides.weights ?? { + step_coverage: 1, + step_order: 1, + parameter_coverage: 1, + }, + passThreshold: overrides.passThreshold ?? 
0.6, + source: overrides.source, + }; +} + +function buildScenario(config: ProcedureConfig | undefined): Scenario { + return { + id: "procedure-test", + name: "Procedure test", + tags: [], + turns: [], + sessions: [], + expectations: { + mustInclude: [], + mustNotInclude: [], + expectedTools: [], + failureModes: [], + }, + procedure: config, + }; +} + +function buildReply(payload: unknown): AdapterReply { + return { + assistantText: "...", + toolCalls: [], + rawExchange: { + procedure: payload, + } as unknown as AdapterReply["rawExchange"], + latencyMs: 0, + usage: {}, + }; +} + +describe("coerceProcedurePayload", () => { + test("accepts a bare list of steps", () => { + expect(coerceProcedurePayload(["a", "b"]).steps).toEqual(["a", "b"]); + }); + + test("extracts steps and parameters from object payloads", () => { + const payload = coerceProcedurePayload({ + steps: ["open", "close"], + parameters: ["ticket_id"], + }); + expect(payload.steps).toEqual(["open", "close"]); + expect(payload.parameters).toEqual(["ticket_id"]); + }); +}); + +describe("resolveProcedurePayload", () => { + test("loads from fixture", () => { + const dir = makeTempDir("procedure-fixture"); + writeFileSync( + join(dir, "proc.json"), + JSON.stringify({ steps: ["open", "close"] }), + "utf8", + ); + const config = buildConfig({ source: { fixture: "proc.json" } }); + const result = resolveProcedurePayload(config, { + scenariosPath: join(dir, "scenarios.yaml"), + }); + expect(result.source).toBe("fixture"); + expect(result.payload.steps).toEqual(["open", "close"]); + }); + + test("loads from rawExchange", () => { + const result = resolveProcedurePayload(buildConfig(), { + lastAdapterReply: buildReply({ steps: ["a", "b"] }), + }); + expect(result.source).toBe("raw_exchange"); + }); +}); + +describe("scoreScenarioProcedure", () => { + test("returns undefined when no procedure block on scenario", () => { + expect( + scoreScenarioProcedure(buildScenario(undefined), {}), + ).toBeUndefined(); + }); + + 
test("perfect match passes", () => { + const scenario = buildScenario(buildConfig()); + const result = scoreScenarioProcedure(scenario, { + lastAdapterReply: buildReply({ + steps: ["open ticket", "assign", "close"], + }), + }); + expect(result?.weightedScore).toBeCloseTo(1.0, 6); + expect(result?.passed).toBe(true); + }); + + test("reordered steps drop weighted score", () => { + const scenario = buildScenario( + buildConfig({ + weights: { step_coverage: 1, step_order: 5, parameter_coverage: 0 }, + }), + ); + const result = scoreScenarioProcedure(scenario, { + lastAdapterReply: buildReply({ + steps: ["close", "assign", "open ticket"], + }), + }); + expect(result?.passed).toBe(false); + }); +}); diff --git a/src/domains/evaluation/procedure-scorer.ts b/src/domains/evaluation/procedure-scorer.ts new file mode 100644 index 0000000..0f98ccc --- /dev/null +++ b/src/domains/evaluation/procedure-scorer.ts @@ -0,0 +1,146 @@ +import { existsSync, readFileSync, statSync } from "node:fs"; +import { dirname, isAbsolute, resolve } from "node:path"; + +import type { + AdapterReply, + EvalSource, + JsonValue, + ProcedureConfig, + ProcedureScore, + Scenario, +} from "../../shared/types/contracts.ts"; +import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts"; +import { logWarn } from "../../shared/utils/logging.ts"; +import { scoreProcedure } from "./procedure-match.ts"; + +const DEFAULT_RAW_EXCHANGE_KEY = "procedure"; + +export type ProcedurePayload = { + steps?: string[]; + parameters?: string[]; +}; + +function resolveFixturePath( + scenariosPath: string | undefined, + fixture: string, +): string { + if (isAbsolute(fixture)) { + return fixture; + } + if (!scenariosPath) { + return resolve(fixture); + } + let base: string; + try { + base = + existsSync(scenariosPath) && statSync(scenariosPath).isDirectory() + ? 
scenariosPath + : dirname(scenariosPath); + } catch { + base = dirname(scenariosPath); + } + return resolve(base, fixture); +} + +export function coerceProcedurePayload(payload: unknown): ProcedurePayload { + if (!payload || typeof payload !== "object") { + return {}; + } + if (Array.isArray(payload)) { + return { + steps: payload.filter((s): s is string => typeof s === "string"), + }; + } + const record = payload as Record; + const steps = Array.isArray(record.steps) + ? record.steps.filter((s): s is string => typeof s === "string") + : undefined; + const parameters = Array.isArray(record.parameters) + ? record.parameters.filter((s): s is string => typeof s === "string") + : undefined; + return { steps, parameters }; +} + +export type ProcedureSourceContext = { + scenariosPath?: string; + lastAdapterReply?: AdapterReply; +}; + +export type ResolvedProcedure = { + payload: ProcedurePayload; + source: EvalSource; +}; + +export function resolveProcedurePayload( + config: ProcedureConfig, + context: ProcedureSourceContext, +): ResolvedProcedure { + const fixture = config.source?.fixture; + if (fixture) { + const resolved = resolveFixturePath(context.scenariosPath, fixture); + if (!existsSync(resolved)) { + throw new AgentProbeRuntimeError( + `Procedure fixture not found: ${resolved}`, + ); + } + return { + payload: coerceProcedurePayload( + JSON.parse(readFileSync(resolved, "utf8")), + ), + source: "fixture", + }; + } + const key = config.source?.rawExchangeKey ?? 
DEFAULT_RAW_EXCHANGE_KEY; + const rawExchange = context.lastAdapterReply?.rawExchange; + if (rawExchange && typeof rawExchange === "object") { + const candidate = (rawExchange as Record)[key]; + if (candidate !== undefined) { + return { + payload: coerceProcedurePayload(candidate), + source: "raw_exchange", + }; + } + } + return { payload: {}, source: "missing" }; +} + +export function scoreScenarioProcedure( + scenario: Scenario, + context: ProcedureSourceContext, +): ProcedureScore | undefined { + const config = scenario.procedure; + if (!config) { + return undefined; + } + + let resolution: ResolvedProcedure; + try { + resolution = resolveProcedurePayload(config, context); + } catch (error) { + logWarn( + `Procedure scorer failed to resolve payload for ${scenario.id}: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + resolution = { payload: {}, source: "missing" }; + } + + const match = scoreProcedure({ + predictedSteps: resolution.payload.steps ?? [], + goldenSteps: config.goldenSteps, + predictedParameters: resolution.payload.parameters, + goldenParameters: config.goldenParameters, + weights: config.weights, + passThreshold: config.passThreshold, + }); + + return { + metrics: match.metrics, + weightedScore: match.weightedScore, + passThreshold: config.passThreshold, + passed: match.passed, + predictedSteps: [...(resolution.payload.steps ?? 
[])], + goldenSteps: [...config.goldenSteps], + source: resolution.source, + }; +} diff --git a/src/domains/evaluation/ranking.test.ts b/src/domains/evaluation/ranking.test.ts new file mode 100644 index 0000000..96c4f26 --- /dev/null +++ b/src/domains/evaluation/ranking.test.ts @@ -0,0 +1,217 @@ +import { describe, expect, test } from "bun:test"; + +import { + buildRelevanceVector, + countUniqueGoldHits, + mrr, + ndcgAtK, + precisionAtK, + recallAtK, + scoreRanking, +} from "./ranking.ts"; + +describe("precisionAtK", () => { + test("counts hits in the top-k window", () => { + expect(precisionAtK([1, 1, 0], 2)).toBeCloseTo(1.0, 6); + expect(precisionAtK([1, 0, 1], 3)).toBeCloseTo(2 / 3, 6); + expect(precisionAtK([0, 0, 0], 3)).toBe(0); + }); + + test("uses k as the denominator even when fewer items were returned", () => { + // Two items returned, both relevant, but k=5 — short list still penalized. + expect(precisionAtK([1, 1], 5)).toBeCloseTo(2 / 5, 6); + }); + + test("returns 0 for non-positive k", () => { + expect(precisionAtK([1, 1, 1], 0)).toBe(0); + }); +}); + +describe("recallAtK", () => { + test("returns 1 when there are no expected items", () => { + expect(recallAtK([0, 0, 0], 5, 0)).toBe(1); + }); + + test("scales by total relevant", () => { + expect(recallAtK([1, 0, 1], 3, 2)).toBeCloseTo(1.0, 6); + expect(recallAtK([1, 0, 1], 3, 4)).toBeCloseTo(0.5, 6); + expect(recallAtK([1, 0, 0], 3, 2)).toBeCloseTo(0.5, 6); + }); + + test("only counts hits within the cutoff", () => { + expect(recallAtK([0, 0, 1], 2, 1)).toBe(0); + expect(recallAtK([0, 0, 1], 3, 1)).toBe(1); + }); +}); + +describe("mrr", () => { + test("returns the reciprocal of the first hit rank", () => { + expect(mrr([0, 0, 1])).toBeCloseTo(1 / 3, 6); + expect(mrr([1, 0, 0])).toBeCloseTo(1.0, 6); + expect(mrr([0, 1, 1])).toBeCloseTo(0.5, 6); + }); + + test("returns 0 when no hits", () => { + expect(mrr([0, 0, 0])).toBe(0); + }); + + test("respects the k cutoff", () => { + // Hit at rank 3 but k=2 — 
treat as no hit. + expect(mrr([0, 0, 1], 2)).toBe(0); + expect(mrr([0, 0, 1], 3)).toBeCloseTo(1 / 3, 6); + }); +}); + +describe("ndcgAtK", () => { + test("perfect ranking yields 1", () => { + expect(ndcgAtK([1, 1, 1], 3)).toBeCloseTo(1.0, 6); + expect(ndcgAtK([1, 1, 0], 3)).toBeCloseTo(1.0, 6); + }); + + test("NDCG of [1, 0, 1] with log2 discount", () => { + // DCG = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1 + 0 + 0.5 = 1.5 + // ideal DCG (sorted desc = [1, 1, 0]) = 1/log2(2) + 1/log2(3) + 0 = 1 + ~0.6309 = ~1.6309 + // NDCG = 1.5 / 1.6309 = ~0.9197 + expect(ndcgAtK([1, 0, 1], 3)).toBeCloseTo(0.91972, 4); + }); + + test("returns 0 when no relevant items exist", () => { + expect(ndcgAtK([0, 0, 0], 3)).toBe(0); + }); + + test("respects the k cutoff", () => { + // Relevant only at rank 3, k=2 — DCG over window is 0. + expect(ndcgAtK([0, 0, 1], 2)).toBe(0); + // Same vector, k=3, DCG = 1/log2(4) = 0.5, ideal = 1, => 0.5. + expect(ndcgAtK([0, 0, 1], 3)).toBeCloseTo(0.5, 6); + }); +}); + +describe("buildRelevanceVector", () => { + test("substring policy is case-insensitive and bidirectional", () => { + const returned = ["Sarah's email address", "Random other note"]; + const golden = ["sarah"]; + expect(buildRelevanceVector(returned, golden, "substring")).toEqual([1, 0]); + }); + + test("exact policy requires full normalized equality", () => { + expect( + buildRelevanceVector( + ["Atlas Project Status"], + ["atlas project status"], + "exact", + ), + ).toEqual([1]); + expect( + buildRelevanceVector( + ["Atlas Project"], + ["Atlas Project Status"], + "exact", + ), + ).toEqual([0]); + }); + + test("regex policy interprets the golden item as a pattern", () => { + expect(buildRelevanceVector(["budget: $50K"], ["\\$50k"], "regex")).toEqual( + [1], + ); + }); + + test("returns 0 for empty golden item to avoid false matches", () => { + expect(buildRelevanceVector(["anything"], [""], "substring")).toEqual([0]); + }); +}); + +describe("countUniqueGoldHits", () => { + test("dedupes 
duplicate returns against the same gold item", () => { + const returned = ["Sarah", "Sarah", "Atlas"]; + const golden = ["Sarah", "Atlas"]; + expect(countUniqueGoldHits(returned, golden, 5)).toBe(2); + }); + + test("respects k cutoff", () => { + const returned = ["Atlas", "Sarah"]; + const golden = ["Sarah", "Atlas"]; + expect(countUniqueGoldHits(returned, golden, 1)).toBe(1); + expect(countUniqueGoldHits(returned, golden, 2)).toBe(2); + }); +}); + +describe("scoreRanking", () => { + test("perfect top-k returns weightedScore 1 and passes", () => { + const result = scoreRanking({ + returned: ["sarahs email", "atlas project status"], + golden: ["sarah", "atlas project"], + k: 2, + }); + + expect(result.k).toBe(2); + expect(result.hitCount).toBe(2); + expect(result.forbiddenHits).toBe(0); + expect(result.weightedScore).toBeCloseTo(1.0, 6); + expect(result.passed).toBe(true); + }); + + test("missing gold items lower recall and weighted score", () => { + const result = scoreRanking({ + returned: ["unrelated note"], + golden: ["sarah", "atlas project"], + k: 5, + passThreshold: 0.5, + }); + + expect(result.hitCount).toBe(0); + expect(result.weightedScore).toBe(0); + expect(result.passed).toBe(false); + }); + + test("forbidden hits force a fail even when score is high", () => { + const result = scoreRanking({ + returned: ["sarah", "old budget figure"], + golden: ["sarah"], + forbidden: ["old budget"], + k: 2, + passThreshold: 0.3, + }); + + expect(result.forbiddenHits).toBe(1); + expect(result.passed).toBe(false); + }); + + test("weight=0 excludes a metric from weightedScore without dropping the report", () => { + const result = scoreRanking({ + returned: ["sarah"], + golden: ["sarah", "atlas"], + weights: { + precision_at_k: 1, + recall_at_k: 0, + mrr: 1, + ndcg_at_k: 1, + }, + k: 1, + }); + + const recall = result.metrics.find((item) => item.metric === "recall_at_k"); + expect(recall?.weight).toBe(0); + // recall at k=1 with 2 gold items is 0.5; weightedScore should 
ignore it. + // precision=1, mrr=1, ndcg=1 -> average 1.0 + expect(result.weightedScore).toBeCloseTo(1.0, 6); + }); + + test("defaults k to max(|returned|, |golden|, 1)", () => { + const result = scoreRanking({ + returned: ["a", "b", "c"], + golden: ["a"], + }); + expect(result.k).toBe(3); + }); + + test("zero golden items yields trivial recall=1 and a score driven by precision-style metrics", () => { + const result = scoreRanking({ + returned: [], + golden: [], + }); + const recall = result.metrics.find((item) => item.metric === "recall_at_k"); + expect(recall?.value).toBe(1); + }); +}); diff --git a/src/domains/evaluation/ranking.ts b/src/domains/evaluation/ranking.ts new file mode 100644 index 0000000..a449b43 --- /dev/null +++ b/src/domains/evaluation/ranking.ts @@ -0,0 +1,377 @@ +/** + * Pure information-retrieval ranking metrics. + * + * All functions take a `relevance` vector — the binary relevance (0 or 1) + * of the returned list at each rank position. They return values in [0, 1]. + * + * No I/O, no LLM calls; this module is intended to be the load-bearing math + * behind the YAML `retrieval:` scorer. Tests pin the algebra against + * known-answer cases. + */ + +/** Truncate to `k`, defaulting to the full length when `k` is undefined or invalid. */ +function clampK(length: number, k: number | undefined): number { + if (k === undefined || !Number.isFinite(k) || k <= 0) { + return length; + } + return Math.min(length, Math.floor(k)); +} + +/** + * Precision@k — fraction of the top-k returned items that are relevant. + * + * When `k` exceeds the returned list, the denominator stays at `k` so that + * a short list still gets penalized for not surfacing enough items. This + * matches the `pytrec_eval` convention. 
+ */ +export function precisionAtK(relevance: number[], k: number): number { + if (k <= 0) { + return 0; + } + const limit = Math.min(relevance.length, Math.floor(k)); + let hits = 0; + for (let index = 0; index < limit; index += 1) { + if ((relevance[index] ?? 0) > 0) { + hits += 1; + } + } + return hits / Math.floor(k); +} + +/** + * Recall@k — fraction of all relevant items that appear in the top-k. + * + * `totalRelevant` is the total number of items the suite expected to be + * relevant (the size of the golden set), not the count of relevant items + * actually returned. When `totalRelevant` is 0, recall is defined as 1 + * (no expectations means nothing to miss). + */ +export function recallAtK( + relevance: number[], + k: number, + totalRelevant: number, +): number { + if (totalRelevant <= 0) { + return 1; + } + if (k <= 0) { + return 0; + } + const limit = clampK(relevance.length, k); + let hits = 0; + for (let index = 0; index < limit; index += 1) { + if ((relevance[index] ?? 0) > 0) { + hits += 1; + } + } + return hits / totalRelevant; +} + +/** + * Mean reciprocal rank — `1 / rankOfFirstHit`, or 0 when no relevant item is + * returned. Computed for a single query (the "mean" is implicit when the + * caller averages across multiple queries). + * + * When `k` is provided, only the first `k` positions are considered, so a + * hit at rank `k + 1` is treated as no hit. + */ +export function mrr(relevance: number[], k?: number): number { + const limit = clampK(relevance.length, k); + for (let index = 0; index < limit; index += 1) { + if ((relevance[index] ?? 0) > 0) { + return 1 / (index + 1); + } + } + return 0; +} + +function dcgAtK(relevance: number[], k: number): number { + const limit = clampK(relevance.length, k); + let dcg = 0; + for (let index = 0; index < limit; index += 1) { + const rel = relevance[index] ?? 0; + if (rel <= 0) { + continue; + } + // log2(rank + 1) discount, with rank starting at 1. 
+ dcg += rel / Math.log2(index + 2); + } + return dcg; +} + +/** + * Normalized discounted cumulative gain at k. + * + * Uses the classic `log2(rank + 1)` discount and idealizes DCG against the + * relevance vector sorted descending. With binary relevance this collapses to + * the standard NDCG@k. + * + * When the ideal DCG is 0 (no relevant items expected), NDCG is defined as 0. + */ +export function ndcgAtK(relevance: number[], k: number): number { + if (k <= 0) { + return 0; + } + const idealRelevance = [...relevance].sort((left, right) => right - left); + const ideal = dcgAtK(idealRelevance, k); + if (ideal <= 0) { + return 0; + } + return dcgAtK(relevance, k) / ideal; +} + +export type RankingMetricKey = + | "precision_at_k" + | "recall_at_k" + | "mrr" + | "ndcg_at_k"; + +export type RankingMetricResult = { + metric: RankingMetricKey; + value: number; + weight: number; +}; + +export type RankingScoreResult = { + k: number; + totalRelevant: number; + totalReturned: number; + hitCount: number; + forbiddenHits: number; + metrics: RankingMetricResult[]; + /** Weighted average across the metrics that carry positive weight. */ + weightedScore: number; + /** True when score >= `passThreshold` AND no forbidden items appeared in top-k. */ + passed: boolean; +}; + +export type RankingWeights = Partial>; + +export type RankingScoreInput = { + /** The list of returned items, in rank order. */ + returned: string[]; + /** The golden set of relevant items. */ + golden: string[]; + /** + * Optional forbidden items. Any forbidden item that appears in the top-k + * forces `passed: false` and is reported via `forbiddenHits`. + */ + forbidden?: string[]; + /** Rank cutoff. Defaults to `Math.max(returned.length, golden.length)`. */ + k?: number; + /** Per-metric weights. Metrics with weight 0 (or absent) are still reported but excluded from `weightedScore`. */ + weights?: RankingWeights; + /** Match policy applied to each `returned` vs `golden` comparison. 
*/ + match?: MatchPolicy; + /** Pass threshold on the `weightedScore`. Defaults to 0.5. */ + passThreshold?: number; +}; + +export type MatchPolicy = "exact" | "substring" | "regex"; + +const DEFAULT_WEIGHTS: Required = { + precision_at_k: 1, + recall_at_k: 1, + mrr: 1, + ndcg_at_k: 1, +}; + +const DEFAULT_PASS_THRESHOLD = 0.5; + +function normalizeString(value: string): string { + return value.trim().toLowerCase(); +} + +function matchesItem( + returned: string, + expected: string, + policy: MatchPolicy, +): boolean { + switch (policy) { + case "exact": + return normalizeString(returned) === normalizeString(expected); + case "substring": { + const candidate = normalizeString(returned); + const needle = normalizeString(expected); + if (!needle) { + return false; + } + return candidate.includes(needle) || needle.includes(candidate); + } + case "regex": + try { + return new RegExp(expected, "i").test(returned); + } catch { + return false; + } + } +} + +/** + * Build the binary-relevance vector for `returned` against `golden`. + * + * Each returned item counts as a hit when any golden item matches under the + * supplied policy. Golden items can be matched by multiple returned items + * (i.e. duplicates in `returned` do not double-count gold coverage, but each + * occurrence is still marked relevant in the vector — this matches the + * standard IR convention because rank-based metrics naturally penalize + * duplicates via the discount and the `totalRelevant` denominator). + */ +export function buildRelevanceVector( + returned: string[], + golden: string[], + policy: MatchPolicy = "substring", +): number[] { + return returned.map((candidate) => + golden.some((expected) => matchesItem(candidate, expected, policy)) ? 1 : 0, + ); +} + +/** + * Count distinct gold items that the returned list covers in the top-k. + * + * This is the numerator used by `recallAtK` when we want recall to reflect + * *unique* gold coverage rather than total relevant returns. 
It tolerates + * duplicates in `returned` without double-counting. + */ +export function countUniqueGoldHits( + returned: string[], + golden: string[], + k: number, + policy: MatchPolicy = "substring", +): number { + if (k <= 0) { + return 0; + } + const limit = clampK(returned.length, k); + const matched = new Set(); + for (let index = 0; index < limit; index += 1) { + const candidate = returned[index] ?? ""; + for (let gIndex = 0; gIndex < golden.length; gIndex += 1) { + if (matched.has(gIndex)) { + continue; + } + if (matchesItem(candidate, golden[gIndex] ?? "", policy)) { + matched.add(gIndex); + } + } + } + return matched.size; +} + +function countForbiddenHits( + returned: string[], + forbidden: string[], + k: number, + policy: MatchPolicy, +): number { + if (forbidden.length === 0 || k <= 0) { + return 0; + } + const limit = clampK(returned.length, k); + let hits = 0; + for (let index = 0; index < limit; index += 1) { + const candidate = returned[index] ?? ""; + if ( + forbidden.some((forbiddenItem) => + matchesItem(candidate, forbiddenItem, policy), + ) + ) { + hits += 1; + } + } + return hits; +} + +/** + * Top-level ranking scorer. Computes the four canonical metrics and + * aggregates them under a weighted average. Forbidden items override the + * pass decision regardless of metric values. + */ +export function scoreRanking(input: RankingScoreInput): RankingScoreResult { + const policy = input.match ?? "substring"; + const k = clampK( + Math.max(input.returned.length, input.golden.length, 1), + input.k, + ); + const relevance = buildRelevanceVector(input.returned, input.golden, policy); + const uniqueHits = countUniqueGoldHits( + input.returned, + input.golden, + k, + policy, + ); + + const weights: Required = { + ...DEFAULT_WEIGHTS, + ...(input.weights ?? {}), + }; + + // Recall uses unique gold coverage to keep the math meaningful when the + // returned list contains duplicates. 
+ const recallVectorHits = uniqueHits; + const recall = + input.golden.length === 0 ? 1 : recallVectorHits / input.golden.length; + + const metrics: RankingMetricResult[] = [ + { + metric: "precision_at_k", + value: precisionAtK(relevance, k), + weight: weights.precision_at_k, + }, + { + metric: "recall_at_k", + value: recall, + weight: weights.recall_at_k, + }, + { + metric: "mrr", + value: mrr(relevance, k), + weight: weights.mrr, + }, + { + metric: "ndcg_at_k", + value: ndcgAtK(relevance, k), + weight: weights.ndcg_at_k, + }, + ]; + + const totalWeight = metrics.reduce( + (sum, item) => sum + (item.weight > 0 ? item.weight : 0), + 0, + ); + const weightedScore = + totalWeight === 0 + ? 0 + : metrics.reduce( + (sum, item) => + item.weight > 0 ? sum + item.value * item.weight : sum, + 0, + ) / totalWeight; + + const forbidden = input.forbidden ?? []; + const forbiddenHits = countForbiddenHits( + input.returned, + forbidden, + k, + policy, + ); + const passThreshold = input.passThreshold ?? DEFAULT_PASS_THRESHOLD; + const passed = forbiddenHits === 0 && weightedScore >= passThreshold; + + const hitCount = relevance.reduce( + (sum, value) => sum + (value > 0 ? 
1 : 0), + 0, + ); + + return { + k, + totalRelevant: input.golden.length, + totalReturned: input.returned.length, + hitCount, + forbiddenHits, + metrics, + weightedScore, + passed, + }; +} diff --git a/src/domains/evaluation/retrieval-scorer.test.ts b/src/domains/evaluation/retrieval-scorer.test.ts new file mode 100644 index 0000000..44a1455 --- /dev/null +++ b/src/domains/evaluation/retrieval-scorer.test.ts @@ -0,0 +1,241 @@ +import { describe, expect, test } from "bun:test"; +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import type { + AdapterReply, + RetrievalConfig, + Scenario, +} from "../../shared/types/contracts.ts"; +import { + coerceRetrievedItems, + resolveRetrievedItems, + scoreRetrieval, +} from "./retrieval-scorer.ts"; + +function makeTempDir(prefix: string): string { + return mkdtempSync(join(tmpdir(), `agentprobe-${prefix}-`)); +} + +function buildConfig( + overrides: Partial = {}, +): RetrievalConfig { + return { + golden: overrides.golden ?? ["Sarah's email", "Atlas project status"], + forbidden: overrides.forbidden ?? [], + k: overrides.k, + weights: overrides.weights ?? { + precision_at_k: 1, + recall_at_k: 1, + mrr: 1, + ndcg_at_k: 1, + }, + passThreshold: overrides.passThreshold ?? 0.5, + match: overrides.match ?? "substring", + source: overrides.source, + }; +} + +function buildScenario(retrieval: RetrievalConfig | undefined): Scenario { + return { + id: "retrieval-test", + name: "Retrieval Test", + tags: [], + turns: [], + sessions: [], + expectations: { + mustInclude: [], + mustNotInclude: [], + expectedTools: [], + failureModes: [], + }, + retrieval, + }; +} + +function buildReply(retrieved: unknown): AdapterReply { + return { + assistantText: "...", + toolCalls: [], + rawExchange: (retrieved === undefined + ? 
{} + : { retrieved }) as unknown as AdapterReply["rawExchange"], + latencyMs: 0, + usage: {}, + }; +} + +describe("coerceRetrievedItems", () => { + test("returns the string itself for a single-string payload", () => { + expect(coerceRetrievedItems("only one")).toEqual(["only one"]); + }); + + test("flattens arrays of strings", () => { + expect(coerceRetrievedItems(["a", "b"])).toEqual(["a", "b"]); + }); + + test("extracts label/text/name/summary/id from object payloads", () => { + const payload = [ + { label: "Sarah" }, + { text: "Atlas" }, + { name: "Marcus" }, + { summary: "Northstar" }, + { id: "fact-123" }, + ]; + expect(coerceRetrievedItems(payload)).toEqual([ + "Sarah", + "Atlas", + "Marcus", + "Northstar", + "fact-123", + ]); + }); + + test("ignores entries with no recognizable label", () => { + expect(coerceRetrievedItems([{ irrelevant: 42 }, null, undefined])).toEqual( + [], + ); + }); + + test("returns [] for non-array, non-string payloads", () => { + expect(coerceRetrievedItems(42)).toEqual([]); + expect(coerceRetrievedItems({ foo: "bar" })).toEqual([]); + }); +}); + +describe("resolveRetrievedItems", () => { + test("reads a JSON fixture relative to the scenarios path", () => { + const dir = makeTempDir("retrieval-fixture"); + const fixturePath = join(dir, "memories.json"); + writeFileSync( + fixturePath, + JSON.stringify(["Sarah's email", "Atlas project status"]), + "utf8", + ); + + const config = buildConfig({ + source: { fixture: "memories.json" }, + }); + + const result = resolveRetrievedItems(config, { + scenariosPath: join(dir, "scenarios.yaml"), + }); + + expect(result.source).toBe("fixture"); + expect(result.items).toEqual(["Sarah's email", "Atlas project status"]); + }); + + test("falls back to the default `retrieved` raw exchange key", () => { + const result = resolveRetrievedItems(buildConfig(), { + lastAdapterReply: buildReply([{ label: "Sarah" }]), + }); + expect(result.source).toBe("raw_exchange"); + expect(result.items).toEqual(["Sarah"]); 
+ }); + + test("honors a custom rawExchangeKey", () => { + const config = buildConfig({ + source: { rawExchangeKey: "memories" }, + }); + const reply: AdapterReply = { + assistantText: "...", + toolCalls: [], + rawExchange: { + memories: ["A", "B"], + } as unknown as AdapterReply["rawExchange"], + latencyMs: 0, + usage: {}, + }; + + const result = resolveRetrievedItems(config, { lastAdapterReply: reply }); + expect(result.items).toEqual(["A", "B"]); + expect(result.source).toBe("raw_exchange"); + }); + + test("returns `missing` source when no fixture or raw exchange field is available", () => { + const result = resolveRetrievedItems(buildConfig(), {}); + expect(result.source).toBe("missing"); + expect(result.items).toEqual([]); + }); + + test("throws for a missing fixture file", () => { + const config = buildConfig({ + source: { fixture: "/nonexistent/path/to/file.json" }, + }); + expect(() => resolveRetrievedItems(config, {})).toThrow( + /Retrieval fixture not found/, + ); + }); +}); + +describe("scoreRetrieval", () => { + test("returns undefined when the scenario has no retrieval block", () => { + expect(scoreRetrieval(buildScenario(undefined), {})).toBeUndefined(); + }); + + test("scores a perfect retrieval as passed and weightedScore 1", () => { + const scenario = buildScenario(buildConfig({ k: 2 })); + const reply = buildReply(["Sarah's email", "Atlas project status"]); + + const result = scoreRetrieval(scenario, { lastAdapterReply: reply }); + + expect(result).toBeDefined(); + expect(result?.source).toBe("raw_exchange"); + expect(result?.hitCount).toBe(2); + expect(result?.weightedScore).toBeCloseTo(1.0, 6); + expect(result?.passed).toBe(true); + }); + + test("flags a forbidden hit and forces a fail", () => { + const scenario = buildScenario( + buildConfig({ + golden: ["I do not have that"], + forbidden: ["$50K"], + k: 3, + passThreshold: 0.2, + }), + ); + const reply = buildReply([ + "I do not have that information", + "The Q2 marketing budget was $50K", + 
]); + + const result = scoreRetrieval(scenario, { lastAdapterReply: reply }); + + expect(result?.hitCount).toBeGreaterThan(0); + expect(result?.forbiddenHits).toBe(1); + expect(result?.passed).toBe(false); + }); + + test("missing source records a 0-hit score with source=missing", () => { + const scenario = buildScenario(buildConfig({ k: 5, passThreshold: 0.5 })); + const result = scoreRetrieval(scenario, {}); + + expect(result?.source).toBe("missing"); + expect(result?.hitCount).toBe(0); + expect(result?.passed).toBe(false); + }); + + test("loads retrieved items from a fixture relative to the scenarios path", () => { + const dir = makeTempDir("retrieval-fixture-scored"); + writeFileSync( + join(dir, "memories.json"), + JSON.stringify(["Sarah's email", "Atlas project status"]), + "utf8", + ); + const scenario = buildScenario( + buildConfig({ + k: 2, + source: { fixture: "memories.json" }, + }), + ); + + const result = scoreRetrieval(scenario, { + scenariosPath: join(dir, "scenarios.yaml"), + }); + + expect(result?.source).toBe("fixture"); + expect(result?.passed).toBe(true); + }); +}); diff --git a/src/domains/evaluation/retrieval-scorer.ts b/src/domains/evaluation/retrieval-scorer.ts new file mode 100644 index 0000000..4a774fa --- /dev/null +++ b/src/domains/evaluation/retrieval-scorer.ts @@ -0,0 +1,205 @@ +import { existsSync, readFileSync, statSync } from "node:fs"; +import { dirname, isAbsolute, resolve } from "node:path"; + +import type { + AdapterReply, + JsonValue, + RetrievalConfig, + RetrievalScore, + Scenario, +} from "../../shared/types/contracts.ts"; +import { AgentProbeRuntimeError } from "../../shared/utils/errors.ts"; +import { logWarn } from "../../shared/utils/logging.ts"; +import { scoreRanking } from "./ranking.ts"; + +const DEFAULT_RAW_EXCHANGE_KEY = "retrieved"; + +/** + * Convert a raw exchange/fixture payload into a flat list of strings. 
+ * + * Accepts: + * - `["item one", "item two"]` + * - `[{ label: "foo" }, { id: "bar", label: "bar" }]` + * - `[{ name: "foo" }]` (falls back to `name` then `id`) + * - A single string (treated as a one-element list) + * + * Anything else returns an empty list and logs a warning — the scorer will + * then reasonably fail the scenario for missing data, rather than throwing + * and crashing the whole suite. + */ +export function coerceRetrievedItems(payload: unknown): string[] { + if (typeof payload === "string") { + return [payload]; + } + if (!Array.isArray(payload)) { + return []; + } + const out: string[] = []; + for (const item of payload) { + if (typeof item === "string") { + out.push(item); + continue; + } + if (typeof item === "number" || typeof item === "boolean") { + out.push(String(item)); + continue; + } + if (!item || typeof item !== "object" || Array.isArray(item)) { + continue; + } + const record = item as Record; + const label = + record.label ?? + record.text ?? + record.title ?? + record.name ?? + record.summary ?? + record.id ?? + record.uuid; + if (typeof label === "string") { + out.push(label); + } else if (typeof label === "number") { + out.push(String(label)); + } + } + return out; +} + +/** + * Resolve a retrieval `source.fixture` path relative to the scenario YAML. + * When `scenariosPath` is undefined or the scenario was loaded from memory, + * absolute paths are honored and relative paths are resolved against CWD. + */ +function resolveFixturePath( + scenariosPath: string | undefined, + fixture: string, +): string { + if (isAbsolute(fixture)) { + return fixture; + } + if (!scenariosPath) { + return resolve(fixture); + } + let base: string; + try { + base = + existsSync(scenariosPath) && statSync(scenariosPath).isDirectory() + ? 
scenariosPath + : dirname(scenariosPath); + } catch { + base = dirname(scenariosPath); + } + return resolve(base, fixture); +} + +function readFixture(fixturePath: string): unknown { + const contents = readFileSync(fixturePath, "utf8"); + return JSON.parse(contents) as unknown; +} + +export type RetrievalSourceContext = { + scenariosPath?: string; + lastAdapterReply?: AdapterReply; +}; + +export type RetrievedItemsResult = { + items: string[]; + source: RetrievalScore["source"]; +}; + +/** + * Resolve the actual list of retrieved items at scoring time. + * + * Resolution order: + * 1. `retrieval.source.fixture` — read JSON file, coerce to strings. + * 2. `retrieval.source.rawExchangeKey` (or `retrieved` by default) on the + * last assistant reply's `rawExchange`. + * + * Returns `{ items: [], source: "missing" }` when neither is available so + * the scorer can record an honest miss rather than guessing. + */ +export function resolveRetrievedItems( + config: RetrievalConfig, + context: RetrievalSourceContext, +): RetrievedItemsResult { + const fixture = config.source?.fixture; + if (fixture) { + const resolved = resolveFixturePath(context.scenariosPath, fixture); + if (!existsSync(resolved)) { + throw new AgentProbeRuntimeError( + `Retrieval fixture not found: ${resolved}`, + ); + } + const payload = readFixture(resolved); + const items = coerceRetrievedItems(payload); + return { items, source: "fixture" }; + } + + const rawExchangeKey = + config.source?.rawExchangeKey ?? DEFAULT_RAW_EXCHANGE_KEY; + const rawExchange = context.lastAdapterReply?.rawExchange; + if (rawExchange && typeof rawExchange === "object") { + const candidate = (rawExchange as Record)[ + rawExchangeKey + ]; + if (candidate !== undefined) { + const items = coerceRetrievedItems(candidate); + return { items, source: "raw_exchange" }; + } + } + + return { items: [], source: "missing" }; +} + +/** + * Score a scenario's retrieval block given a retrieved-list resolution + * context. 
Returns `undefined` when the scenario has no retrieval block, + * otherwise always returns a `RetrievalScore` — including for the `missing` + * source case (where the score will be 0 and `passed` will be false unless + * `passThreshold` is 0). + */ +export function scoreRetrieval( + scenario: Scenario, + context: RetrievalSourceContext, +): RetrievalScore | undefined { + const config = scenario.retrieval; + if (!config) { + return undefined; + } + + let resolution: RetrievedItemsResult; + try { + resolution = resolveRetrievedItems(config, context); + } catch (error) { + logWarn( + `Retrieval scoring failed to resolve items for scenario ${scenario.id}: ${ + error instanceof Error ? error.message : String(error) + }`, + ); + resolution = { items: [], source: "missing" }; + } + + const ranking = scoreRanking({ + returned: resolution.items, + golden: config.golden, + forbidden: config.forbidden, + k: config.k, + weights: config.weights, + match: config.match, + passThreshold: config.passThreshold, + }); + + return { + k: ranking.k, + totalRelevant: ranking.totalRelevant, + totalReturned: ranking.totalReturned, + hitCount: ranking.hitCount, + forbiddenHits: ranking.forbiddenHits, + metrics: ranking.metrics, + weightedScore: ranking.weightedScore, + passThreshold: config.passThreshold, + passed: ranking.passed, + returned: resolution.items, + source: resolution.source, + }; +} diff --git a/src/domains/evaluation/run-suite.ts b/src/domains/evaluation/run-suite.ts index b09049e..47c6637 100644 --- a/src/domains/evaluation/run-suite.ts +++ b/src/domains/evaluation/run-suite.ts @@ -6,11 +6,15 @@ import type { CheckpointAssertion, CheckpointResult, ConversationTurn, + DedupScore, + DemotionScore, Endpoints, JsonValue, JudgeDimensionScore, Persona, PresetSnapshot, + ProcedureScore, + RetrievalScore, Rubric, RubricScore, RunProgressEvent, @@ -38,12 +42,16 @@ import { parseScenariosInput, parseTimeOffset, } from "../validation/load-suite.ts"; +import { scoreScenarioDedup } from 
"./dedup-scorer.ts"; +import { scoreScenarioDemotion } from "./demotion-scorer.ts"; import { judgeResponse } from "./judge.ts"; import type { EndpointAdapter, EndpointAdapterFactory, LlmResponsesClient, } from "./ports.ts"; +import { scoreScenarioProcedure } from "./procedure-scorer.ts"; +import { scoreRetrieval } from "./retrieval-scorer.ts"; import { generatePersonaStep, resolvePersonaModel } from "./simulator.ts"; const resetsRequiringReinit = new Set(["new", "fresh_agent"]); @@ -159,6 +167,25 @@ export type RunRecorder = { overallScore: number; }, ) => Promise; + recordRetrievalResult?: ( + scenarioRunId: number, + options: { + scenario: Scenario; + score: RetrievalScore; + }, + ) => Promise; + recordDemotionResult?: ( + scenarioRunId: number, + options: { scenario: Scenario; score: DemotionScore }, + ) => Promise; + recordProcedureResult?: ( + scenarioRunId: number, + options: { scenario: Scenario; score: ProcedureScore }, + ) => Promise; + recordDedupResult?: ( + scenarioRunId: number, + options: { scenario: Scenario; score: DedupScore }, + ) => Promise; }; export type PreparedScenarioSelection = { @@ -1037,19 +1064,67 @@ export async function runScenario( }); } + const evalContext = { + scenariosPath: options.scenariosPath, + lastAdapterReply: lastReply, + }; + + const retrievalScore = scoreRetrieval(scenario, evalContext); + if (retrievalScore && scenarioRunId !== undefined) { + await options.recorder?.recordRetrievalResult?.(scenarioRunId, { + scenario, + score: retrievalScore, + }); + } + + const demotionScore = scoreScenarioDemotion(scenario, evalContext); + if (demotionScore && scenarioRunId !== undefined) { + await options.recorder?.recordDemotionResult?.(scenarioRunId, { + scenario, + score: demotionScore, + }); + } + + const procedureScore = scoreScenarioProcedure(scenario, evalContext); + if (procedureScore && scenarioRunId !== undefined) { + await options.recorder?.recordProcedureResult?.(scenarioRunId, { + scenario, + score: procedureScore, + }); 
+ } + + const dedupScore = scoreScenarioDedup(scenario, evalContext); + if (dedupScore && scenarioRunId !== undefined) { + await options.recorder?.recordDedupResult?.(scenarioRunId, { + scenario, + score: dedupScore, + }); + } + + const overallPassed = + score.passed && + (retrievalScore?.passed ?? true) && + (demotionScore?.passed ?? true) && + (procedureScore?.passed ?? true) && + (dedupScore?.passed ?? true); + const result: ScenarioRunResult = { scenarioId: scenario.id, scenarioName: scenario.name, personaId: persona.id, rubricId: rubric.id, userId: options.userId, - passed: score.passed, - failureKind: score.failureKind, + passed: overallPassed, + failureKind: overallPassed ? undefined : (score.failureKind ?? "agent"), overallScore: finalScore, transcript: fullTranscript, checkpoints, toolCallsByTurn, judgeScore: score, + retrievalScore, + demotionScore, + procedureScore, + dedupScore, renderedTurns, }; if (scenarioRunId !== undefined) { diff --git a/src/domains/reporting/render-report.ts b/src/domains/reporting/render-report.ts index 16f98cb..a6f9058 100644 --- a/src/domains/reporting/render-report.ts +++ b/src/domains/reporting/render-report.ts @@ -374,6 +374,38 @@ function buildDimensionRows(scenario: ScenarioRecord): TemplateObject[] { }); } +function buildRetrievalRows(scenario: ScenarioRecord): TemplateObject[] { + const scores = Array.isArray(scenario.retrievalScores) + ? scenario.retrievalScores + : []; + return scores.map((score) => ({ + metric: String((score as Record).metric ?? ""), + value: numberValue((score as Record).value) ?? 0, + weight: numberValue((score as Record).weight) ?? 0, + k: numberValue((score as Record).k) ?? 0, + weighted_score: + numberValue((score as Record).weighted_score) ?? 0, + pass_threshold: + numberValue((score as Record).pass_threshold) ?? 0, + passed: (score as Record).passed === true, + total_relevant: + numberValue((score as Record).total_relevant) ?? 
0, + total_returned: + numberValue((score as Record).total_returned) ?? 0, + hit_count: numberValue((score as Record).hit_count) ?? 0, + forbidden_hits: + numberValue((score as Record).forbidden_hits) ?? 0, + source: String((score as Record).source ?? ""), + returned: (score as Record).returned ?? [], + value_percent: scorePercent( + numberValue((score as Record).value), + ), + weighted_score_percent: scorePercent( + numberValue((score as Record).weighted_score), + ), + })); +} + function prepareScenarioView( scenario: ScenarioRecord, index: number, @@ -397,6 +429,8 @@ function prepareScenarioView( threshold_percent: scorePercent(scenario.passThreshold), turn_rows: buildTurnRows(scenario), dimension_rows: buildDimensionRows(scenario), + retrieval_rows: buildRetrievalRows(scenario), + retrieval_scores_pretty: prettyJson(scenario.retrievalScores), overall_notes: scenario.judge.overallNotes ?? "", judge_output_pretty: prettyJson(scenario.judge.output), error_pretty: prettyJson(scenario.error), diff --git a/src/domains/validation/load-suite.ts b/src/domains/validation/load-suite.ts index 0e69aef..735fb68 100644 --- a/src/domains/validation/load-suite.ts +++ b/src/domains/validation/load-suite.ts @@ -7,6 +7,11 @@ import type { CheckpointAssertion, CheckpointTurn, CliHarness, + DedupConfig, + DemotionCascade, + DemotionConfig, + DemotionMetricKey, + DreamSource, EndpointAuth, EndpointLogging, EndpointRequest, @@ -24,7 +29,13 @@ import type { PersonaDemographics, PersonaPersonality, Personas, + ProcedureConfig, + ProcedureMetricKey, ProcessedYamlFile, + RetrievalConfig, + RetrievalMatchPolicy, + RetrievalMetricKey, + RetrievalSource, Rubric, RubricDimension, RubricScale, @@ -951,6 +962,353 @@ function parseScenarioExpectations(value: unknown): ScenarioExpectations { return result; } +const RETRIEVAL_METRIC_KEYS: RetrievalMetricKey[] = [ + "precision_at_k", + "recall_at_k", + "mrr", + "ndcg_at_k", +]; + +const VALID_MATCH_POLICIES: RetrievalMatchPolicy[] = [ + "exact", + 
"substring", + "regex", +]; + +function parseRetrievalWeights( + value: unknown, +): Required { + const defaults: Required = { + precision_at_k: 1, + recall_at_k: 1, + mrr: 1, + ndcg_at_k: 1, + }; + if (!value || typeof value !== "object" || Array.isArray(value)) { + return defaults; + } + const raw = value as YamlObject; + for (const key of Object.keys(raw)) { + if (!RETRIEVAL_METRIC_KEYS.includes(key as RetrievalMetricKey)) { + throw new AgentProbeConfigError( + `Unknown retrieval metric key: ${key}. Allowed: ${RETRIEVAL_METRIC_KEYS.join(", ")}.`, + ); + } + } + for (const key of RETRIEVAL_METRIC_KEYS) { + const candidate = optionalNumber(raw[key]); + if (candidate !== undefined) { + if (candidate < 0) { + throw new AgentProbeConfigError( + `retrieval.weight.${key} must be non-negative.`, + ); + } + defaults[key] = candidate; + } + } + return defaults; +} + +function parseRetrievalSource(value: unknown): RetrievalSource | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, "retrieval.source must be an object."); + const fixture = optionalString(raw.fixture); + const rawExchangeKey = optionalString(raw.raw_exchange_key); + if (!fixture && !rawExchangeKey) { + return undefined; + } + return { + fixture, + rawExchangeKey, + }; +} + +function parseRetrievalConfig(value: unknown): RetrievalConfig | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, "scenario.retrieval must be an object."); + + const golden = stringArray(raw.golden); + if (golden.length === 0) { + throw new AgentProbeConfigError( + "scenario.retrieval.golden must be a non-empty list of strings.", + ); + } + + const forbidden = stringArray(raw.forbidden); + + const k = optionalNumber(raw.k); + if (k !== undefined && (!Number.isFinite(k) || k <= 0)) { + throw new AgentProbeConfigError( + "scenario.retrieval.k must be a positive integer when provided.", + ); + } + + const matchValue = optionalString(raw.match) ?? 
"substring"; + if (!VALID_MATCH_POLICIES.includes(matchValue as RetrievalMatchPolicy)) { + throw new AgentProbeConfigError( + `scenario.retrieval.match must be one of: ${VALID_MATCH_POLICIES.join(", ")}.`, + ); + } + + const passThreshold = optionalNumber(raw.pass_threshold); + if ( + passThreshold !== undefined && + (!Number.isFinite(passThreshold) || passThreshold < 0 || passThreshold > 1) + ) { + throw new AgentProbeConfigError( + "scenario.retrieval.pass_threshold must be between 0 and 1 when provided.", + ); + } + + return { + golden, + forbidden, + k: k !== undefined ? Math.floor(k) : undefined, + weights: parseRetrievalWeights(raw.weight ?? raw.weights), + passThreshold: passThreshold ?? 0.5, + match: matchValue as RetrievalMatchPolicy, + source: parseRetrievalSource(raw.source), + }; +} + +const DEMOTION_METRIC_KEYS: DemotionMetricKey[] = [ + "set_precision", + "set_recall", + "set_f1", + "timestamp_discipline", + "cascade_bounded", + "cascade_direct_f1", +]; + +const PROCEDURE_METRIC_KEYS: ProcedureMetricKey[] = [ + "step_coverage", + "step_order", + "parameter_coverage", +]; + +const DEDUP_METRIC_KEYS = ["precision", "recall", "f1", "ari"] as const; + +function parseDreamSource( + value: unknown, + scopeLabel: string, +): DreamSource | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, `${scopeLabel}.source must be an object.`); + const fixture = optionalString(raw.fixture); + const rawExchangeKey = optionalString(raw.raw_exchange_key); + if (!fixture && !rawExchangeKey) { + return undefined; + } + return { fixture, rawExchangeKey }; +} + +function parseWeightedKeys( + value: unknown, + scopeLabel: string, + allowedKeys: readonly K[], + defaultWeight: number, +): Record { + const result = Object.fromEntries( + allowedKeys.map((key) => [key, defaultWeight]), + ) as Record; + if (!value) { + return result; + } + const raw = ensureObject(value, `${scopeLabel}.weight must be an object.`); + for (const key of 
Object.keys(raw)) { + if (!(allowedKeys as readonly string[]).includes(key)) { + throw new AgentProbeConfigError( + `Unknown ${scopeLabel} metric key: ${key}. Allowed: ${allowedKeys.join(", ")}.`, + ); + } + } + for (const key of allowedKeys) { + const candidate = optionalNumber(raw[key]); + if (candidate !== undefined) { + if (candidate < 0) { + throw new AgentProbeConfigError( + `${scopeLabel}.weight.${key} must be non-negative.`, + ); + } + result[key] = candidate; + } + } + return result; +} + +function parseUnitThreshold( + value: unknown, + scopeLabel: string, + fallback: number, +): number { + const parsed = optionalNumber(value); + if (parsed === undefined) { + return fallback; + } + if (!Number.isFinite(parsed) || parsed < 0 || parsed > 1) { + throw new AgentProbeConfigError( + `${scopeLabel}.pass_threshold must be between 0 and 1 when provided.`, + ); + } + return parsed; +} + +function parseDemotionCascade(value: unknown): DemotionCascade | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, "demotion.cascade must be an object."); + const expected = stringArray(raw.expected_direct_neighbors ?? raw.expected); + const tangential = stringArray(raw.tangential_edges ?? raw.tangential); + if (expected.length === 0 && tangential.length === 0) { + return undefined; + } + return { expectedDirectNeighbors: expected, tangentialEdges: tangential }; +} + +function parseDemotionConfig(value: unknown): DemotionConfig | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, "scenario.demotion must be an object."); + + const expectedDemotions = stringArray(raw.expected_demotions ?? 
raw.expected); + if (expectedDemotions.length === 0) { + throw new AgentProbeConfigError( + "scenario.demotion.expected_demotions must be a non-empty list of UUIDs.", + ); + } + const expectedRetracts = stringArray(raw.expected_retracts); + const cascade = parseDemotionCascade(raw.cascade); + const weights = parseWeightedKeys( + raw.weight ?? raw.weights, + "scenario.demotion", + DEMOTION_METRIC_KEYS, + 1, + ); + const passThreshold = parseUnitThreshold( + raw.pass_threshold, + "scenario.demotion", + 0.6, + ); + return { + expectedDemotions, + expectedRetracts: + expectedRetracts.length > 0 ? expectedRetracts : undefined, + cascade, + weights, + passThreshold, + source: parseDreamSource(raw.source, "scenario.demotion"), + }; +} + +function parseProcedureConfig(value: unknown): ProcedureConfig | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, "scenario.procedure must be an object."); + const goldenObj = + raw.golden && typeof raw.golden === "object" && !Array.isArray(raw.golden) + ? (raw.golden as YamlObject) + : undefined; + const stepsRaw = Array.isArray(raw.golden_steps) + ? raw.golden_steps + : Array.isArray(raw.steps) + ? raw.steps + : Array.isArray(goldenObj?.steps) + ? (goldenObj.steps as unknown[]) + : []; + const goldenSteps = stringArray(stepsRaw); + if (goldenSteps.length === 0) { + throw new AgentProbeConfigError( + "scenario.procedure.golden_steps (or golden.steps) must be a non-empty list of step labels.", + ); + } + const paramRaw = + raw.golden_parameters ?? + raw.parameters ?? + (goldenObj?.parameters as unknown); + const goldenParameters = stringArray(paramRaw); + const weights = parseWeightedKeys( + raw.weight ?? raw.weights, + "scenario.procedure", + PROCEDURE_METRIC_KEYS, + 1, + ); + const passThreshold = parseUnitThreshold( + raw.pass_threshold, + "scenario.procedure", + 0.6, + ); + return { + goldenSteps, + goldenParameters: + goldenParameters.length > 0 ? 
goldenParameters : undefined, + weights, + passThreshold, + source: parseDreamSource(raw.source, "scenario.procedure"), + }; +} + +function parseDedupConfig(value: unknown): DedupConfig | undefined { + if (!value) { + return undefined; + } + const raw = ensureObject(value, "scenario.dedup must be an object."); + const clustersRaw = raw.golden_clusters ?? raw.golden; + if (!Array.isArray(clustersRaw)) { + throw new AgentProbeConfigError( + "scenario.dedup.golden_clusters must be a list of clusters (each a list of UUIDs).", + ); + } + const clusters: string[][] = []; + for (const cluster of clustersRaw) { + if (!Array.isArray(cluster)) { + throw new AgentProbeConfigError( + "scenario.dedup.golden_clusters items must each be a list of UUIDs.", + ); + } + const items = stringArray(cluster); + if (items.length === 0) { + continue; + } + clusters.push(items); + } + if (clusters.length === 0) { + throw new AgentProbeConfigError( + "scenario.dedup.golden_clusters must contain at least one non-empty cluster.", + ); + } + const weights = parseWeightedKeys( + raw.weight ?? raw.weights, + "scenario.dedup", + DEDUP_METRIC_KEYS, + 1, + ); + const passThreshold = parseUnitThreshold( + raw.pass_threshold, + "scenario.dedup", + 0.6, + ); + return { + goldenClusters: clusters, + weights: { + precision: weights.precision, + recall: weights.recall, + f1: weights.f1, + ari: weights.ari, + }, + passThreshold, + source: parseDreamSource(raw.source, "scenario.dedup"), + }; +} + function parseSession(value: unknown): Session { const raw = ensureObject(value, "scenario session must be an object."); return { @@ -1014,6 +1372,10 @@ function parseScenario(value: unknown, defaults?: ScenarioDefaults): Scenario { ? 
raw.sessions.map((item) => parseSession(item)) : [], expectations: parseScenarioExpectations(raw.expectations), + retrieval: parseRetrievalConfig(raw.retrieval), + demotion: parseDemotionConfig(raw.demotion), + procedure: parseProcedureConfig(raw.procedure), + dedup: parseDedupConfig(raw.dedup), }; } diff --git a/src/providers/persistence/drizzle/postgres-schema.ts b/src/providers/persistence/drizzle/postgres-schema.ts index fc4a4fa..986c8f4 100644 --- a/src/providers/persistence/drizzle/postgres-schema.ts +++ b/src/providers/persistence/drizzle/postgres-schema.ts @@ -221,6 +221,109 @@ export const postgresJudgeDimensionScores = pgTable( (table) => [index("idx_judge_scores_scenario_run").on(table.scenarioRunId)], ); +export const postgresRetrievalScores = pgTable( + "retrieval_scores", + { + id: bigserial("id", { mode: "number" }).primaryKey(), + scenarioRunId: bigint("scenario_run_id", { mode: "number" }) + .notNull() + .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: doublePrecision("value").notNull(), + weight: doublePrecision("weight").notNull(), + k: integer("k").notNull(), + weightedScore: doublePrecision("weighted_score").notNull(), + passThreshold: doublePrecision("pass_threshold").notNull(), + passed: boolean("passed").notNull(), + totalRelevant: integer("total_relevant").notNull(), + totalReturned: integer("total_returned").notNull(), + hitCount: integer("hit_count").notNull(), + forbiddenHits: integer("forbidden_hits").notNull(), + source: text("source").notNull(), + returnedJson: jsonb("returned_json"), + createdAt: timestamp("created_at", { withTimezone: true }).notNull(), + }, + (table) => [ + index("idx_retrieval_scores_scenario_run").on(table.scenarioRunId), + index("idx_retrieval_scores_metric").on(table.metric), + ], +); + +export const postgresDemotionScores = pgTable( + "demotion_scores", + { + id: bigserial("id", { mode: "number" }).primaryKey(), + scenarioRunId: 
bigint("scenario_run_id", { mode: "number" }) + .notNull() + .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: doublePrecision("value").notNull(), + weight: doublePrecision("weight").notNull(), + weightedScore: doublePrecision("weighted_score").notNull(), + passThreshold: doublePrecision("pass_threshold").notNull(), + passed: boolean("passed").notNull(), + timestampViolationCount: integer("timestamp_violation_count").notNull(), + cascadeBounded: boolean("cascade_bounded"), + source: text("source").notNull(), + observedJson: jsonb("observed_json"), + expectedJson: jsonb("expected_json"), + createdAt: timestamp("created_at", { withTimezone: true }).notNull(), + }, + (table) => [ + index("idx_demotion_scores_scenario_run").on(table.scenarioRunId), + index("idx_demotion_scores_metric").on(table.metric), + ], +); + +export const postgresProcedureScores = pgTable( + "procedure_scores", + { + id: bigserial("id", { mode: "number" }).primaryKey(), + scenarioRunId: bigint("scenario_run_id", { mode: "number" }) + .notNull() + .references(() => postgresScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: doublePrecision("value").notNull(), + weight: doublePrecision("weight").notNull(), + weightedScore: doublePrecision("weighted_score").notNull(), + passThreshold: doublePrecision("pass_threshold").notNull(), + passed: boolean("passed").notNull(), + source: text("source").notNull(), + predictedJson: jsonb("predicted_json"), + goldenJson: jsonb("golden_json"), + createdAt: timestamp("created_at", { withTimezone: true }).notNull(), + }, + (table) => [ + index("idx_procedure_scores_scenario_run").on(table.scenarioRunId), + index("idx_procedure_scores_metric").on(table.metric), + ], +); + +export const postgresDedupScores = pgTable( + "dedup_scores", + { + id: bigserial("id", { mode: "number" }).primaryKey(), + scenarioRunId: bigint("scenario_run_id", { mode: "number" }) + .notNull() + 
.references(() => postgresScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: doublePrecision("value").notNull(), + weight: doublePrecision("weight").notNull(), + weightedScore: doublePrecision("weighted_score").notNull(), + passThreshold: doublePrecision("pass_threshold").notNull(), + passed: boolean("passed").notNull(), + itemCount: integer("item_count").notNull(), + source: text("source").notNull(), + predictedJson: jsonb("predicted_json"), + goldenJson: jsonb("golden_json"), + createdAt: timestamp("created_at", { withTimezone: true }).notNull(), + }, + (table) => [ + index("idx_dedup_scores_scenario_run").on(table.scenarioRunId), + index("idx_dedup_scores_metric").on(table.metric), + ], +); + export const postgresHumanDimensionScores = pgTable( "human_dimension_scores", { @@ -303,6 +406,10 @@ export const postgresSchema = { checkpoints: postgresCheckpoints, judgeDimensionScores: postgresJudgeDimensionScores, humanDimensionScores: postgresHumanDimensionScores, + retrievalScores: postgresRetrievalScores, + demotionScores: postgresDemotionScores, + procedureScores: postgresProcedureScores, + dedupScores: postgresDedupScores, presets: postgresPresets, presetScenarios: postgresPresetScenarios, appSettings: postgresAppSettings, diff --git a/src/providers/persistence/drizzle/sqlite-schema.ts b/src/providers/persistence/drizzle/sqlite-schema.ts index ac7bdcc..66f5309 100644 --- a/src/providers/persistence/drizzle/sqlite-schema.ts +++ b/src/providers/persistence/drizzle/sqlite-schema.ts @@ -178,6 +178,109 @@ export const sqliteJudgeDimensionScores = sqliteTable( }, ); +export const sqliteRetrievalScores = sqliteTable( + "retrieval_scores", + { + id: integer("id").primaryKey({ autoIncrement: true }), + scenarioRunId: integer("scenario_run_id") + .notNull() + .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: real("value").notNull(), + weight: real("weight").notNull(), + k: 
integer("k").notNull(), + weightedScore: real("weighted_score").notNull(), + passThreshold: real("pass_threshold").notNull(), + passed: integer("passed").notNull(), + totalRelevant: integer("total_relevant").notNull(), + totalReturned: integer("total_returned").notNull(), + hitCount: integer("hit_count").notNull(), + forbiddenHits: integer("forbidden_hits").notNull(), + source: text("source").notNull(), + returnedJson: text("returned_json"), + createdAt: text("created_at").notNull(), + }, + (table) => [ + index("idx_retrieval_scores_scenario_run").on(table.scenarioRunId), + index("idx_retrieval_scores_metric").on(table.metric), + ], +); + +export const sqliteDemotionScores = sqliteTable( + "demotion_scores", + { + id: integer("id").primaryKey({ autoIncrement: true }), + scenarioRunId: integer("scenario_run_id") + .notNull() + .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: real("value").notNull(), + weight: real("weight").notNull(), + weightedScore: real("weighted_score").notNull(), + passThreshold: real("pass_threshold").notNull(), + passed: integer("passed").notNull(), + timestampViolationCount: integer("timestamp_violation_count").notNull(), + cascadeBounded: integer("cascade_bounded"), + source: text("source").notNull(), + observedJson: text("observed_json"), + expectedJson: text("expected_json"), + createdAt: text("created_at").notNull(), + }, + (table) => [ + index("idx_demotion_scores_scenario_run").on(table.scenarioRunId), + index("idx_demotion_scores_metric").on(table.metric), + ], +); + +export const sqliteProcedureScores = sqliteTable( + "procedure_scores", + { + id: integer("id").primaryKey({ autoIncrement: true }), + scenarioRunId: integer("scenario_run_id") + .notNull() + .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: real("value").notNull(), + weight: real("weight").notNull(), + weightedScore: 
real("weighted_score").notNull(), + passThreshold: real("pass_threshold").notNull(), + passed: integer("passed").notNull(), + source: text("source").notNull(), + predictedJson: text("predicted_json"), + goldenJson: text("golden_json"), + createdAt: text("created_at").notNull(), + }, + (table) => [ + index("idx_procedure_scores_scenario_run").on(table.scenarioRunId), + index("idx_procedure_scores_metric").on(table.metric), + ], +); + +export const sqliteDedupScores = sqliteTable( + "dedup_scores", + { + id: integer("id").primaryKey({ autoIncrement: true }), + scenarioRunId: integer("scenario_run_id") + .notNull() + .references(() => sqliteScenarioRuns.id, { onDelete: "cascade" }), + metric: text("metric").notNull(), + value: real("value").notNull(), + weight: real("weight").notNull(), + weightedScore: real("weighted_score").notNull(), + passThreshold: real("pass_threshold").notNull(), + passed: integer("passed").notNull(), + itemCount: integer("item_count").notNull(), + source: text("source").notNull(), + predictedJson: text("predicted_json"), + goldenJson: text("golden_json"), + createdAt: text("created_at").notNull(), + }, + (table) => [ + index("idx_dedup_scores_scenario_run").on(table.scenarioRunId), + index("idx_dedup_scores_metric").on(table.metric), + ], +); + export const sqliteHumanDimensionScores = sqliteTable( "human_dimension_scores", { @@ -256,6 +359,10 @@ export const sqliteSchema = { checkpoints: sqliteCheckpoints, judgeDimensionScores: sqliteJudgeDimensionScores, humanDimensionScores: sqliteHumanDimensionScores, + retrievalScores: sqliteRetrievalScores, + demotionScores: sqliteDemotionScores, + procedureScores: sqliteProcedureScores, + dedupScores: sqliteDedupScores, presets: sqlitePresets, presetScenarios: sqlitePresetScenarios, appSettings: sqliteAppSettings, diff --git a/src/providers/persistence/migrations/postgres.ts b/src/providers/persistence/migrations/postgres.ts index 1682b7d..ac54edf 100644 --- 
a/src/providers/persistence/migrations/postgres.ts +++ b/src/providers/persistence/migrations/postgres.ts @@ -2,7 +2,7 @@ import { createPostgresClient, type SqlTag } from "../postgres-client.ts"; import type { MigrationRunner } from "./types.ts"; /** Target schema version for Postgres. Bumps whenever a new migration is added. */ -export const POSTGRES_TARGET_VERSION = 4; +export const POSTGRES_TARGET_VERSION = 6; const POSTGRES_BASELINE_DDL = ` create table if not exists meta ( @@ -153,6 +153,73 @@ const POSTGRES_BASELINE_DDL = ` created_at timestamptz not null ); + create table if not exists retrieval_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + k integer not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + total_relevant integer not null, + total_returned integer not null, + hit_count integer not null, + forbidden_hits integer not null, + source text not null, + returned_json jsonb, + created_at timestamptz not null + ); + + create table if not exists demotion_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + timestamp_violation_count integer not null, + cascade_bounded boolean, + source text not null, + observed_json jsonb, + expected_json jsonb, + created_at timestamptz not null + ); + + create table if not exists procedure_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + 
weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + source text not null, + predicted_json jsonb, + golden_json jsonb, + created_at timestamptz not null + ); + + create table if not exists dedup_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + item_count integer not null, + source text not null, + predicted_json jsonb, + golden_json jsonb, + created_at timestamptz not null + ); + create table if not exists presets ( id text primary key, name text not null unique, @@ -201,6 +268,22 @@ const POSTGRES_BASELINE_DDL = ` on human_dimension_scores(scenario_run_id, dimension_id); create index if not exists idx_human_dim_scores_scenario_run on human_dimension_scores(scenario_run_id); + create index if not exists idx_retrieval_scores_scenario_run + on retrieval_scores(scenario_run_id); + create index if not exists idx_retrieval_scores_metric + on retrieval_scores(metric); + create index if not exists idx_demotion_scores_scenario_run + on demotion_scores(scenario_run_id); + create index if not exists idx_demotion_scores_metric + on demotion_scores(metric); + create index if not exists idx_procedure_scores_scenario_run + on procedure_scores(scenario_run_id); + create index if not exists idx_procedure_scores_metric + on procedure_scores(metric); + create index if not exists idx_dedup_scores_scenario_run + on dedup_scores(scenario_run_id); + create index if not exists idx_dedup_scores_metric + on dedup_scores(metric); `; async function readPostgresVersion(sql: SqlTag): Promise { @@ -305,6 +388,121 @@ export function createPostgresMigrationRunner( }); applied.push(4); } + if (from < 5) { + await sql.begin(async (tx) => { + await tx` + create 
table if not exists retrieval_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + k integer not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + total_relevant integer not null, + total_returned integer not null, + hit_count integer not null, + forbidden_hits integer not null, + source text not null, + returned_json jsonb, + created_at timestamptz not null + ) + `; + await tx` + create index if not exists idx_retrieval_scores_scenario_run + on retrieval_scores(scenario_run_id) + `; + await tx` + create index if not exists idx_retrieval_scores_metric + on retrieval_scores(metric) + `; + await tx`update meta set schema_version = 5 where id = 1`; + }); + applied.push(5); + } + if (from < 6) { + await sql.begin(async (tx) => { + await tx` + create table if not exists demotion_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + timestamp_violation_count integer not null, + cascade_bounded boolean, + source text not null, + observed_json jsonb, + expected_json jsonb, + created_at timestamptz not null + ) + `; + await tx` + create index if not exists idx_demotion_scores_scenario_run + on demotion_scores(scenario_run_id) + `; + await tx` + create index if not exists idx_demotion_scores_metric + on demotion_scores(metric) + `; + await tx` + create table if not exists procedure_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double 
precision not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + source text not null, + predicted_json jsonb, + golden_json jsonb, + created_at timestamptz not null + ) + `; + await tx` + create index if not exists idx_procedure_scores_scenario_run + on procedure_scores(scenario_run_id) + `; + await tx` + create index if not exists idx_procedure_scores_metric + on procedure_scores(metric) + `; + await tx` + create table if not exists dedup_scores ( + id bigserial primary key, + scenario_run_id bigint not null references scenario_runs(id) on delete cascade, + metric text not null, + value double precision not null, + weight double precision not null, + weighted_score double precision not null, + pass_threshold double precision not null, + passed boolean not null, + item_count integer not null, + source text not null, + predicted_json jsonb, + golden_json jsonb, + created_at timestamptz not null + ) + `; + await tx` + create index if not exists idx_dedup_scores_scenario_run + on dedup_scores(scenario_run_id) + `; + await tx` + create index if not exists idx_dedup_scores_metric + on dedup_scores(metric) + `; + await tx`update meta set schema_version = 6 where id = 1`; + }); + applied.push(6); + } return applied; } finally { await sql.end?.(); diff --git a/src/providers/persistence/migrations/sqlite.ts b/src/providers/persistence/migrations/sqlite.ts index 4b91e36..6668bbc 100644 --- a/src/providers/persistence/migrations/sqlite.ts +++ b/src/providers/persistence/migrations/sqlite.ts @@ -4,7 +4,7 @@ import { resolveSqlitePath, withSqliteDatabase } from "../sqlite-connection.ts"; import type { MigrationReport, MigrationRunner } from "./types.ts"; /** Target schema version for SQLite. Keep synced with SCHEMA_VERSION in sqlite-run-history.ts. 
*/ -export const SQLITE_TARGET_VERSION = 8; +export const SQLITE_TARGET_VERSION = 10; function utcNow(): string { return new Date().toISOString(); @@ -186,6 +186,73 @@ export function applySqliteBaseline(database: Database): void { created_at text not null ); + create table if not exists retrieval_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + k integer not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + total_relevant integer not null, + total_returned integer not null, + hit_count integer not null, + forbidden_hits integer not null, + source text not null, + returned_json text, + created_at text not null + ); + + create table if not exists demotion_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + timestamp_violation_count integer not null, + cascade_bounded integer, + source text not null, + observed_json text, + expected_json text, + created_at text not null + ); + + create table if not exists procedure_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + source text not null, + predicted_json text, + golden_json text, + created_at text not null + ); + + create table if not exists dedup_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real 
not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + item_count integer not null, + source text not null, + predicted_json text, + golden_json text, + created_at text not null + ); + create table if not exists presets ( id text primary key, name text not null unique, @@ -238,6 +305,22 @@ export function applySqliteBaseline(database: Database): void { on human_dimension_scores(scenario_run_id, dimension_id); create index if not exists idx_human_dim_scores_scenario_run on human_dimension_scores(scenario_run_id); + create index if not exists idx_retrieval_scores_scenario_run + on retrieval_scores(scenario_run_id); + create index if not exists idx_retrieval_scores_metric + on retrieval_scores(metric); + create index if not exists idx_demotion_scores_scenario_run + on demotion_scores(scenario_run_id); + create index if not exists idx_demotion_scores_metric + on demotion_scores(metric); + create index if not exists idx_procedure_scores_scenario_run + on procedure_scores(scenario_run_id); + create index if not exists idx_procedure_scores_metric + on procedure_scores(metric); + create index if not exists idx_dedup_scores_scenario_run + on dedup_scores(scenario_run_id); + create index if not exists idx_dedup_scores_metric + on dedup_scores(metric); `); } @@ -331,6 +414,101 @@ export function applySqliteMigrations( applied.push(8); version = 8; } + if (version < 9) { + database.exec(` + create table if not exists retrieval_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + k integer not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + total_relevant integer not null, + total_returned integer not null, + hit_count integer not null, + forbidden_hits integer not null, + source text not null, + returned_json text, + created_at 
text not null + ); + create index if not exists idx_retrieval_scores_scenario_run + on retrieval_scores(scenario_run_id); + create index if not exists idx_retrieval_scores_metric + on retrieval_scores(metric); + `); + database.query("update meta set schema_version = ? where id = 1").run(9); + applied.push(9); + version = 9; + } + if (version < 10) { + database.exec(` + create table if not exists demotion_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + timestamp_violation_count integer not null, + cascade_bounded integer, + source text not null, + observed_json text, + expected_json text, + created_at text not null + ); + create index if not exists idx_demotion_scores_scenario_run + on demotion_scores(scenario_run_id); + create index if not exists idx_demotion_scores_metric + on demotion_scores(metric); + + create table if not exists procedure_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + source text not null, + predicted_json text, + golden_json text, + created_at text not null + ); + create index if not exists idx_procedure_scores_scenario_run + on procedure_scores(scenario_run_id); + create index if not exists idx_procedure_scores_metric + on procedure_scores(metric); + + create table if not exists dedup_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed 
integer not null, + item_count integer not null, + source text not null, + predicted_json text, + golden_json text, + created_at text not null + ); + create index if not exists idx_dedup_scores_scenario_run + on dedup_scores(scenario_run_id); + create index if not exists idx_dedup_scores_metric + on dedup_scores(metric); + `); + database.query("update meta set schema_version = ? where id = 1").run(10); + applied.push(10); + version = 10; + } return applied; } diff --git a/src/providers/persistence/postgres-backend.ts b/src/providers/persistence/postgres-backend.ts index fb2d658..fb2ff4b 100644 --- a/src/providers/persistence/postgres-backend.ts +++ b/src/providers/persistence/postgres-backend.ts @@ -188,6 +188,10 @@ function mapScenarioRow( toolCalls: UnknownRecord[], checkpoints: UnknownRecord[], judgeDimensionScores: UnknownRecord[], + retrievalScores: UnknownRecord[] = [], + demotionScores: UnknownRecord[] = [], + procedureScores: UnknownRecord[] = [], + dedupScores: UnknownRecord[] = [], ): ScenarioRecord { const failureKindRaw = asStringOrNull(row.failure_kind); const failureKind = @@ -269,6 +273,60 @@ function mapScenarioRow( reasoning: String(score.reasoning ?? ""), evidence: asJson(score.evidence_json) ?? [], })), + retrievalScores: retrievalScores.map((score) => ({ + metric: String(score.metric ?? ""), + value: Number(score.value), + weight: Number(score.weight), + k: Number(score.k), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Boolean(score.passed), + total_relevant: Number(score.total_relevant), + total_returned: Number(score.total_returned), + hit_count: Number(score.hit_count), + forbidden_hits: Number(score.forbidden_hits), + source: String(score.source ?? ""), + returned: asJson(score.returned_json) ?? [], + })), + demotionScores: demotionScores.map((score) => ({ + metric: String(score.metric ?? 
""), + value: Number(score.value), + weight: Number(score.weight), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Boolean(score.passed), + timestamp_violation_count: Number(score.timestamp_violation_count), + cascade_bounded: + score.cascade_bounded === null || score.cascade_bounded === undefined + ? null + : Boolean(score.cascade_bounded), + source: String(score.source ?? ""), + observed: asJson(score.observed_json) ?? [], + expected: asJson(score.expected_json) ?? [], + })), + procedureScores: procedureScores.map((score) => ({ + metric: String(score.metric ?? ""), + value: Number(score.value), + weight: Number(score.weight), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Boolean(score.passed), + source: String(score.source ?? ""), + predicted: asJson(score.predicted_json) ?? [], + golden: asJson(score.golden_json) ?? [], + })), + dedupScores: dedupScores.map((score) => ({ + metric: String(score.metric ?? ""), + value: Number(score.value), + weight: Number(score.weight), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Boolean(score.passed), + item_count: Number(score.item_count), + source: String(score.source ?? ""), + predicted: asJson(score.predicted_json) ?? [], + golden: asJson(score.golden_json) ?? [], + })), error: asJson>(row.error_json) ?? 
null, startedAt: asIsoTimestamp(row.started_at), completedAt: asIsoTimestampOrNull(row.completed_at), @@ -308,49 +366,87 @@ async function loadScenarioRecords( } if (options.summary) { - return scenarioRows.map((row) => mapScenarioRow(row, [], [], [], [], [])); + return scenarioRows.map((row) => + mapScenarioRow(row, [], [], [], [], [], [], [], [], []), + ); } - const [turns, events, toolCalls, checkpoints, dimensionScores] = await span( - "pg.scenario_children", - () => - Promise.all([ - span( - "pg.turns", - () => sql` + const [ + turns, + events, + toolCalls, + checkpoints, + dimensionScores, + retrievalRows, + demotionRows, + procedureRows, + dedupRows, + ] = await span("pg.scenario_children", () => + Promise.all([ + span( + "pg.turns", + () => sql` select * from turns where scenario_run_id in ${sql(ids)} order by scenario_run_id asc, turn_index asc `, - ), - span( - "pg.target_events", - () => sql` + ), + span( + "pg.target_events", + () => sql` select * from target_events where scenario_run_id in ${sql(ids)} order by scenario_run_id asc, turn_index asc, exchange_index asc `, - ), - span( - "pg.tool_calls", - () => sql` + ), + span( + "pg.tool_calls", + () => sql` select * from tool_calls where scenario_run_id in ${sql(ids)} order by scenario_run_id asc, turn_index asc, call_order asc nulls last `, - ), - span( - "pg.checkpoints", - () => sql` + ), + span( + "pg.checkpoints", + () => sql` select * from checkpoints where scenario_run_id in ${sql(ids)} order by scenario_run_id asc, checkpoint_index asc `, - ), - span( - "pg.judge_dimension_scores", - () => sql` + ), + span( + "pg.judge_dimension_scores", + () => sql` select * from judge_dimension_scores where scenario_run_id in ${sql(ids)} order by scenario_run_id asc, dimension_id asc `, - ), - ]), + ), + span( + "pg.retrieval_scores", + () => sql` + select * from retrieval_scores where scenario_run_id in ${sql(ids)} + order by scenario_run_id asc, id asc + `, + ), + span( + "pg.demotion_scores", + () => sql` + 
select * from demotion_scores where scenario_run_id in ${sql(ids)} + order by scenario_run_id asc, id asc + `, + ), + span( + "pg.procedure_scores", + () => sql` + select * from procedure_scores where scenario_run_id in ${sql(ids)} + order by scenario_run_id asc, id asc + `, + ), + span( + "pg.dedup_scores", + () => sql` + select * from dedup_scores where scenario_run_id in ${sql(ids)} + order by scenario_run_id asc, id asc + `, + ), + ]), ); const groupBy = ( @@ -371,6 +467,10 @@ async function loadScenarioRecords( const toolsByScenario = groupBy(toolCalls, "scenario_run_id"); const checkpointsByScenario = groupBy(checkpoints, "scenario_run_id"); const dimensionsByScenario = groupBy(dimensionScores, "scenario_run_id"); + const retrievalByScenario = groupBy(retrievalRows, "scenario_run_id"); + const demotionByScenario = groupBy(demotionRows, "scenario_run_id"); + const procedureByScenario = groupBy(procedureRows, "scenario_run_id"); + const dedupByScenario = groupBy(dedupRows, "scenario_run_id"); return scenarioRows.map((row) => mapScenarioRow( @@ -380,6 +480,10 @@ async function loadScenarioRecords( toolsByScenario.get(Number(row.id)) ?? [], checkpointsByScenario.get(Number(row.id)) ?? [], dimensionsByScenario.get(Number(row.id)) ?? [], + retrievalByScenario.get(Number(row.id)) ?? [], + demotionByScenario.get(Number(row.id)) ?? [], + procedureByScenario.get(Number(row.id)) ?? [], + dedupByScenario.get(Number(row.id)) ?? 
[], ), ); } diff --git a/src/providers/persistence/postgres-run-recorder.ts b/src/providers/persistence/postgres-run-recorder.ts index df4a171..09685c6 100644 --- a/src/providers/persistence/postgres-run-recorder.ts +++ b/src/providers/persistence/postgres-run-recorder.ts @@ -5,8 +5,12 @@ import type { AdapterReply, CheckpointAssertion, CheckpointResult, + DedupScore, + DemotionScore, JsonValue, Persona, + ProcedureScore, + RetrievalScore, Rubric, RubricScore, RunResult, @@ -529,6 +533,106 @@ export class PostgresRunRecorder { `; } + async recordRetrievalResult( + scenarioRunId: number, + options: { scenario: Scenario; score: RetrievalScore }, + ): Promise { + const passed = options.score.passed; + const returnedJson = json(redactValue(options.score.returned)); + for (const metric of options.score.metrics) { + await this.sql` + insert into retrieval_scores ( + scenario_run_id, metric, value, weight, k, + weighted_score, pass_threshold, passed, total_relevant, + total_returned, hit_count, forbidden_hits, source, + returned_json, created_at + ) values ( + ${scenarioRunId}, ${metric.metric}, ${metric.value}, + ${metric.weight}, ${options.score.k}, + ${options.score.weightedScore}, ${options.score.passThreshold}, + ${passed}, ${options.score.totalRelevant}, + ${options.score.totalReturned}, ${options.score.hitCount}, + ${options.score.forbiddenHits}, ${options.score.source}, + ${returnedJson}::jsonb, now() + ) + `; + } + } + + async recordDemotionResult( + scenarioRunId: number, + options: { scenario: Scenario; score: DemotionScore }, + ): Promise { + const passed = options.score.passed; + const cascade = options.score.cascadeBounded ?? 
null; + const observedJson = json(redactValue(options.score.observed)); + const expectedJson = json(redactValue(options.score.expected)); + for (const metric of options.score.metrics) { + await this.sql` + insert into demotion_scores ( + scenario_run_id, metric, value, weight, + weighted_score, pass_threshold, passed, + timestamp_violation_count, cascade_bounded, + source, observed_json, expected_json, created_at + ) values ( + ${scenarioRunId}, ${metric.metric}, ${metric.value}, ${metric.weight}, + ${options.score.weightedScore}, ${options.score.passThreshold}, + ${passed}, ${options.score.timestampViolationCount}, + ${cascade}, + ${options.score.source}, + ${observedJson}::jsonb, ${expectedJson}::jsonb, now() + ) + `; + } + } + + async recordProcedureResult( + scenarioRunId: number, + options: { scenario: Scenario; score: ProcedureScore }, + ): Promise { + const passed = options.score.passed; + const predictedJson = json(redactValue(options.score.predictedSteps)); + const goldenJson = json(redactValue(options.score.goldenSteps)); + for (const metric of options.score.metrics) { + await this.sql` + insert into procedure_scores ( + scenario_run_id, metric, value, weight, + weighted_score, pass_threshold, passed, source, + predicted_json, golden_json, created_at + ) values ( + ${scenarioRunId}, ${metric.metric}, ${metric.value}, ${metric.weight}, + ${options.score.weightedScore}, ${options.score.passThreshold}, + ${passed}, ${options.score.source}, + ${predictedJson}::jsonb, ${goldenJson}::jsonb, now() + ) + `; + } + } + + async recordDedupResult( + scenarioRunId: number, + options: { scenario: Scenario; score: DedupScore }, + ): Promise { + const passed = options.score.passed; + const predictedJson = json(redactValue(options.score.predictedClusters)); + const goldenJson = json(redactValue(options.score.goldenClusters)); + for (const metric of options.score.metrics) { + await this.sql` + insert into dedup_scores ( + scenario_run_id, metric, value, weight, + 
weighted_score, pass_threshold, passed, item_count, + source, predicted_json, golden_json, created_at + ) values ( + ${scenarioRunId}, ${metric.metric}, ${metric.value}, ${metric.weight}, + ${options.score.weightedScore}, ${options.score.passThreshold}, + ${passed}, ${options.score.itemCount}, + ${options.score.source}, + ${predictedJson}::jsonb, ${goldenJson}::jsonb, now() + ) + `; + } + } + async recordScenarioFinished( scenarioRunId: number, options: { result: ScenarioRunResult }, diff --git a/src/providers/persistence/sqlite-backend.ts b/src/providers/persistence/sqlite-backend.ts index 35c1cb0..5dfec39 100644 --- a/src/providers/persistence/sqlite-backend.ts +++ b/src/providers/persistence/sqlite-backend.ts @@ -220,6 +220,10 @@ function projectRunRecord( toolCalls: [], checkpoints: [], judgeDimensionScores: [], + retrievalScores: [], + demotionScores: [], + procedureScores: [], + dedupScores: [], } : scenario, ); diff --git a/src/providers/persistence/sqlite-run-history.ts b/src/providers/persistence/sqlite-run-history.ts index 3d0107f..e8158ca 100644 --- a/src/providers/persistence/sqlite-run-history.ts +++ b/src/providers/persistence/sqlite-run-history.ts @@ -7,11 +7,15 @@ import type { AdapterReply, CheckpointAssertion, CheckpointResult, + DedupScore, + DemotionScore, Endpoints, JsonValue, Persona, PresetRecord, PresetSnapshot, + ProcedureScore, + RetrievalScore, Rubric, RubricScore, RunRecord, @@ -29,7 +33,7 @@ import { redactDbUrl } from "./url.ts"; export const DEFAULT_DB_DIRNAME = ".agentprobe"; export const DEFAULT_DB_FILENAME = "runs.sqlite3"; -export const SCHEMA_VERSION = 8; +export const SCHEMA_VERSION = 10; const REDACTED_VALUE = "[REDACTED]"; const sensitiveExactKeys = new Set([ "access_token", @@ -285,6 +289,18 @@ function migrateDatabase(database: Database, currentVersion: number): void { database.query("update meta set schema_version = ? 
where id = 1").run(8); version = 8; } + if (version < 9) { + ensureRetrievalScoresTable(database); + database.query("update meta set schema_version = ? where id = 1").run(9); + version = 9; + } + if (version < 10) { + ensureDemotionScoresTable(database); + ensureProcedureScoresTable(database); + ensureDedupScoresTable(database); + database.query("update meta set schema_version = ? where id = 1").run(10); + version = 10; + } if (version !== SCHEMA_VERSION) { throw new AgentProbeRuntimeError( @@ -335,6 +351,105 @@ function ensureHumanDimensionScoresTable(database: Database): void { `); } +function ensureRetrievalScoresTable(database: Database): void { + database.exec(` + create table if not exists retrieval_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + k integer not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + total_relevant integer not null, + total_returned integer not null, + hit_count integer not null, + forbidden_hits integer not null, + source text not null, + returned_json text, + created_at text not null + ); + create index if not exists idx_retrieval_scores_scenario_run + on retrieval_scores(scenario_run_id); + create index if not exists idx_retrieval_scores_metric + on retrieval_scores(metric); + `); +} + +function ensureDemotionScoresTable(database: Database): void { + database.exec(` + create table if not exists demotion_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + timestamp_violation_count integer not null, + cascade_bounded integer, + source text not null, + observed_json text, + expected_json 
text, + created_at text not null + ); + create index if not exists idx_demotion_scores_scenario_run + on demotion_scores(scenario_run_id); + create index if not exists idx_demotion_scores_metric + on demotion_scores(metric); + `); +} + +function ensureProcedureScoresTable(database: Database): void { + database.exec(` + create table if not exists procedure_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + source text not null, + predicted_json text, + golden_json text, + created_at text not null + ); + create index if not exists idx_procedure_scores_scenario_run + on procedure_scores(scenario_run_id); + create index if not exists idx_procedure_scores_metric + on procedure_scores(metric); + `); +} + +function ensureDedupScoresTable(database: Database): void { + database.exec(` + create table if not exists dedup_scores ( + id integer primary key autoincrement, + scenario_run_id integer not null references scenario_runs(id) on delete cascade, + metric text not null, + value real not null, + weight real not null, + weighted_score real not null, + pass_threshold real not null, + passed integer not null, + item_count integer not null, + source text not null, + predicted_json text, + golden_json text, + created_at text not null + ); + create index if not exists idx_dedup_scores_scenario_run + on dedup_scores(scenario_run_id); + create index if not exists idx_dedup_scores_metric + on dedup_scores(metric); + `); +} + function ensurePhase2Schema(database: Database): void { ensurePhase2RunColumns(database); database.exec(` @@ -527,6 +642,10 @@ export function initDb(dbUrl?: string): void { ensureAppSettingsTable(database); ensureEndpointOverridesTable(database); ensureHumanDimensionScoresTable(database); + 
ensureRetrievalScoresTable(database); + ensureDemotionScoresTable(database); + ensureProcedureScoresTable(database); + ensureDedupScoresTable(database); const meta = database .query("select schema_version from meta where id = 1") @@ -1090,6 +1209,162 @@ export class SqliteRunRecorder { ); } + async recordRetrievalResult( + scenarioRunId: number, + options: { scenario: Scenario; score: RetrievalScore }, + ): Promise { + const createdAt = utcNow(); + const passedFlag = options.score.passed ? 1 : 0; + const returnedJson = encodeJson(redactValue(options.score.returned)); + for (const metric of options.score.metrics) { + this.database + .query( + ` + insert into retrieval_scores ( + scenario_run_id, metric, value, weight, k, + weighted_score, pass_threshold, passed, total_relevant, + total_returned, hit_count, forbidden_hits, source, + returned_json, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `, + ) + .run( + scenarioRunId, + metric.metric, + metric.value, + metric.weight, + options.score.k, + options.score.weightedScore, + options.score.passThreshold, + passedFlag, + options.score.totalRelevant, + options.score.totalReturned, + options.score.hitCount, + options.score.forbiddenHits, + options.score.source, + returnedJson, + createdAt, + ); + } + } + + async recordDemotionResult( + scenarioRunId: number, + options: { scenario: Scenario; score: DemotionScore }, + ): Promise { + const createdAt = utcNow(); + const passedFlag = options.score.passed ? 1 : 0; + const cascade = + options.score.cascadeBounded === undefined + ? null + : options.score.cascadeBounded + ? 
1 + : 0; + const observedJson = encodeJson(redactValue(options.score.observed)); + const expectedJson = encodeJson(redactValue(options.score.expected)); + for (const metric of options.score.metrics) { + this.database + .query( + ` + insert into demotion_scores ( + scenario_run_id, metric, value, weight, + weighted_score, pass_threshold, passed, + timestamp_violation_count, cascade_bounded, + source, observed_json, expected_json, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `, + ) + .run( + scenarioRunId, + metric.metric, + metric.value, + metric.weight, + options.score.weightedScore, + options.score.passThreshold, + passedFlag, + options.score.timestampViolationCount, + cascade, + options.score.source, + observedJson, + expectedJson, + createdAt, + ); + } + } + + async recordProcedureResult( + scenarioRunId: number, + options: { scenario: Scenario; score: ProcedureScore }, + ): Promise { + const createdAt = utcNow(); + const passedFlag = options.score.passed ? 1 : 0; + const predictedJson = encodeJson(redactValue(options.score.predictedSteps)); + const goldenJson = encodeJson(redactValue(options.score.goldenSteps)); + for (const metric of options.score.metrics) { + this.database + .query( + ` + insert into procedure_scores ( + scenario_run_id, metric, value, weight, + weighted_score, pass_threshold, passed, source, + predicted_json, golden_json, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `, + ) + .run( + scenarioRunId, + metric.metric, + metric.value, + metric.weight, + options.score.weightedScore, + options.score.passThreshold, + passedFlag, + options.score.source, + predictedJson, + goldenJson, + createdAt, + ); + } + } + + async recordDedupResult( + scenarioRunId: number, + options: { scenario: Scenario; score: DedupScore }, + ): Promise { + const createdAt = utcNow(); + const passedFlag = options.score.passed ? 
1 : 0; + const predictedJson = encodeJson( + redactValue(options.score.predictedClusters), + ); + const goldenJson = encodeJson(redactValue(options.score.goldenClusters)); + for (const metric of options.score.metrics) { + this.database + .query( + ` + insert into dedup_scores ( + scenario_run_id, metric, value, weight, + weighted_score, pass_threshold, passed, item_count, + source, predicted_json, golden_json, created_at + ) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `, + ) + .run( + scenarioRunId, + metric.metric, + metric.value, + metric.weight, + options.score.weightedScore, + options.score.passThreshold, + passedFlag, + options.score.itemCount, + options.score.source, + predictedJson, + goldenJson, + createdAt, + ); + } + } + async recordScenarioFinished( scenarioRunId: number, options: { result: ScenarioRunResult }, @@ -1921,6 +2196,26 @@ function getScenarioRecords( "select * from judge_dimension_scores where scenario_run_id = ? order by dimension_id asc", ) .all(scenarioRunId) as Array>; + const retrievalScores = database + .query( + "select * from retrieval_scores where scenario_run_id = ? order by id asc", + ) + .all(scenarioRunId) as Array>; + const demotionScores = database + .query( + "select * from demotion_scores where scenario_run_id = ? order by id asc", + ) + .all(scenarioRunId) as Array>; + const procedureScores = database + .query( + "select * from procedure_scores where scenario_run_id = ? order by id asc", + ) + .all(scenarioRunId) as Array>; + const dedupScores = database + .query( + "select * from dedup_scores where scenario_run_id = ? order by id asc", + ) + .all(scenarioRunId) as Array>; return { scenarioRunId, @@ -2038,6 +2333,60 @@ function getScenarioRecords( reasoning: String(score.reasoning), evidence: decodeJson(score.evidence_json) ?? 
[], })), + retrievalScores: retrievalScores.map((score) => ({ + metric: String(score.metric), + value: Number(score.value), + weight: Number(score.weight), + k: Number(score.k), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Number(score.passed) === 1, + total_relevant: Number(score.total_relevant), + total_returned: Number(score.total_returned), + hit_count: Number(score.hit_count), + forbidden_hits: Number(score.forbidden_hits), + source: String(score.source), + returned: decodeJson(score.returned_json) ?? [], + })), + demotionScores: demotionScores.map((score) => ({ + metric: String(score.metric), + value: Number(score.value), + weight: Number(score.weight), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Number(score.passed) === 1, + timestamp_violation_count: Number(score.timestamp_violation_count), + cascade_bounded: + score.cascade_bounded === null || score.cascade_bounded === undefined + ? null + : Number(score.cascade_bounded) === 1, + source: String(score.source), + observed: decodeJson(score.observed_json) ?? [], + expected: decodeJson(score.expected_json) ?? [], + })), + procedureScores: procedureScores.map((score) => ({ + metric: String(score.metric), + value: Number(score.value), + weight: Number(score.weight), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Number(score.passed) === 1, + source: String(score.source), + predicted: decodeJson(score.predicted_json) ?? [], + golden: decodeJson(score.golden_json) ?? 
[], + })), + dedupScores: dedupScores.map((score) => ({ + metric: String(score.metric), + value: Number(score.value), + weight: Number(score.weight), + weighted_score: Number(score.weighted_score), + pass_threshold: Number(score.pass_threshold), + passed: Number(score.passed) === 1, + item_count: Number(score.item_count), + source: String(score.source), + predicted: decodeJson(score.predicted_json) ?? [], + golden: decodeJson(score.golden_json) ?? [], + })), error: decodeJson>(row.error_json) ?? null, startedAt: String(row.started_at), completedAt: diff --git a/src/runtime/server/default-presets.ts b/src/runtime/server/default-presets.ts index 8302902..3eae9c0 100644 --- a/src/runtime/server/default-presets.ts +++ b/src/runtime/server/default-presets.ts @@ -40,7 +40,95 @@ export const PRE_RELEASE_DEFAULT_PRESET: PresetWriteInput = { dryRun: false, }; -const DEFAULT_PRESETS = [PRE_RELEASE_DEFAULT_PRESET] as const; +// The three memory packs that together cover the full dream-system +// roadmap surface: multi-session conversational (judge), retrieval +// ranking, and the dream-validation trio (demotion/procedure/dedup). +// All three vendor a per-file selection so the preset is stable across +// scenario reorders and additive YAML changes. 
+const MULTI_SESSION_MEMORY_SCENARIO_IDS = [ + "mem-retention-basic-identity", + "mem-retention-incidental-facts", + "mem-distill-authed-http-image-gen", + "mem-distill-onboarding-workflow", + "mem-distill-weekly-report-format", + "mem-distill-implicit-tool-preferences", + "mem-distill-lead-cleaning-procedure", + "mem-rigidity-email-tone-override", + "mem-rigidity-tool-migration", + "mem-rigidity-pricing-update", + "mem-abstain-ambiguous-reference", + "mem-abstain-no-fabricated-preferences", + "mem-temporal-stale-team-member", + "mem-temporal-deprecated-procedure", + "mem-continuation-interrupted-task", + "mem-continuation-project-state", + "mem-crossdomain-business-identity", + "mem-crossdomain-customer-allergy-with-negative", + "mem-crossdomain-pricing-structure-with-negative", + "mem-crossdomain-shipping-schedule-reasoning", + "mem-crossdomain-notion-rate-limit", + "mem-procupdate-clean-replacement", + "mem-procupdate-additive", + "mem-compositional-board-prep", + "mem-introspection-what-do-you-know", + "mem-introspection-gaps", + "mem-longtail-lawyer-recall", + "mem-hygiene-bounded-time", + "mem-hygiene-temporary-status", + "mem-negative-one-off-qualifier", + "mem-negative-forget-on-request", +] as const; + +const RETRIEVAL_MEMORY_SCENARIO_IDS = [ + "mem-retrieval-forget-on-request", + "mem-retrieval-warm-context-sarah", + "mem-retrieval-stale-fact-demotion", + "mem-retrieval-scope-filter-project", + "mem-retrieval-cascading-expiry", +] as const; + +const DREAM_VALIDATION_SCENARIO_IDS = [ + "dream-demotion-retract-discipline", + "dream-demotion-snodgrass-violation", + "dream-demotion-stale-fact", + "dream-demotion-cascade-bounded", + "dream-demotion-cascade-runaway", + "dream-procedure-weekly-report", + "dream-procedure-client-onboarding", + "dream-dedup-near-duplicates", + "dream-dedup-false-positive", +] as const; + +export const FULL_MEMORY_DEFAULT_PRESET: PresetWriteInput = { + name: "Full Memory Suite", + description: + "All memory-related scenarios in one 
preset: multi-session conversational, retrieval ranking, and dream-system validation (demotion / procedure / dedup). Covers the full P-1 -> P2 dream-system roadmap surface.", + endpoint: "autogpt-endpoint.yaml", + personas: "personas.yaml", + rubric: "rubric.yaml", + selection: [ + ...MULTI_SESSION_MEMORY_SCENARIO_IDS.map((id) => ({ + file: "multi-session-memory.yaml", + id, + })), + ...RETRIEVAL_MEMORY_SCENARIO_IDS.map((id) => ({ + file: "retrieval-memory.yaml", + id, + })), + ...DREAM_VALIDATION_SCENARIO_IDS.map((id) => ({ + file: "dream-validation.yaml", + id, + })), + ], + parallel: { enabled: false, limit: null }, + repeat: 1, + dryRun: false, +}; + +const DEFAULT_PRESETS = [ + PRE_RELEASE_DEFAULT_PRESET, + FULL_MEMORY_DEFAULT_PRESET, +] as const; export type DefaultPresetSeedResult = { name: string; diff --git a/src/shared/types/contracts.ts b/src/shared/types/contracts.ts index 5bea5ba..98b7bc5 100644 --- a/src/shared/types/contracts.ts +++ b/src/shared/types/contracts.ts @@ -386,6 +386,125 @@ export type Session = { turns: TurnType[]; }; +export type RetrievalMatchPolicy = "exact" | "substring" | "regex"; + +export type RetrievalMetricKey = + | "precision_at_k" + | "recall_at_k" + | "mrr" + | "ndcg_at_k"; + +export type RetrievalMetricWeights = Partial< + Record +>; + +export type RetrievalSource = { + /** + * Path resolved relative to the scenario YAML file. + * The file must contain a JSON array of strings or objects with a `label` field. + */ + fixture?: string; + /** + * Key on the last assistant reply's `rawExchange` payload that holds the + * returned items. Defaults to `retrieved`. + */ + rawExchangeKey?: string; +}; + +export type RetrievalConfig = { + /** Ordered list of items the retriever is expected to surface. */ + golden: string[]; + /** + * Items that MUST NOT appear in the top-k. A forbidden hit forces a fail + * regardless of the weighted score (used for forget / scope-filter probes). + */ + forbidden: string[]; + /** Rank cutoff. 
Defaults to max(|golden|, |returned|, 1) when omitted. */ + k?: number; + /** Per-metric weights for the weighted aggregate score. */ + weights: Required; + /** Pass threshold on the weighted aggregate score. Defaults to 0.5. */ + passThreshold: number; + /** Match policy applied to each pair of returned vs golden / forbidden items. */ + match: RetrievalMatchPolicy; + /** Where to look for the retrieved list at scoring time. */ + source?: RetrievalSource; +}; + +/** A `(rawExchangeKey | fixture)` resolution mirroring `RetrievalSource`. */ +export type DreamSource = { + fixture?: string; + rawExchangeKey?: string; +}; + +export type DemotionMetricKey = + | "set_precision" + | "set_recall" + | "set_f1" + | "timestamp_discipline" + | "cascade_bounded" + | "cascade_direct_f1"; + +export type DemotionMetricWeights = Partial>; + +export type DemotionAction = { + uuid: string; + label?: string; + expiredAtSet: boolean; + invalidAtSet: boolean; + status?: string; +}; + +export type DemotionCascade = { + /** Edges that should be touched (1-hop neighbors of the invalidated entity). */ + expectedDirectNeighbors: string[]; + /** Edges that MUST NOT be touched (2+ hops). */ + tangentialEdges: string[]; +}; + +export type DemotionConfig = { + /** Expected set of demoted edge / memory UUIDs. */ + expectedDemotions: string[]; + /** Optional retract-vs-soft-delete discipline check. */ + expectedRetracts?: string[]; + /** Optional cascade check (P0.3b). */ + cascade?: DemotionCascade; + weights: Required; + passThreshold: number; + source?: DreamSource; +}; + +export type ProcedureMetricKey = + | "step_coverage" + | "step_order" + | "parameter_coverage"; + +export type ProcedureMetricWeights = Partial< + Record +>; + +export type ProcedureConfig = { + /** Ordered list of expected step labels. */ + goldenSteps: string[]; + /** Optional parameter names the procedure must surface. 
*/ + goldenParameters?: string[]; + weights: Required; + passThreshold: number; + source?: DreamSource; +}; + +export type DedupConfig = { + /** + * Expected clusters. Each inner list is a cluster of item IDs that should + * be merged together. Items present in the predicted set but absent here + * are treated as singletons. + */ + goldenClusters: string[][]; + weights: { precision: number; recall: number; f1: number; ari: number }; + passThreshold: number; + source?: DreamSource; +}; + export type Scenario = { id: string; name: string; @@ -401,6 +520,10 @@ export type Scenario = { turns: TurnType[]; sessions: Session[]; expectations: ScenarioExpectations; + retrieval?: RetrievalConfig; + demotion?: DemotionConfig; + procedure?: ProcedureConfig; + dedup?: DedupConfig; [key: string]: unknown; }; @@ -473,6 +596,83 @@ export type RubricScore = { failureModeDetected?: string | null; }; +export type RetrievalMetricScore = { + metric: RetrievalMetricKey; + value: number; + weight: number; +}; + +export type RetrievalScore = { + k: number; + totalRelevant: number; + totalReturned: number; + hitCount: number; + forbiddenHits: number; + metrics: RetrievalMetricScore[]; + weightedScore: number; + passThreshold: number; + passed: boolean; + /** The returned list (in rank order) used to score, captured for replay. */ + returned: string[]; + /** Where the returned list came from: `fixture` | `raw_exchange` | `missing`. 
*/ + source: "fixture" | "raw_exchange" | "missing"; +}; + +export type EvalSource = "fixture" | "raw_exchange" | "missing"; + +export type DemotionMetricScore = { + metric: DemotionMetricKey; + value: number; + weight: number; +}; + +export type DemotionScore = { + metrics: DemotionMetricScore[]; + weightedScore: number; + passThreshold: number; + passed: boolean; + observed: string[]; + expected: string[]; + cascadeBounded?: boolean; + timestampViolationCount: number; + source: EvalSource; +}; + +export type ProcedureMetricScore = { + metric: ProcedureMetricKey; + value: number; + weight: number; +}; + +export type ProcedureScore = { + metrics: ProcedureMetricScore[]; + weightedScore: number; + passThreshold: number; + passed: boolean; + predictedSteps: string[]; + goldenSteps: string[]; + source: EvalSource; +}; + +export type DedupMetricKey = "precision" | "recall" | "f1" | "ari"; + +export type DedupMetricScore = { + metric: DedupMetricKey; + value: number; + weight: number; +}; + +export type DedupScore = { + metrics: DedupMetricScore[]; + weightedScore: number; + passThreshold: number; + passed: boolean; + predictedClusters: string[][]; + goldenClusters: string[][]; + itemCount: number; + source: EvalSource; +}; + export type ScenarioRunResult = { scenarioId: string; scenarioName: string; @@ -486,6 +686,10 @@ export type ScenarioRunResult = { checkpoints: CheckpointResult[]; toolCallsByTurn?: Record; judgeScore?: RubricScore; + retrievalScore?: RetrievalScore; + demotionScore?: DemotionScore; + procedureScore?: ProcedureScore; + dedupScore?: DedupScore; renderedTurns?: Array>; }; @@ -622,6 +826,10 @@ export type ScenarioRecord = { toolCalls: Array>; checkpoints: Array>; judgeDimensionScores: Array>; + retrievalScores?: Array>; + demotionScores?: Array>; + procedureScores?: Array>; + dedupScores?: Array>; error?: Record | null; startedAt: string; completedAt?: string | null; diff --git a/tests/unit/db.test.ts b/tests/unit/db.test.ts index f9774d3..96ba1fa 
100644 --- a/tests/unit/db.test.ts +++ b/tests/unit/db.test.ts @@ -214,6 +214,10 @@ describe("sqlite recorder", () => { "checkpoints", "judge_dimension_scores", "human_dimension_scores", + "retrieval_scores", + "demotion_scores", + "procedure_scores", + "dedup_scores", "presets", "preset_scenarios", "app_settings", @@ -223,7 +227,7 @@ describe("sqlite recorder", () => { } expect( database.query("select schema_version from meta where id = 1").get(), - ).toEqual({ schema_version: 8 }); + ).toEqual({ schema_version: 10 }); } finally { database.close(); } @@ -706,7 +710,7 @@ describe("sqlite recorder", () => { } expect( migrated.query("select schema_version from meta where id = 1").get(), - ).toEqual({ schema_version: 8 }); + ).toEqual({ schema_version: 10 }); } finally { migrated.close(); } diff --git a/tests/unit/dream-validation.test.ts b/tests/unit/dream-validation.test.ts new file mode 100644 index 0000000..04fa01d --- /dev/null +++ b/tests/unit/dream-validation.test.ts @@ -0,0 +1,120 @@ +import { describe, expect, test } from "bun:test"; +import { existsSync } from "node:fs"; +import { resolve } from "node:path"; + +import { scoreScenarioDedup } from "../../src/domains/evaluation/dedup-scorer.ts"; +import { scoreScenarioDemotion } from "../../src/domains/evaluation/demotion-scorer.ts"; +import { scoreScenarioProcedure } from "../../src/domains/evaluation/procedure-scorer.ts"; +import { parseScenarioYaml } from "../../src/domains/validation/load-suite.ts"; + +const SCENARIOS_PATH = resolve( + import.meta.dir, + "..", + "..", + "data", + "dream-validation.yaml", +); + +describe("dream-validation pack", () => { + const parsed = parseScenarioYaml(SCENARIOS_PATH); + const scenarios = parsed.scenarios; + + function requireScenario(id: string) { + const scenario = scenarios.find((s) => s.id === id); + if (!scenario) { + throw new Error(`Missing scenario: ${id}`); + } + return scenario; + } + + test("ships at least four demotion, two procedure, and two dedup scenarios", 
() => { + const demotion = scenarios.filter((s) => s.demotion !== undefined); + const procedure = scenarios.filter((s) => s.procedure !== undefined); + const dedup = scenarios.filter((s) => s.dedup !== undefined); + expect(demotion.length).toBeGreaterThanOrEqual(4); + expect(procedure.length).toBeGreaterThanOrEqual(2); + expect(dedup.length).toBeGreaterThanOrEqual(2); + }); + + test("every scenario references a fixture that exists on disk", () => { + for (const scenario of scenarios) { + const fixture = + scenario.demotion?.source?.fixture ?? + scenario.procedure?.source?.fixture ?? + scenario.dedup?.source?.fixture; + expect(fixture).toBeDefined(); + const resolved = resolve(SCENARIOS_PATH, "..", fixture ?? ""); + expect(existsSync(resolved)).toBe(true); + } + }); + + test("Snodgrass-respecting retract scenario passes", () => { + const scenario = requireScenario("dream-demotion-retract-discipline"); + const score = scoreScenarioDemotion(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.source).toBe("fixture"); + expect(score?.timestampViolationCount).toBe(0); + expect(score?.passed).toBe(true); + }); + + test("Snodgrass-violating retract scenario fails on timestamp discipline", () => { + const scenario = requireScenario("dream-demotion-snodgrass-violation"); + const score = scoreScenarioDemotion(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.timestampViolationCount).toBeGreaterThan(0); + expect(score?.passed).toBe(false); + }); + + test("bounded cascade scenario passes (single-hop discipline held)", () => { + const scenario = requireScenario("dream-demotion-cascade-bounded"); + const score = scoreScenarioDemotion(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.cascadeBounded).toBe(true); + expect(score?.passed).toBe(true); + }); + + test("runaway cascade scenario fails on cascade_bounded", () => { + const scenario = requireScenario("dream-demotion-cascade-runaway"); + const score = 
scoreScenarioDemotion(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.cascadeBounded).toBe(false); + expect(score?.passed).toBe(false); + }); + + test("weekly-report procedure scenario passes against its golden", () => { + const scenario = requireScenario("dream-procedure-weekly-report"); + const score = scoreScenarioProcedure(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.source).toBe("fixture"); + expect(score?.passed).toBe(true); + }); + + test("client-onboarding procedure scenario passes against its golden", () => { + const scenario = requireScenario("dream-procedure-client-onboarding"); + const score = scoreScenarioProcedure(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.passed).toBe(true); + }); + + test("clean dedup scenario passes (no over-merge, no under-merge)", () => { + const scenario = requireScenario("dream-dedup-near-duplicates"); + const score = scoreScenarioDedup(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.passed).toBe(true); + }); + + test("over-merge dedup scenario fails on pairwise precision + ARI", () => { + const scenario = requireScenario("dream-dedup-false-positive"); + const score = scoreScenarioDedup(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.passed).toBe(false); + }); +}); diff --git a/tests/unit/load-suite.test.ts b/tests/unit/load-suite.test.ts index 7bc02b3..a4a23e6 100644 --- a/tests/unit/load-suite.test.ts +++ b/tests/unit/load-suite.test.ts @@ -49,6 +49,171 @@ describe("scenario parsing", () => { expect(scenario?.context?.copilotMode).toBe("fast"); }); + test("parses a retrieval block with defaults", () => { + const path = join( + makeTempDir("scenario-retrieval-basic"), + "scenarios.yaml", + ); + writeFileSync( + path, + [ + "defaults:", + " persona: shopper", + " rubric: support", + "scenarios:", + " - id: retrieval-basic", + ' name: "Retrieval basic"', + " turns:", + " - role: user", + ' content: "what do we have on Sarah?"', 
+ " expectations:", + ' expected_behavior: "Surface gold items."', + " expected_outcome: resolved", + " retrieval:", + " golden:", + ' - "Sarah\\u0027s email"', + ' - "Atlas project status"', + " k: 5", + "", + ].join("\n"), + "utf8", + ); + + const parsed = parseScenarioYaml(path); + const scenario = parsed.scenarios[0]; + + expect(scenario?.retrieval).toBeDefined(); + expect(scenario?.retrieval?.golden).toEqual([ + "Sarah's email", + "Atlas project status", + ]); + expect(scenario?.retrieval?.k).toBe(5); + expect(scenario?.retrieval?.weights.precision_at_k).toBe(1); + expect(scenario?.retrieval?.weights.recall_at_k).toBe(1); + expect(scenario?.retrieval?.weights.mrr).toBe(1); + expect(scenario?.retrieval?.weights.ndcg_at_k).toBe(1); + expect(scenario?.retrieval?.passThreshold).toBe(0.5); + expect(scenario?.retrieval?.match).toBe("substring"); + }); + + test("parses retrieval block with custom weights, forbidden, threshold, and source", () => { + const path = join( + makeTempDir("scenario-retrieval-custom"), + "scenarios.yaml", + ); + writeFileSync( + path, + [ + "defaults:", + " persona: shopper", + " rubric: support", + "scenarios:", + " - id: retrieval-custom", + ' name: "Retrieval custom"', + " turns:", + " - role: user", + ' content: "what is our Q2 budget?"', + " expectations:", + ' expected_behavior: "Honor forget request."', + " expected_outcome: resolved", + " retrieval:", + " golden:", + ' - "I do not have that"', + " forbidden:", + ' - "$50K"', + " k: 3", + " pass_threshold: 0.6", + " match: substring", + " weight:", + " precision_at_k: 0.5", + " recall_at_k: 2.0", + " mrr: 1.0", + " ndcg_at_k: 1.5", + " source:", + ' raw_exchange_key: "memories"', + "", + ].join("\n"), + "utf8", + ); + + const parsed = parseScenarioYaml(path); + const scenario = parsed.scenarios[0]; + + expect(scenario?.retrieval?.forbidden).toEqual(["$50K"]); + expect(scenario?.retrieval?.k).toBe(3); + expect(scenario?.retrieval?.passThreshold).toBeCloseTo(0.6, 6); + 
expect(scenario?.retrieval?.weights.recall_at_k).toBe(2); + expect(scenario?.retrieval?.weights.precision_at_k).toBe(0.5); + expect(scenario?.retrieval?.source?.rawExchangeKey).toBe("memories"); + }); + + test("rejects retrieval config with empty golden", () => { + const path = join( + makeTempDir("scenario-retrieval-empty-golden"), + "scenarios.yaml", + ); + writeFileSync( + path, + [ + "defaults:", + " persona: shopper", + " rubric: support", + "scenarios:", + " - id: retrieval-empty", + ' name: "Retrieval empty"', + " turns:", + " - role: user", + ' content: "x"', + " expectations:", + ' expected_behavior: "x"', + " expected_outcome: resolved", + " retrieval:", + " golden: []", + "", + ].join("\n"), + "utf8", + ); + + expect(() => parseScenarioYaml(path)).toThrow( + /retrieval.golden must be a non-empty/, + ); + }); + + test("rejects unknown retrieval metric weight keys", () => { + const path = join( + makeTempDir("scenario-retrieval-bad-weight"), + "scenarios.yaml", + ); + writeFileSync( + path, + [ + "defaults:", + " persona: shopper", + " rubric: support", + "scenarios:", + " - id: retrieval-bad-weight", + ' name: "Retrieval bad weight"', + " turns:", + " - role: user", + ' content: "x"', + " expectations:", + ' expected_behavior: "x"', + " expected_outcome: resolved", + " retrieval:", + " golden:", + ' - "foo"', + " weight:", + " hit_rate: 1.0", + "", + ].join("\n"), + "utf8", + ); + + expect(() => parseScenarioYaml(path)).toThrow( + /Unknown retrieval metric key/, + ); + }); + test("parses session max_turns and scenario base_date", () => { const path = join(makeTempDir("scenario-sessions"), "scenarios.yaml"); writeFileSync( diff --git a/tests/unit/persistence/drizzle-schema.test.ts b/tests/unit/persistence/drizzle-schema.test.ts index a3b5b1a..c28462e 100644 --- a/tests/unit/persistence/drizzle-schema.test.ts +++ b/tests/unit/persistence/drizzle-schema.test.ts @@ -12,12 +12,16 @@ import { const expectedTables = [ "app_settings", "checkpoints", + 
"dedup_scores", + "demotion_scores", "endpoint_overrides", "human_dimension_scores", "judge_dimension_scores", "meta", "preset_scenarios", "presets", + "procedure_scores", + "retrieval_scores", "runs", "scenario_runs", "target_events", @@ -33,12 +37,12 @@ function schemaTableNames(schema: Record): string[] { describe("Drizzle schema mirrors persistence schema contracts", () => { test("declares the complete SQLite table inventory for the current target version", () => { - expect(SQLITE_TARGET_VERSION).toBe(8); + expect(SQLITE_TARGET_VERSION).toBe(10); expect(schemaTableNames(sqliteSchema)).toEqual(expectedTables); }); test("declares the complete Postgres table inventory for the current target version", () => { - expect(POSTGRES_TARGET_VERSION).toBe(4); + expect(POSTGRES_TARGET_VERSION).toBe(6); expect(schemaTableNames(postgresSchema)).toEqual(expectedTables); }); }); diff --git a/tests/unit/persistence/migrations.test.ts b/tests/unit/persistence/migrations.test.ts index 5b5a4d1..4e91bba 100644 --- a/tests/unit/persistence/migrations.test.ts +++ b/tests/unit/persistence/migrations.test.ts @@ -76,7 +76,7 @@ describe("migration dispatcher", () => { const url = `sqlite:///${path}`; const report = await runMigrations(url); expect(report.currentVersion).toBe(1); - expect(report.applied).toEqual([2, 3, 4, 5, 6, 7, 8]); + expect(report.applied).toEqual([2, 3, 4, 5, 6, 7, 8, 9, 10]); expect(report.targetVersion).toBe(SQLITE_TARGET_VERSION); }); @@ -94,7 +94,7 @@ describe("migration dispatcher", () => { const report = await runMigrations(url); expect(report.currentVersion).toBe(2); - expect(report.applied).toEqual([3, 4]); + expect(report.applied).toEqual([3, 4, 5, 6]); expect(report.targetVersion).toBe(POSTGRES_TARGET_VERSION); const check = await checkSchemaVersion(url); diff --git a/tests/unit/retrieval-memory.test.ts b/tests/unit/retrieval-memory.test.ts new file mode 100644 index 0000000..46fd8bb --- /dev/null +++ b/tests/unit/retrieval-memory.test.ts @@ -0,0 +1,113 @@ 
+import { describe, expect, test } from "bun:test"; +import { resolve } from "node:path"; +import { scoreRetrieval } from "../../src/domains/evaluation/retrieval-scorer.ts"; +import { + parseRubricsYaml, + parseScenarioYaml, +} from "../../src/domains/validation/load-suite.ts"; + +const SCENARIOS_PATH = resolve( + import.meta.dir, + "..", + "..", + "data", + "retrieval-memory.yaml", +); +const RUBRIC_PATH = resolve(import.meta.dir, "..", "..", "data", "rubric.yaml"); + +describe("retrieval-memory pack", () => { + const parsed = parseScenarioYaml(SCENARIOS_PATH); + const scenarios = parsed.scenarios; + + test("declares at least five ranking-scored scenarios", () => { + const withRetrieval = scenarios.filter((s) => s.retrieval !== undefined); + expect(withRetrieval.length).toBeGreaterThanOrEqual(5); + }); + + test("every scenario references a known memory rubric", () => { + const rubrics = parseRubricsYaml(RUBRIC_PATH); + const rubricIds = new Set(rubrics.rubrics.map((r) => r.id)); + for (const scenario of scenarios) { + expect(scenario.rubric).toBeDefined(); + expect(rubricIds.has(scenario.rubric ?? "")).toBe(true); + } + }); + + test("each retrieval block uses fixture source that exists relative to the YAML", () => { + for (const scenario of scenarios) { + const fixture = scenario.retrieval?.source?.fixture; + expect(fixture).toBeDefined(); + // Resolve relative to YAML dir. + const resolved = resolve(SCENARIOS_PATH, "..", fixture ?? 
""); + // Sanity-check the file exists (Bun.file.exists is sync via existsSync) + const exists = require("node:fs").existsSync(resolved); + expect(exists).toBe(true); + } + }); + + function requireScenario(id: string) { + const scenario = scenarios.find((s) => s.id === id); + if (!scenario) { + throw new Error(`Missing scenario in pack: ${id}`); + } + return scenario; + } + + test("forget-on-request scenario forbids the budget figure and passes against the fixture", () => { + const scenario = requireScenario("mem-retrieval-forget-on-request"); + expect(scenario.retrieval?.forbidden ?? []).toContain("$50K"); + + const score = scoreRetrieval(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.forbiddenHits).toBe(0); + }); + + test("warm-context scenario scores its happy-path fixture as passed", () => { + const scenario = requireScenario("mem-retrieval-warm-context-sarah"); + + const score = scoreRetrieval(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.source).toBe("fixture"); + expect(score?.passed).toBe(true); + expect(score?.hitCount).toBeGreaterThanOrEqual(2); + }); + + test("stale-fact demotion scenario passes when only the new pricing surfaces", () => { + const scenario = requireScenario("mem-retrieval-stale-fact-demotion"); + + const score = scoreRetrieval(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + // The fixture intentionally still contains the superseded item to prove + // the forbidden-hit check is active. So this scenario, when run against + // its committed fixture, should fail. That documents the negative-test + // intent of the YAML: swap the fixture for an actual retrieval payload, + // and a correctly-functioning dream pass would have demoted the old + // pricing out of the top-k. 
+ expect(score?.source).toBe("fixture"); + expect(score?.forbiddenHits).toBeGreaterThan(0); + expect(score?.passed).toBe(false); + }); + + test("scope-filter scenario passes against its in-scope-only fixture", () => { + const scenario = requireScenario("mem-retrieval-scope-filter-project"); + + const score = scoreRetrieval(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.forbiddenHits).toBe(0); + expect(score?.passed).toBe(true); + }); + + test("cascading-expiry scenario passes when the entity's facts are gone but adjacent facts remain", () => { + const scenario = requireScenario("mem-retrieval-cascading-expiry"); + + const score = scoreRetrieval(scenario, { + scenariosPath: SCENARIOS_PATH, + }); + expect(score?.forbiddenHits).toBe(0); + expect(score?.passed).toBe(true); + }); +}); diff --git a/tests/unit/runner.test.ts b/tests/unit/runner.test.ts index 05c0fa6..bc27ee3 100644 --- a/tests/unit/runner.test.ts +++ b/tests/unit/runner.test.ts @@ -1642,4 +1642,111 @@ describe("runner", () => { // Judge must NOT have been called. 
expect(client.calls).toHaveLength(0); }); + + test("runScenario scores retrieval from rawExchange and surfaces it on the result", async () => { + const adapter = new FakeAdapter([ + adapterReply("Here are the relevant memories.", { + rawExchange: { + retrieved: ["Sarah's email", "Atlas project status"], + } as unknown as Record, + }), + ]); + const client = new FakeResponsesClient([ + buildPersonaStep("completed"), + buildScore(), + ]); + + const scenario = buildScenario({ + turns: [ + { + role: "user", + content: "What do you remember about Sarah?", + useExactMessage: true, + attachments: [], + }, + ], + }); + scenario.retrieval = { + golden: ["Sarah's email", "Atlas project status"], + forbidden: [], + weights: { + precision_at_k: 1, + recall_at_k: 1, + mrr: 1, + ndcg_at_k: 1, + }, + passThreshold: 0.5, + match: "substring", + k: 2, + }; + + const result = await runScenario( + adapter, + scenario, + buildPersona(), + buildRubric(), + { + client: asResponsesClient(client) as never, + }, + ); + + expect(result.retrievalScore).toBeDefined(); + expect(result.retrievalScore?.passed).toBe(true); + expect(result.retrievalScore?.hitCount).toBe(2); + expect(result.retrievalScore?.source).toBe("raw_exchange"); + expect(result.passed).toBe(true); + }); + + test("runScenario fails when retrieval contains a forbidden hit, even if the judge passes", async () => { + const adapter = new FakeAdapter([ + adapterReply("Sure thing.", { + rawExchange: { + retrieved: ["I do not have that on file", "Old budget was $50K"], + } as unknown as Record, + }), + ]); + const client = new FakeResponsesClient([ + buildPersonaStep("completed"), + buildScore({ score: 5 }), + ]); + + const scenario = buildScenario({ + turns: [ + { + role: "user", + content: "What's our Q2 budget?", + useExactMessage: true, + attachments: [], + }, + ], + }); + scenario.retrieval = { + golden: ["I do not have that"], + forbidden: ["$50K"], + weights: { + precision_at_k: 1, + recall_at_k: 1, + mrr: 1, + ndcg_at_k: 1, 
+ }, + passThreshold: 0.2, + match: "substring", + k: 5, + }; + + const result = await runScenario( + adapter, + scenario, + buildPersona(), + buildRubric(), + { + client: asResponsesClient(client) as never, + }, + ); + + expect(result.judgeScore?.passed).toBe(true); + expect(result.retrievalScore?.forbiddenHits).toBe(1); + expect(result.retrievalScore?.passed).toBe(false); + expect(result.passed).toBe(false); + }); }); diff --git a/tests/unit/server/default-presets.test.ts b/tests/unit/server/default-presets.test.ts index 0e2790e..d3f975c 100644 --- a/tests/unit/server/default-presets.test.ts +++ b/tests/unit/server/default-presets.test.ts @@ -5,6 +5,7 @@ import { join } from "node:path"; import { SqliteRepository } from "../../../src/providers/persistence/sqlite-backend.ts"; import { SuiteController } from "../../../src/runtime/server/controllers/suite-controller.ts"; import { + FULL_MEMORY_DEFAULT_PRESET, PRE_RELEASE_DEFAULT_PRESET, seedDefaultPresets, } from "../../../src/runtime/server/default-presets.ts"; @@ -25,14 +26,23 @@ describe("default preset seeding", () => { repository, suiteController, }); - expect(results[0]).toMatchObject({ + const preReleaseResult = results.find( + (r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name, + ); + expect(preReleaseResult).toMatchObject({ name: PRE_RELEASE_DEFAULT_PRESET.name, status: "created", }); + expect( + results.find((r) => r.name === FULL_MEMORY_DEFAULT_PRESET.name), + ).toMatchObject({ + name: FULL_MEMORY_DEFAULT_PRESET.name, + status: "created", + }); const presets = await repository.listPresets(); - expect(presets).toHaveLength(1); - const preset = presets[0]; + expect(presets).toHaveLength(2); + const preset = presets.find((p) => p.name === "Pre Release Checks"); expect(preset).toMatchObject({ name: "Pre Release Checks", description: null, @@ -44,17 +54,31 @@ describe("default preset seeding", () => { dryRun: false, }); expect(preset?.selection).toEqual(PRE_RELEASE_DEFAULT_PRESET.selection); + const 
memoryPreset = presets.find( + (p) => p.name === FULL_MEMORY_DEFAULT_PRESET.name, + ); + expect(memoryPreset?.selection).toEqual( + FULL_MEMORY_DEFAULT_PRESET.selection, + ); const secondPass = await seedDefaultPresets({ repository, suiteController, }); - expect(secondPass[0]).toMatchObject({ + expect( + secondPass.find((r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name), + ).toMatchObject({ name: PRE_RELEASE_DEFAULT_PRESET.name, status: "existing", presetId: preset?.id, }); - expect(await repository.listPresets()).toHaveLength(1); + expect( + secondPass.find((r) => r.name === FULL_MEMORY_DEFAULT_PRESET.name), + ).toMatchObject({ + name: FULL_MEMORY_DEFAULT_PRESET.name, + status: "existing", + }); + expect(await repository.listPresets()).toHaveLength(2); }); test("restores a soft-deleted default preset by name", async () => { @@ -62,24 +86,31 @@ describe("default preset seeding", () => { await repository.initialize(); const suiteController = new SuiteController({ dataPath: DATA_DIR }); await seedDefaultPresets({ repository, suiteController }); - const seeded = (await repository.listPresets())[0]; + const seeded = (await repository.listPresets()).find( + (p) => p.name === PRE_RELEASE_DEFAULT_PRESET.name, + ); expect(seeded).toBeDefined(); await repository.softDeletePreset(seeded?.id ?? ""); - expect(await repository.listPresets()).toHaveLength(0); + expect(await repository.listPresets()).toHaveLength(1); const results = await seedDefaultPresets({ repository, suiteController, }); - expect(results[0]).toMatchObject({ + expect( + results.find((r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name), + ).toMatchObject({ name: PRE_RELEASE_DEFAULT_PRESET.name, status: "restored", presetId: seeded?.id, }); const presets = await repository.listPresets(); - expect(presets).toHaveLength(1); - expect(presets[0]?.deletedAt ?? null).toBeNull(); + expect(presets).toHaveLength(2); + expect( + presets.find((p) => p.name === PRE_RELEASE_DEFAULT_PRESET.name) + ?.deletedAt ?? 
null, + ).toBeNull(); }); test("skips seeding when the data root does not include packaged default files", async () => { @@ -95,10 +126,18 @@ describe("default preset seeding", () => { repository, suiteController: new SuiteController({ dataPath }), }); - expect(results[0]).toMatchObject({ + expect( + results.find((r) => r.name === PRE_RELEASE_DEFAULT_PRESET.name), + ).toMatchObject({ name: PRE_RELEASE_DEFAULT_PRESET.name, status: "skipped", }); + expect( + results.find((r) => r.name === FULL_MEMORY_DEFAULT_PRESET.name), + ).toMatchObject({ + name: FULL_MEMORY_DEFAULT_PRESET.name, + status: "skipped", + }); expect(await repository.listPresets()).toHaveLength(0); }); });