Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:graphrag:attribute_prediction:2882bce6","task":"Predict missing attributes of entities using local graph structure","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:graphrag:attribute_prediction:30a10bee","task":"Predict missing attributes of entities using local graph structure","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:graphrag:community_detection:59c8dab1","task":"Identify sub-communities within a large knowledge graph","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:graphrag:community_detection:b4678e68","task":"Identify sub-communities within a large knowledge graph","meta":"generated"}
16 changes: 16 additions & 0 deletions GOLDEN/datasets/graphrag/cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,21 @@
"EVID:graphrag:cross_domain_synthesis:94dfef54"
],
"task": "Evaluate Cross domain synthesis capabilities"
},
{
"id": "EVID:graphrag:community_detection",
"required_evidence": [
"EVID:graphrag:community_detection:b4678e68",
"EVID:graphrag:community_detection:59c8dab1"
],
"task": "Identify sub-communities within a large knowledge graph"
},
{
"id": "EVID:graphrag:attribute_prediction",
"required_evidence": [
"EVID:graphrag:attribute_prediction:30a10bee",
"EVID:graphrag:attribute_prediction:2882bce6"
],
"task": "Predict missing attributes of entities using local graph structure"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:multiagent:resource_allocation:8efecaad","task":"Distribute limited resources among competing agents","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:multiagent:resource_allocation:e0f2b6f1","task":"Distribute limited resources among competing agents","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:multiagent:role_discovery:7514ea55","task":"Agents dynamically discover and assume roles based on task requirements","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:multiagent:role_discovery:a6e46d05","task":"Agents dynamically discover and assume roles based on task requirements","meta":"generated"}
16 changes: 16 additions & 0 deletions GOLDEN/datasets/multi-agent/cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -118,5 +118,21 @@
"EVID:multiagent:hierarchical_planning:fc86bbe4"
],
"task": "Evaluate Hierarchical planning capabilities"
},
{
"id": "EVID:multiagent:resource_allocation",
"required_evidence": [
"EVID:multiagent:resource_allocation:8efecaad",
"EVID:multiagent:resource_allocation:e0f2b6f1"
],
"task": "Distribute limited resources among competing agents"
},
{
"id": "EVID:multiagent:role_discovery",
"required_evidence": [
"EVID:multiagent:role_discovery:7514ea55",
"EVID:multiagent:role_discovery:a6e46d05"
],
"task": "Agents dynamically discover and assume roles based on task requirements"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:tooluse:code_execution:1dd8e73e","task":"Execute generated code and handle runtime errors","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:tooluse:code_execution:c7d5cea8","task":"Execute generated code and handle runtime errors","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:tooluse:file_system:2571feb3","task":"Perform multi-step file system operations","meta":"generated"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"id":"EVID:tooluse:file_system:bab1d56c","task":"Perform multi-step file system operations","meta":"generated"}
16 changes: 16 additions & 0 deletions GOLDEN/datasets/tooluse/cases.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,5 +91,21 @@
"EVID:tooluse:nested_invocation:ff0006de"
],
"task": "Evaluate Nested invocation capabilities"
},
{
"id": "EVID:tooluse:file_system",
"required_evidence": [
"EVID:tooluse:file_system:2571feb3",
"EVID:tooluse:file_system:bab1d56c"
],
"task": "Perform multi-step file system operations"
},
{
"id": "EVID:tooluse:code_execution",
"required_evidence": [
"EVID:tooluse:code_execution:c7d5cea8",
"EVID:tooluse:code_execution:1dd8e73e"
],
"task": "Execute generated code and handle runtime errors"
}
]
11 changes: 11 additions & 0 deletions evaluation/scoring/evidence.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,14 @@ export function aggregateScores(scores: EvidenceScore[]): EvidenceScore {
f1_score: sum.f1_score / scores.length,
};
}

export function scoreEvidencePrecision(required: string[], provided: string[]): number {
if (required.length === 0) return 1.0;
const hits = required.filter(r => provided.includes(r)).length;
return hits / required.length;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1: Compute precision using the provided evidence count

This implementation divides by required.length, which computes recall/coverage, not precision. If a run returns all required evidence plus many irrelevant IDs, the score still becomes 1.0 (for example, 2 required hits out of 20 provided), so false positives are never penalized and benchmark precision is systematically inflated. Precision should use the provided evidence count (ideally deduplicated) as the denominator.

Useful? React with 👍 / 👎.

}
Comment on lines +59 to +63
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The function name scoreEvidencePrecision is misleading because the implementation calculates Recall (hits / required.length) rather than Precision (hits / provided.length). Additionally, using provided.includes(r) inside a filter results in O(N * M) complexity. Using a Set for the provided items improves performance to O(N + M).

Suggested change
/**
 * Scores the precision of the evidence a run supplied: the fraction of the
 * distinct provided evidence IDs that are also listed as required.
 *
 * The denominator is the number of distinct provided IDs, so irrelevant extra
 * IDs lower the score. Dividing by `required.length` instead would measure
 * recall and let false positives go unpenalised.
 *
 * @param required - Evidence IDs the case expects.
 * @param provided - Evidence IDs the run supplied; repeated IDs count once.
 * @returns A value in [0, 1]. Returns 1.0 when `required` is empty and 0 when
 *   evidence is required but `provided` is empty.
 */
export function scoreEvidencePrecision(required: string[], provided: string[]): number {
  // Nothing is required, so there is nothing to be imprecise about.
  if (required.length === 0) return 1.0;

  const requiredIds = new Set(required);
  const providedIds = new Set(provided);

  // Avoid 0 / 0: no evidence supplied while some was required scores zero.
  if (providedIds.size === 0) return 0;

  let hits = 0;
  for (const id of providedIds) {
    if (requiredIds.has(id)) hits += 1;
  }
  return hits / providedIds.size;
}
/**
 * Measures how much of the required evidence a run supplied.
 *
 * @param required - Evidence IDs the case expects; each entry is counted,
 *   including repeats.
 * @param provided - Evidence IDs the run supplied.
 * @returns The share of `required` entries found in `provided`, or 1.0 when
 *   `required` is empty.
 */
export function scoreEvidenceRecall(required: string[], provided: string[]): number {
  const total = required.length;
  if (total === 0) {
    return 1.0;
  }

  // Set membership keeps each lookup constant-time.
  const available = new Set(provided);
  let matched = 0;
  for (const id of required) {
    if (available.has(id)) {
      matched += 1;
    }
  }
  return matched / total;
}


/**
 * Scores how efficiently a run used its steps relative to an optimal count.
 *
 * @param optimalSteps - Number of steps an ideal solution takes.
 * @param actualSteps - Number of steps the run actually took.
 * @returns 1.0 when the run used no more steps than optimal, otherwise
 *   `optimalSteps / actualSteps` clamped to a minimum of 0. A run that took
 *   zero steps scores 1.0 only if zero steps were optimal; otherwise 0.
 */
export function scoreToolEfficiency(optimalSteps: number, actualSteps: number): number {
  // Zero steps against a non-zero optimum means nothing was attempted; without
  // this guard it would pass the `<=` check below and score as perfect.
  if (actualSteps === 0) return optimalSteps === 0 ? 1.0 : 0.0;
  if (actualSteps <= optimalSteps) return 1.0;
  // Clamp so a negative optimalSteps cannot produce a negative score.
  return Math.max(0, optimalSteps / actualSteps);
}
Comment on lines +65 to +68
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation returns 1.0 (perfect efficiency) when actualSteps is 0 and optimalSteps is greater than 0. In a benchmark context, taking zero steps when some are required usually indicates a failure to attempt the task. It is safer to return 0.0 in this case unless optimalSteps is also 0.

Suggested change
/**
 * Scores step efficiency as the ratio of optimal to actual steps, capped at 1.0.
 *
 * @param optimalSteps - Number of steps an ideal solution takes.
 * @param actualSteps - Number of steps the run actually took.
 * @returns 1.0 when `actualSteps` does not exceed `optimalSteps`, otherwise
 *   `optimalSteps / actualSteps` with a floor of 0. Zero actual steps scores 0
 *   unless `optimalSteps` is also 0.
 */
export function scoreToolEfficiency(optimalSteps: number, actualSteps: number): number {
  if (actualSteps === 0) {
    // Taking no steps is only "efficient" when no steps were needed.
    return optimalSteps === 0 ? 1.0 : 0.0;
  }
  if (actualSteps <= optimalSteps) {
    return 1.0;
  }
  return Math.max(0, optimalSteps / actualSteps);
}
/**
 * Rates a run's step count against the optimal step count.
 *
 * @param optimalSteps - Number of steps an ideal solution takes.
 * @param actualSteps - Number of steps the run actually took.
 * @returns 1.0 if the run took no more steps than optimal; 0 if it took zero
 *   steps while `optimalSteps` is non-zero; otherwise
 *   `optimalSteps / actualSteps`, never below 0.
 */
export function scoreToolEfficiency(optimalSteps: number, actualSteps: number): number {
  const tookNoSteps = actualSteps === 0;
  if (tookNoSteps) {
    if (optimalSteps === 0) {
      return 1.0;
    }
    return 0.0;
  }

  if (optimalSteps >= actualSteps) {
    return 1.0;
  }

  const ratio = optimalSteps / actualSteps;
  return Math.max(0, ratio);
}

Loading