diff --git a/GOLDEN/datasets/graphrag/EVID_graphrag_attribute_prediction_2882bce6.json b/GOLDEN/datasets/graphrag/EVID_graphrag_attribute_prediction_2882bce6.json new file mode 100644 index 00000000000..8dba4be7b75 --- /dev/null +++ b/GOLDEN/datasets/graphrag/EVID_graphrag_attribute_prediction_2882bce6.json @@ -0,0 +1 @@ +{"id":"EVID:graphrag:attribute_prediction:2882bce6","task":"Predict missing attributes of entities using local graph structure","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/graphrag/EVID_graphrag_attribute_prediction_30a10bee.json b/GOLDEN/datasets/graphrag/EVID_graphrag_attribute_prediction_30a10bee.json new file mode 100644 index 00000000000..dbbf27561f4 --- /dev/null +++ b/GOLDEN/datasets/graphrag/EVID_graphrag_attribute_prediction_30a10bee.json @@ -0,0 +1 @@ +{"id":"EVID:graphrag:attribute_prediction:30a10bee","task":"Predict missing attributes of entities using local graph structure","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/graphrag/EVID_graphrag_community_detection_59c8dab1.json b/GOLDEN/datasets/graphrag/EVID_graphrag_community_detection_59c8dab1.json new file mode 100644 index 00000000000..62fe3082c7d --- /dev/null +++ b/GOLDEN/datasets/graphrag/EVID_graphrag_community_detection_59c8dab1.json @@ -0,0 +1 @@ +{"id":"EVID:graphrag:community_detection:59c8dab1","task":"Identify sub-communities within a large knowledge graph","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/graphrag/EVID_graphrag_community_detection_b4678e68.json b/GOLDEN/datasets/graphrag/EVID_graphrag_community_detection_b4678e68.json new file mode 100644 index 00000000000..1d2c5be0472 --- /dev/null +++ b/GOLDEN/datasets/graphrag/EVID_graphrag_community_detection_b4678e68.json @@ -0,0 +1 @@ +{"id":"EVID:graphrag:community_detection:b4678e68","task":"Identify sub-communities within a large knowledge graph","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/graphrag/cases.json b/GOLDEN/datasets/graphrag/cases.json index c34783af37e..e57a16dae2e 100644 --- a/GOLDEN/datasets/graphrag/cases.json +++ b/GOLDEN/datasets/graphrag/cases.json @@ -79,5 +79,21 @@ "EVID:graphrag:cross_domain_synthesis:94dfef54" ], "task": "Evaluate Cross domain synthesis capabilities" + }, + { + "id": "EVID:graphrag:community_detection", + "required_evidence": [ + "EVID:graphrag:community_detection:b4678e68", + "EVID:graphrag:community_detection:59c8dab1" + ], + "task": "Identify sub-communities within a large knowledge graph" + }, + { + "id": "EVID:graphrag:attribute_prediction", + "required_evidence": [ + "EVID:graphrag:attribute_prediction:30a10bee", + "EVID:graphrag:attribute_prediction:2882bce6" + ], + "task": "Predict missing attributes of entities using local graph structure" } ] \ No newline at end of file diff --git a/GOLDEN/datasets/multi-agent/EVID_multiagent_resource_allocation_8efecaad.json b/GOLDEN/datasets/multi-agent/EVID_multiagent_resource_allocation_8efecaad.json new file mode 100644 index 00000000000..99f1de15a4c --- /dev/null +++ b/GOLDEN/datasets/multi-agent/EVID_multiagent_resource_allocation_8efecaad.json @@ -0,0 +1 @@ +{"id":"EVID:multiagent:resource_allocation:8efecaad","task":"Distribute limited resources among competing agents","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/multi-agent/EVID_multiagent_resource_allocation_e0f2b6f1.json b/GOLDEN/datasets/multi-agent/EVID_multiagent_resource_allocation_e0f2b6f1.json new file mode 100644 index 00000000000..4c4f7549ee9 --- /dev/null +++ b/GOLDEN/datasets/multi-agent/EVID_multiagent_resource_allocation_e0f2b6f1.json @@ -0,0 +1 @@ +{"id":"EVID:multiagent:resource_allocation:e0f2b6f1","task":"Distribute limited resources among competing agents","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/multi-agent/EVID_multiagent_role_discovery_7514ea55.json b/GOLDEN/datasets/multi-agent/EVID_multiagent_role_discovery_7514ea55.json new file mode 100644 index 00000000000..1a7718896f9 --- /dev/null +++ b/GOLDEN/datasets/multi-agent/EVID_multiagent_role_discovery_7514ea55.json @@ -0,0 +1 @@ +{"id":"EVID:multiagent:role_discovery:7514ea55","task":"Agents dynamically discover and assume roles based on task requirements","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/multi-agent/EVID_multiagent_role_discovery_a6e46d05.json b/GOLDEN/datasets/multi-agent/EVID_multiagent_role_discovery_a6e46d05.json new file mode 100644 index 00000000000..b128efafcd1 --- /dev/null +++ b/GOLDEN/datasets/multi-agent/EVID_multiagent_role_discovery_a6e46d05.json @@ -0,0 +1 @@ +{"id":"EVID:multiagent:role_discovery:a6e46d05","task":"Agents dynamically discover and assume roles based on task requirements","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/multi-agent/cases.json b/GOLDEN/datasets/multi-agent/cases.json index b6144552dd2..6935dbcdbda 100644 --- a/GOLDEN/datasets/multi-agent/cases.json +++ b/GOLDEN/datasets/multi-agent/cases.json @@ -118,5 +118,21 @@ "EVID:multiagent:hierarchical_planning:fc86bbe4" ], "task": "Evaluate Hierarchical planning capabilities" + }, + { + "id": "EVID:multiagent:resource_allocation", + "required_evidence": [ + "EVID:multiagent:resource_allocation:8efecaad", + "EVID:multiagent:resource_allocation:e0f2b6f1" + ], + "task": "Distribute limited resources among competing agents" + }, + { + "id": "EVID:multiagent:role_discovery", + "required_evidence": [ + "EVID:multiagent:role_discovery:7514ea55", + "EVID:multiagent:role_discovery:a6e46d05" + ], + "task": "Agents dynamically discover and assume roles based on task requirements" } ] \ No newline at end of file diff --git a/GOLDEN/datasets/tooluse/EVID_tooluse_code_execution_1dd8e73e.json b/GOLDEN/datasets/tooluse/EVID_tooluse_code_execution_1dd8e73e.json new file mode 100644 index 00000000000..03c11771fad --- /dev/null +++ b/GOLDEN/datasets/tooluse/EVID_tooluse_code_execution_1dd8e73e.json @@ -0,0 +1 @@ +{"id":"EVID:tooluse:code_execution:1dd8e73e","task":"Execute generated code and handle runtime errors","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/tooluse/EVID_tooluse_code_execution_c7d5cea8.json b/GOLDEN/datasets/tooluse/EVID_tooluse_code_execution_c7d5cea8.json new file mode 100644 index 00000000000..8d7bae3dd9c --- /dev/null +++ b/GOLDEN/datasets/tooluse/EVID_tooluse_code_execution_c7d5cea8.json @@ -0,0 +1 @@ +{"id":"EVID:tooluse:code_execution:c7d5cea8","task":"Execute generated code and handle runtime errors","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/tooluse/EVID_tooluse_file_system_2571feb3.json b/GOLDEN/datasets/tooluse/EVID_tooluse_file_system_2571feb3.json new file mode 100644 index 00000000000..f7dfb14cb1b --- /dev/null +++ b/GOLDEN/datasets/tooluse/EVID_tooluse_file_system_2571feb3.json @@ -0,0 +1 @@ +{"id":"EVID:tooluse:file_system:2571feb3","task":"Perform multi-step file system operations","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/tooluse/EVID_tooluse_file_system_bab1d56c.json b/GOLDEN/datasets/tooluse/EVID_tooluse_file_system_bab1d56c.json new file mode 100644 index 00000000000..a659892a01a --- /dev/null +++ b/GOLDEN/datasets/tooluse/EVID_tooluse_file_system_bab1d56c.json @@ -0,0 +1 @@ +{"id":"EVID:tooluse:file_system:bab1d56c","task":"Perform multi-step file system operations","meta":"generated"} \ No newline at end of file diff --git a/GOLDEN/datasets/tooluse/cases.json b/GOLDEN/datasets/tooluse/cases.json index f66707b42d9..92c7866f3ab 100644 --- a/GOLDEN/datasets/tooluse/cases.json +++ b/GOLDEN/datasets/tooluse/cases.json @@ -91,5 +91,21 @@ "EVID:tooluse:nested_invocation:ff0006de" ], "task": "Evaluate Nested invocation capabilities" + }, + { + "id": "EVID:tooluse:file_system", + "required_evidence": [ + "EVID:tooluse:file_system:2571feb3", + "EVID:tooluse:file_system:bab1d56c" + ], + "task": "Perform multi-step file system operations" + }, + { + "id": "EVID:tooluse:code_execution", + "required_evidence": [ + "EVID:tooluse:code_execution:c7d5cea8", + "EVID:tooluse:code_execution:1dd8e73e" + ], + "task": "Execute generated code and handle runtime errors" } ] \ No newline at end of file diff --git a/evaluation/scoring/evidence.ts b/evaluation/scoring/evidence.ts index 00365c78c82..7d9cc4845e6 100644 --- a/evaluation/scoring/evidence.ts +++ b/evaluation/scoring/evidence.ts @@ -55,3 +55,14 @@ export function aggregateScores(scores: EvidenceScore[]): EvidenceScore { f1_score: sum.f1_score / scores.length, }; } + +export function scoreEvidencePrecision(required: string[], provided: string[]): number { + if (required.length === 0) return 1.0; + const hits = required.filter(r => provided.includes(r)).length; + return hits / required.length; +} + +export function scoreToolEfficiency(optimalSteps: number, actualSteps: number): number { + if (actualSteps <= optimalSteps) return 1.0; + return Math.max(0, optimalSteps / actualSteps); +}