This repository was archived by the owner on Mar 12, 2026. It is now read-only.
Proposed New Leaderboard Schema #25
Merged: damian1996 merged 7 commits into evaleval:main from akornilotrust:akornilo/leaderboard-schema on Oct 1, 2025.
Changes from 6 commits
Commits:
- 3abe94e: proposed new leaderboard schema
- a189e5e: Add config for sample level data
- 2dbf00e: fix indent + add score level names
- 8ceca24: remove some unused keys
- 64b190e: Additional cleanup
- ccd05dd: add Python schema
- c6f56ca: updated schema
New file (+354 lines):

```json
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "0.0.1",
  "type": "object",
  "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
  "required": [
    "schema_version",
    "evaluation_id",
    "model_info",
    "evaluation_results"
  ],
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this evaluation data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Unique identifier for this specific evaluation run"
    },
    "model_info": {
      "type": "object",
      "description": "Complete model specification, including basic information, technical configuration, and inference settings",
      "required": [
        "name",
        "source_url"
      ],
      "properties": {
        "name": {
          "type": "string",
          "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')"
        },
        "source_url": {
          "type": "string",
          "description": "URL for the source of the evaluation data"
        },
        "provider_name": {
          "type": "string",
          "description": "Name of the provider of the model version used during evaluation"
        },
        "developer": {
          "type": "string",
          "description": "Name of the organization that developed the model (e.g., 'OpenAI')"
        },
        "inference_settings": {
          "type": "object",
          "description": "Runtime settings and parameters for model inference; controls how the model generates outputs and performs during execution",
          "properties": {
            "quantization_method": {
              "type": "string",
              "description": "Quantization method used for the model (e.g., GPTQ)"
            },
            "generation_args": {
| "type": "object", | ||
| "properties": { | ||
| "temperature": { | ||
| "type": [ | ||
| "null", | ||
| "number" | ||
| ], | ||
| "description": "Sampling temperature" | ||
| }, | ||
| "top_p": { | ||
| "type": [ | ||
| "null", | ||
| "number" | ||
| ], | ||
| "description": "Nucleus sampling parameter" | ||
| }, | ||
| "top_k": { | ||
| "type": [ | ||
| "null", | ||
| "number" | ||
| ], | ||
| "description": "Top-k sampling parameter" | ||
| }, | ||
| "max_tokens": { | ||
| "type": "integer", | ||
| "minimum": 1, | ||
| "description": "Maximum number of tokens to generate" | ||
| } | ||
| }, | ||
| "additionalProperties": true | ||
| } | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "evaluation_results": { | ||
| "type": "array", | ||
| "description": "Array of evaluation results", | ||
| "items": { | ||
| "type": "object", | ||
| "required": [ | ||
| "evaluation_name", | ||
| "metric_config", | ||
| "score_details" | ||
| ], | ||
| "properties": { | ||
| "evaluation_name": { | ||
| "type": "string", | ||
| "description": "Name of the evaluation" | ||
| }, | ||
| "metric_config": { | ||
| "type": "object", | ||
| "description": "Details about the metric", | ||
| "required": [ | ||
| "lower_is_better" | ||
| ], | ||
| "properties": { | ||
| "evaluation_description": { | ||
| "type": "string", | ||
| "description": "Description of the evaluation" | ||
| }, | ||
| "lower_is_better": { | ||
| "type": "boolean", | ||
| "description": "Whether a lower score is better" | ||
| }, | ||
| "score_type": { | ||
| "type": "string", | ||
| "description": "Type of score", | ||
| "enum": [ | ||
| "binary", | ||
| "continuous", | ||
| "levels" | ||
| ] | ||
| }, | ||
| "score_level_names": { | ||
| "type": "array", | ||
| "description": "Names of the score levels", | ||
| "items": { | ||
| "type": "string" | ||
| } | ||
| }, | ||
| "min_score": { | ||
| "type": "number", | ||
| "description": "Minimum possible score" | ||
| }, | ||
| "max_score": { | ||
| "type": "number", | ||
| "description": "Maximum possible score" | ||
| } | ||
| } | ||
| }, | ||
| "score_details": { | ||
| "type": "string", | ||
| "description": "The score for the evaluation and related details", | ||
| "required": [ | ||
| "score" | ||
| ], | ||
| "properties": { | ||
| "score": { | ||
| "type": "number", | ||
|
akornilotrust marked this conversation as resolved.
|
||
| "description": "The score for the evaluation" | ||
              },
              "details": {
                "type": "string",
                "description": "Any additional details about the score"
              }
            }
          },
          "sample_level_data": {
            "type": "array",
            "description": "Sample level results for items used in evaluation",
            "items": {
              "type": "object",
              "required": [
                "sample_id",
                "score"
              ],
              "properties": {
                "sample_id": {
                  "type": "string",
                  "description": "Unique identifier for the sample"
                },
                "score": {
                  "type": "number",
                  "description": "Score for the sample"
                }
              }
            }
          },
          "generation_config": {
            "type": "object",
            "description": "Details about how the scores were generated",
            "properties": {
| "prompt_config": { | ||
| "type": "object", | ||
| "description": "Configuration of the prompt template and formatting", | ||
| "required": [ | ||
| "prompt_class" | ||
| ], | ||
| "properties": { | ||
| "prompt_class": { | ||
| "type": "string", | ||
| "description": "Type of task and its formatting requirements", | ||
| "enum": [ | ||
| "MultipleChoice", | ||
| "OpenEnded", | ||
| "Completion" | ||
| ] | ||
| }, | ||
| "dimensions": { | ||
| "type": "object", | ||
| "description": "Format-specific configuration dimensions", | ||
| "required": [ | ||
| "choices_order", | ||
| "enumerator", | ||
| "instruction_phrasing", | ||
| "separator", | ||
| "shots" | ||
| ], | ||
| "properties": { | ||
| "choices_order": { | ||
| "type": "object", | ||
| "required": [ | ||
| "method", | ||
| "description" | ||
| ], | ||
| "properties": { | ||
| "method": { | ||
| "type": "string", | ||
| "description": "The method to use for ordering choices" | ||
| }, | ||
| "description": { | ||
| "type": "string", | ||
| "description": "Detailed explanation of the ordering method" | ||
| } | ||
| } | ||
| }, | ||
| "demonstrations": { | ||
| "type": "array", | ||
| "description": "Array of demonstration examples used in few-shot prompting", | ||
| "default": [] | ||
| }, | ||
| "enumerator": { | ||
| "type": "string", | ||
| "description": "Style of enumeration for multiple choice options", | ||
| "enum": [ | ||
| "capitals", | ||
| "lowercase", | ||
| "numbers", | ||
| "roman", | ||
| "keyboard", | ||
| "greek" | ||
| ] | ||
| }, | ||
| "instruction_phrasing": { | ||
| "type": "object", | ||
| "required": [ | ||
| "name", | ||
| "text" | ||
| ], | ||
| "properties": { | ||
| "name": { | ||
| "type": "string", | ||
| "description": "Name of the instruction template" | ||
| }, | ||
| "text": { | ||
| "type": "string", | ||
| "description": "Template text with placeholders for question and choices (or more)" | ||
| } | ||
| } | ||
| }, | ||
| "separator": { | ||
| "type": "string", | ||
| "description": "Character(s) used to separate multiple choice options", | ||
| "enum": [ | ||
| "\\s", | ||
| "\n", | ||
| ", ", | ||
| "; ", | ||
| " | ", | ||
| " OR ", | ||
| " or " | ||
| ] | ||
| }, | ||
| "shots": { | ||
| "type": "integer", | ||
| "description": "Number of examples provided in the prompt", | ||
| "minimum": 0, | ||
| "maximum": 10 | ||
| } | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "evaluation_method": { | ||
| "type": "object", | ||
| "description": "Evaluation metrics and ground truth", | ||
| "required": [ | ||
| "evaluation_method" | ||
| ], | ||
| "properties": { | ||
| "evaluation_method": { | ||
| "type": "object", | ||
| "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.", | ||
| "properties": { | ||
| "method_name": { | ||
| "type": "string", | ||
| "description": "Name of the evaluation method. Can be a predefined method or a user-defined method." | ||
| }, | ||
| "description": { | ||
| "type": "string", | ||
| "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required." | ||
| }, | ||
| "parameters": { | ||
| "type": "object", | ||
| "description": "Optional parameters used by the evaluation method. Allows custom configuration.", | ||
| "additionalProperties": true | ||
| } | ||
| }, | ||
| "required": [ | ||
| "method_name", | ||
| "description" | ||
| ], | ||
| "if": { | ||
| "properties": { | ||
| "method_name": { | ||
| "enum": [ | ||
| "label_only_match", | ||
| "content_similarity" | ||
| ] | ||
| } | ||
| } | ||
| }, | ||
| "then": { | ||
| "properties": { | ||
| "description": { | ||
| "type": "string", | ||
| "enum": [ | ||
| "Compares only the choice identifier/label to evaluate the response.", | ||
| "Finds the most similar answer among the given choices by comparing the textual content" | ||
| ] | ||
| } | ||
| } | ||
| }, | ||
| "else": { | ||
| "properties": { | ||
| "description": { | ||
| "type": "string", | ||
| "description": "Explanation of the custom evaluation method." | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
    }
  }
}
```
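For a concrete sense of the shape this schema enforces, here is a minimal record that should validate against the draft above; it is a sketch, and every identifier, URL, and score in it is invented for illustration (JSON has no comments, so the hedging lives here):

```json
{
  "schema_version": "0.0.1",
  "evaluation_id": "mmlu-2025-10-01-run1",
  "model_info": {
    "name": "Llama-2-13b-chat-hf",
    "source_url": "https://huggingface.co/meta-llama/Llama-2-13b-chat-hf",
    "developer": "Meta",
    "inference_settings": {
      "quantization_method": "GPTQ",
      "generation_args": {
        "temperature": 0.0,
        "top_p": null,
        "top_k": null,
        "max_tokens": 256
      }
    }
  },
  "evaluation_results": [
    {
      "evaluation_name": "mmlu",
      "metric_config": {
        "lower_is_better": false,
        "score_type": "continuous",
        "min_score": 0.0,
        "max_score": 1.0
      },
      "score_details": {
        "score": 0.62,
        "details": "Micro-average over all samples"
      },
      "sample_level_data": [
        { "sample_id": "mmlu-000001", "score": 1.0 },
        { "sample_id": "mmlu-000002", "score": 0.0 }
      ]
    }
  ]
}
```

Only the four top-level required fields plus the per-result required fields are strictly needed; everything else above is optional under the schema.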
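The if/then/else block on method_name is the least obvious part of the draft: when method_name is one of the two predefined methods, description is constrained to one of two fixed strings, and otherwise any explanatory string is accepted. Below is a sketch of a generation_config entry exercising the predefined label_only_match path; the template text, ordering method, and shot count are invented for illustration:

```json
{
  "prompt_config": {
    "prompt_class": "MultipleChoice",
    "dimensions": {
      "choices_order": {
        "method": "random",
        "description": "Choices are shuffled with a fixed seed"
      },
      "enumerator": "capitals",
      "instruction_phrasing": {
        "name": "basic_mc",
        "text": "Answer the following question.\n{question}\n{choices}"
      },
      "separator": "\n",
      "shots": 5
    }
  },
  "evaluation_method": {
    "evaluation_method": {
      "method_name": "label_only_match",
      "description": "Compares only the choice identifier/label to evaluate the response."
    }
  }
}
```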