Skip to content
This repository was archived by the owner on Mar 12, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
354 changes: 354 additions & 0 deletions schema/leaderboard.schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,354 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"version": "0.0.1",
"type": "object",
"description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
"required": [
"schema_version",
"evaluation_id",
"model_info",
"evaluation_results"
],
"properties": {
"schema_version": {
"type": "string",
"description": "Version of the schema used for this evaluation data"
},
"evaluation_id": {
"type": "string",
"description": "Unique identifier for this specific evaluation run"
},
"model_info": {
"type": "object",
"description": "Complete model specification including basic information, technical configuration and inference settings",
"required": [
"name",
"source_url"
],
"properties": {
"name": {
"type": "string",
"description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')"
},
"source_url": {
"type": "string",
"description": "URL for the source of the evaluation data"
},
"provider_name": {
Comment thread
damian1996 marked this conversation as resolved.
"type": "string",
"description": "Name of the provider for the version of the model used during evaluation."
},
"developer": {
"type": "string",
"description": "Name of the organization that developed the model (e.g., 'OpenAI')"
},
"inference_settings": {
"type": "object",
"description": "Runtime settings and parameters for model inference - controls how the model generates outputs and performs during execution",
"properties": {
"quantization_method": {
Comment thread
damian1996 marked this conversation as resolved.
Outdated
"type": "string",
"description": "Quantization method used for the model (e.g., GPTQ)"
},
"generation_args": {
Comment thread
damian1996 marked this conversation as resolved.
Outdated
"type": "object",
"properties": {
"temperature": {
"type": [
"null",
"number"
],
"description": "Sampling temperature"
},
"top_p": {
"type": [
"null",
"number"
],
"description": "Nucleus sampling parameter"
},
"top_k": {
"type": [
"null",
"number"
],
"description": "Top-k sampling parameter"
},
"max_tokens": {
"type": "integer",
"minimum": 1,
"description": "Maximum number of tokens to generate"
}
},
"additionalProperties": true
}
}
}
}
},
"evaluation_results": {
"type": "array",
"description": "Array of evaluation results",
"items": {
"type": "object",
"required": [
"evaluation_name",
"metric_config",
"score_details"
],
"properties": {
"evaluation_name": {
"type": "string",
"description": "Name of the evaluation"
},
"metric_config": {
"type": "object",
"description": "Details about the metric",
"required": [
"lower_is_better"
],
"properties": {
"evaluation_description": {
"type": "string",
"description": "Description of the evaluation"
},
"lower_is_better": {
"type": "boolean",
"description": "Whether a lower score is better"
},
"score_type": {
"type": "string",
"description": "Type of score",
"enum": [
"binary",
"continuous",
"levels"
]
},
"score_level_names": {
"type": "array",
"description": "Names of the score levels",
"items": {
"type": "string"
}
},
"min_score": {
"type": "number",
"description": "Minimum possible score"
},
"max_score": {
"type": "number",
"description": "Maximum possible score"
}
}
},
"score_details": {
  "type": "object",
  "description": "The score for the evaluation and related details",
  "required": [
    "score"
  ],
  "properties": {
    "score": {
      "type": "number",
      "description": "The score for the evaluation"
    },
    "details": {
      "type": "string",
      "description": "Any additional details about the score"
    }
  }
},
"sample_level_data": {
"type": "array",
"description": "Sample level results for items used in evaluation",
"items": {
"type": "object",
"required": [
"sample_id",
"score"
],
"properties": {
"sample_id": {
"type": "string",
"description": "Unique identifier for the sample"
},
"score": {
"type": "number",
"description": "Score for the sample"
}
}
}
},
"generation_config": {
  "type": "object",
  "description": "Details about how the scores were generated",
  "properties": {
    "prompt_config": {
      "type": "object",
      "description": "Configuration of the prompt template and formatting",
      "required": [
        "prompt_class"
      ],
      "properties": {
        "prompt_class": {
          "type": "string",
          "description": "Type of task and its formatting requirements",
          "enum": [
            "MultipleChoice",
            "OpenEnded",
            "Completion"
          ]
        },
        "dimensions": {
          "type": "object",
          "description": "Format-specific configuration dimensions",
          "required": [
            "choices_order",
            "enumerator",
            "instruction_phrasing",
            "separator",
            "shots"
          ],
          "properties": {
            "choices_order": {
              "type": "object",
              "required": [
                "method",
                "description"
              ],
              "properties": {
                "method": {
                  "type": "string",
                  "description": "The method to use for ordering choices"
                },
                "description": {
                  "type": "string",
                  "description": "Detailed explanation of the ordering method"
                }
              }
            },
            "demonstrations": {
              "type": "array",
              "description": "Array of demonstration examples used in few-shot prompting",
              "default": []
            },
            "enumerator": {
              "type": "string",
              "description": "Style of enumeration for multiple choice options",
              "enum": [
                "capitals",
                "lowercase",
                "numbers",
                "roman",
                "keyboard",
                "greek"
              ]
            },
            "instruction_phrasing": {
              "type": "object",
              "required": [
                "name",
                "text"
              ],
              "properties": {
                "name": {
                  "type": "string",
                  "description": "Name of the instruction template"
                },
                "text": {
                  "type": "string",
                  "description": "Template text with placeholders for question and choices (or more)"
                }
              }
            },
            "separator": {
              "type": "string",
              "description": "Character(s) used to separate multiple choice options",
              "enum": [
                "\\s",
                "\n",
                ", ",
                "; ",
                " | ",
                " OR ",
                " or "
              ]
            },
            "shots": {
              "type": "integer",
              "description": "Number of examples provided in the prompt",
              "minimum": 0,
              "maximum": 10
            }
          }
        }
      }
    },
    "evaluation_method": {
      "type": "object",
      "description": "Evaluation metrics and ground truth",
      "required": [
        "evaluation_method"
      ],
      "properties": {
        "evaluation_method": {
          "type": "object",
          "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.",
          "properties": {
            "method_name": {
              "type": "string",
              "description": "Name of the evaluation method. Can be a predefined method or a user-defined method."
            },
            "description": {
              "type": "string",
              "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required."
            },
            "parameters": {
              "type": "object",
              "description": "Optional parameters used by the evaluation method. Allows custom configuration.",
              "additionalProperties": true
            }
          },
          "required": [
            "method_name",
            "description"
          ],
          "if": {
            "properties": {
              "method_name": {
                "enum": [
                  "label_only_match",
                  "content_similarity"
                ]
              }
            }
          },
          "then": {
            "properties": {
              "description": {
                "type": "string",
                "enum": [
                  "Compares only the choice identifier/label to evaluate the response.",
                  "Finds the most similar answer among the given choices by comparing the textual content"
                ]
              }
            }
          },
          "else": {
            "properties": {
              "description": {
                "type": "string",
                "description": "Explanation of the custom evaluation method."
              }
            }
          }
        }
      }
    }
  }
}
}
}

}
}
}
Loading