This repository was archived by the owner on Mar 12, 2026. It is now read-only.
Proposed New Leaderboard Schema #25
Merged: damian1996 merged 7 commits into evaleval:main from akornilotrust:akornilo/leaderboard-schema on Oct 1, 2025.
Changes from 6 commits
Commits:
- 3abe94e: proposed new leaderboard schema
- a189e5e: Add config for sample level data
- 2dbf00e: fix indent + add score level names
- 8ceca24: remove some unused keys
- 64b190e: Additional cleanup
- ccd05dd: add Python schema
- c6f56ca: updated schema
New file (+354 lines):

```json
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "version": "0.0.1",
  "type": "object",
  "description": "Schema for storing and validating LLM evaluation data, including model configuration, prompts, instances, outputs, and evaluation metrics",
  "required": [
    "schema_version",
    "evaluation_id",
    "model_info",
    "evaluation_results"
  ],
  "properties": {
    "schema_version": {
      "type": "string",
      "description": "Version of the schema used for this evaluation data"
    },
    "evaluation_id": {
      "type": "string",
      "description": "Unique identifier for this specific evaluation run"
    },
    "model_info": {
      "type": "object",
      "description": "Complete model specification, including basic information, technical configuration, and inference settings",
      "required": [
        "name",
        "source_url"
      ],
      "properties": {
        "name": {
          "type": "string",
          "description": "Model name and version (e.g., 'Llama-2-13b-chat-hf')"
        },
        "source_url": {
          "type": "string",
          "description": "URL for the source of the evaluation data"
        },
        "provider_name": {
          "type": "string",
          "description": "Name of the provider of the model version used during evaluation"
        },
        "developer": {
          "type": "string",
          "description": "Name of the organization that developed the model (e.g., 'OpenAI')"
        },
        "inference_settings": {
          "type": "object",
          "description": "Runtime settings and parameters for model inference; controls how the model generates outputs and performs during execution",
          "properties": {
            "quantization_method": {
              "type": "string",
              "description": "Quantization method used for the model (e.g., GPTQ)"
            },
            "generation_args": {
| "type": "object", | ||
| "properties": { | ||
| "temperature": { | ||
| "type": [ | ||
| "null", | ||
| "number" | ||
| ], | ||
| "description": "Sampling temperature" | ||
| }, | ||
| "top_p": { | ||
| "type": [ | ||
| "null", | ||
| "number" | ||
| ], | ||
| "description": "Nucleus sampling parameter" | ||
| }, | ||
| "top_k": { | ||
| "type": [ | ||
| "null", | ||
| "number" | ||
| ], | ||
| "description": "Top-k sampling parameter" | ||
| }, | ||
| "max_tokens": { | ||
| "type": "integer", | ||
| "minimum": 1, | ||
| "description": "Maximum number of tokens to generate" | ||
| } | ||
| }, | ||
| "additionalProperties": true | ||
| } | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "evaluation_results": { | ||
| "type": "array", | ||
| "description": "Array of evaluation results", | ||
| "items": { | ||
| "type": "object", | ||
| "required": [ | ||
| "evaluation_name", | ||
| "metric_config", | ||
| "score_details" | ||
| ], | ||
| "properties": { | ||
| "evaluation_name": { | ||
| "type": "string", | ||
| "description": "Name of the evaluation" | ||
| }, | ||
| "metric_config": { | ||
| "type": "object", | ||
| "description": "Details about the metric", | ||
| "required": [ | ||
| "lower_is_better" | ||
| ], | ||
| "properties": { | ||
| "evaluation_description": { | ||
| "type": "string", | ||
| "description": "Description of the evaluation" | ||
| }, | ||
| "lower_is_better": { | ||
| "type": "boolean", | ||
| "description": "Whether a lower score is better" | ||
| }, | ||
| "score_type": { | ||
| "type": "string", | ||
| "description": "Type of score", | ||
| "enum": [ | ||
| "binary", | ||
| "continuous", | ||
| "levels" | ||
| ] | ||
| }, | ||
| "score_level_names": { | ||
| "type": "array", | ||
| "description": "Names of the score levels", | ||
| "items": { | ||
| "type": "string" | ||
| } | ||
| }, | ||
| "min_score": { | ||
| "type": "number", | ||
| "description": "Minimum possible score" | ||
| }, | ||
| "max_score": { | ||
| "type": "number", | ||
| "description": "Maximum possible score" | ||
| } | ||
| } | ||
| }, | ||
| "score_details": { | ||
| "type": "string", | ||
| "description": "The score for the evaluation and related details", | ||
| "required": [ | ||
| "score" | ||
| ], | ||
| "properties": { | ||
| "score": { | ||
| "type": "number", | ||
|
akornilotrust marked this conversation as resolved.
|
||
| "description": "The score for the evaluation" | ||
              },
              "details": {
                "type": "string",
                "description": "Any additional details about the score"
              }
            }
          },
          "sample_level_data": {
            "type": "array",
            "description": "Sample level results for items used in evaluation",
            "items": {
              "type": "object",
              "required": [
                "sample_id",
                "score"
              ],
              "properties": {
                "sample_id": {
                  "type": "string",
                  "description": "Unique identifier for the sample"
                },
                "score": {
                  "type": "number",
                  "description": "Score for the sample"
                }
              }
            }
          },
          "generation_config": {
            "type": "object",
            "description": "Details about how the scores were generated",
            "properties": {
| "prompt_config": { | ||
| "type": "object", | ||
| "description": "Configuration of the prompt template and formatting", | ||
| "required": [ | ||
| "prompt_class" | ||
| ], | ||
| "properties": { | ||
| "prompt_class": { | ||
| "type": "string", | ||
| "description": "Type of task and its formatting requirements", | ||
| "enum": [ | ||
| "MultipleChoice", | ||
| "OpenEnded", | ||
| "Completion" | ||
| ] | ||
| }, | ||
| "dimensions": { | ||
| "type": "object", | ||
| "description": "Format-specific configuration dimensions", | ||
| "required": [ | ||
| "choices_order", | ||
| "enumerator", | ||
| "instruction_phrasing", | ||
| "separator", | ||
| "shots" | ||
| ], | ||
| "properties": { | ||
| "choices_order": { | ||
| "type": "object", | ||
| "required": [ | ||
| "method", | ||
| "description" | ||
| ], | ||
| "properties": { | ||
| "method": { | ||
| "type": "string", | ||
| "description": "The method to use for ordering choices" | ||
| }, | ||
| "description": { | ||
| "type": "string", | ||
| "description": "Detailed explanation of the ordering method" | ||
| } | ||
| } | ||
| }, | ||
| "demonstrations": { | ||
| "type": "array", | ||
| "description": "Array of demonstration examples used in few-shot prompting", | ||
| "default": [] | ||
| }, | ||
| "enumerator": { | ||
| "type": "string", | ||
| "description": "Style of enumeration for multiple choice options", | ||
| "enum": [ | ||
| "capitals", | ||
| "lowercase", | ||
| "numbers", | ||
| "roman", | ||
| "keyboard", | ||
| "greek" | ||
| ] | ||
| }, | ||
| "instruction_phrasing": { | ||
| "type": "object", | ||
| "required": [ | ||
| "name", | ||
| "text" | ||
| ], | ||
| "properties": { | ||
| "name": { | ||
| "type": "string", | ||
| "description": "Name of the instruction template" | ||
| }, | ||
| "text": { | ||
| "type": "string", | ||
| "description": "Template text with placeholders for question and choices (or more)" | ||
| } | ||
| } | ||
| }, | ||
| "separator": { | ||
| "type": "string", | ||
| "description": "Character(s) used to separate multiple choice options", | ||
| "enum": [ | ||
| "\\s", | ||
| "\n", | ||
| ", ", | ||
| "; ", | ||
| " | ", | ||
| " OR ", | ||
| " or " | ||
| ] | ||
| }, | ||
| "shots": { | ||
| "type": "integer", | ||
| "description": "Number of examples provided in the prompt", | ||
| "minimum": 0, | ||
| "maximum": 10 | ||
| } | ||
| } | ||
| } | ||
| } | ||
| }, | ||
| "evaluation_method": { | ||
| "type": "object", | ||
| "description": "Evaluation metrics and ground truth", | ||
| "required": [ | ||
| "evaluation_method" | ||
| ], | ||
| "properties": { | ||
| "evaluation_method": { | ||
| "type": "object", | ||
| "description": "Method used to evaluate the answer, including predefined methods and user-defined methods.", | ||
| "properties": { | ||
| "method_name": { | ||
| "type": "string", | ||
| "description": "Name of the evaluation method. Can be a predefined method or a user-defined method." | ||
| }, | ||
| "description": { | ||
| "type": "string", | ||
| "description": "Detailed explanation of how the evaluation method works. For user-defined methods, this is required." | ||
| }, | ||
| "parameters": { | ||
| "type": "object", | ||
| "description": "Optional parameters used by the evaluation method. Allows custom configuration.", | ||
| "additionalProperties": true | ||
| } | ||
| }, | ||
| "required": [ | ||
| "method_name", | ||
| "description" | ||
| ], | ||
| "if": { | ||
| "properties": { | ||
| "method_name": { | ||
| "enum": [ | ||
| "label_only_match", | ||
| "content_similarity" | ||
| ] | ||
| } | ||
| } | ||
| }, | ||
| "then": { | ||
| "properties": { | ||
| "description": { | ||
| "type": "string", | ||
| "enum": [ | ||
| "Compares only the choice identifier/label to evaluate the response.", | ||
| "Finds the most similar answer among the given choices by comparing the textual content" | ||
| ] | ||
| } | ||
| } | ||
| }, | ||
| "else": { | ||
| "properties": { | ||
| "description": { | ||
| "type": "string", | ||
| "description": "Explanation of the custom evaluation method." | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
    }
  }
}
```
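For a concrete sense of the shape this schema enforces, here is a minimal record that should validate against the draft above; it is a sketch, and every identifier, URL, and score in it is invented for illustration (JSON has no comments, so the hedging lives here):

```json
{
  "schema_version": "0.0.1",
  "evaluation_id": "mmlu-2025-10-01-run1",
  "model_info": {
    "name": "Llama-2-13b-chat-hf",
    "source_url": "https://huggingface.co/meta-llama/Llama-2-13b-chat-hf",
    "developer": "Meta",
    "inference_settings": {
      "quantization_method": "GPTQ",
      "generation_args": {
        "temperature": 0.0,
        "top_p": null,
        "top_k": null,
        "max_tokens": 256
      }
    }
  },
  "evaluation_results": [
    {
      "evaluation_name": "mmlu",
      "metric_config": {
        "lower_is_better": false,
        "score_type": "continuous",
        "min_score": 0.0,
        "max_score": 1.0
      },
      "score_details": {
        "score": 0.62,
        "details": "Micro-average over all samples"
      },
      "sample_level_data": [
        { "sample_id": "mmlu-000001", "score": 1.0 },
        { "sample_id": "mmlu-000002", "score": 0.0 }
      ]
    }
  ]
}
```

Only the four top-level required fields plus the per-result required fields are strictly needed; everything else above is optional under the schema.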
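The if/then/else block on method_name is the least obvious part of the draft: when method_name is one of the two predefined methods, description is constrained to one of two fixed strings, and otherwise any explanatory string is accepted. Below is a sketch of a generation_config entry exercising the predefined label_only_match path; the template text, ordering method, and shot count are invented for illustration:

```json
{
  "prompt_config": {
    "prompt_class": "MultipleChoice",
    "dimensions": {
      "choices_order": {
        "method": "random",
        "description": "Choices are shuffled with a fixed seed"
      },
      "enumerator": "capitals",
      "instruction_phrasing": {
        "name": "basic_mc",
        "text": "Answer the following question.\n{question}\n{choices}"
      },
      "separator": "\n",
      "shots": 5
    }
  },
  "evaluation_method": {
    "evaluation_method": {
      "method_name": "label_only_match",
      "description": "Compares only the choice identifier/label to evaluate the response."
    }
  }
}
```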