From 6f38bdc21995e25f5b97bf05f2ba678efc0fff31 Mon Sep 17 00:00:00 2001 From: BradenBug Date: Tue, 16 Jun 2026 15:54:27 -0700 Subject: [PATCH 1/4] feat: hash-guarded dataset versioning and release scaffolding --- cli/generator.py | 6 +- src/benchmark_service/app.py | 3 +- src/benchmark_service/base.py | 17 ++++- src/benchmark_service/dataset_versioning.py | 78 +++++++++++++++++++++ src/benchmark_service/schemas.py | 1 + templates/pyproject.toml.jinja | 7 +- tests/test_client.py | 1 + tests/test_dataset_versioning.py | 76 ++++++++++++++++++++ tests/test_generator.py | 6 +- tests/test_version.py | 26 +++++++ 10 files changed, 208 insertions(+), 13 deletions(-) create mode 100644 src/benchmark_service/dataset_versioning.py create mode 100644 tests/test_dataset_versioning.py diff --git a/cli/generator.py b/cli/generator.py index bad8d5f..488fe7e 100644 --- a/cli/generator.py +++ b/cli/generator.py @@ -169,11 +169,7 @@ def generate_project( shutil.copytree( root / ".github", output_dir / ".github", - ignore=shutil.ignore_patterns( - "auto-tag-release.yaml", - "check-pr-title.yaml", - "cli-integration.yaml", - ), + ignore=shutil.ignore_patterns("cli-integration.yaml"), ) # Create empty tests directory diff --git a/src/benchmark_service/app.py b/src/benchmark_service/app.py index 0dfd3f2..0ded0a7 100644 --- a/src/benchmark_service/app.py +++ b/src/benchmark_service/app.py @@ -197,11 +197,12 @@ def _current_service_version(self) -> str | None: service_override = self.service.get_service_version() return service_override or self._service_version - async def _version(self) -> VersionResponse: + async def _version(self, dataset: str | None = None) -> VersionResponse: return VersionResponse( framework_version=_framework_version, service_name=self._service_name, service_version=self._current_service_version(), + dataset_version=self.service.get_dataset_version(dataset), ) async def _authorize_websocket(self, websocket: WebSocket) -> str | None: diff --git 
a/src/benchmark_service/base.py b/src/benchmark_service/base.py index 296efea..1194bd1 100644 --- a/src/benchmark_service/base.py +++ b/src/benchmark_service/base.py @@ -7,7 +7,8 @@ from abc import ABC, abstractmethod from collections.abc import AsyncGenerator -from typing import Any, Self +from pathlib import Path +from typing import Any, ClassVar, Self from benchmark_service.auth import ( LEGACY_TENANT_SENTINEL, @@ -15,6 +16,7 @@ load_allowlist, resolve_caller_tenant, ) +from benchmark_service.dataset_versioning import DatasetVersionEntry, load_verified_dataset_versions from benchmark_service.sandbox import Sandbox from benchmark_service.schemas import ( EvaluateResponseRequest, @@ -36,10 +38,20 @@ class BenchmarkService(ABC): datasets: dict[str, dict[str, Any]] + # When set, dataset versions are loaded and content-verified at startup and + # served by get_dataset_version(); a checksum mismatch aborts startup. + dataset_versions_file: ClassVar[Path | None] = None + dataset_versions: dict[str, DatasetVersionEntry] + @classmethod async def create(cls) -> Self: """Factory method to create and initialize a benchmark service.""" instance = cls.__new__(cls) + instance.dataset_versions = ( + load_verified_dataset_versions(cls.dataset_versions_file) + if cls.dataset_versions_file is not None + else {} + ) instance.datasets = await instance.load_datasets() return instance @@ -104,7 +116,8 @@ def get_service_version(self) -> str | None: def get_dataset_version(self, dataset: str | None = None) -> str | None: """Return the version for `dataset`, if this benchmark tracks one.""" - return None + entry = self.dataset_versions.get(dataset or "default") + return entry.version if entry is not None else None def get_dataset(self, dataset: str | None = None) -> dict[str, Any]: """Get a specific dataset by name. 
Defaults to 'default'.""" diff --git a/src/benchmark_service/dataset_versioning.py b/src/benchmark_service/dataset_versioning.py new file mode 100644 index 0000000..3dd3493 --- /dev/null +++ b/src/benchmark_service/dataset_versioning.py @@ -0,0 +1,78 @@ +"""Hash-guarded dataset version tracking. + +A dataset_versions.yaml next to the dataset files maps each dataset name to a +human-assigned semver and a sha256 of its content file. Version semantics: +major = scores not comparable, minor = additive, patch = non-scoring fixes. +""" + +import hashlib +import sys +from pathlib import Path + +import yaml +from pydantic import BaseModel + + +class DatasetVersionError(Exception): + """Dataset content does not match its declared version entry.""" + + +class DatasetVersionEntry(BaseModel): + file: str + version: str + sha256: str + + +def compute_checksum(path: Path) -> str: + return hashlib.sha256(path.read_bytes()).hexdigest() + + +def load_dataset_versions(versions_file: Path) -> dict[str, DatasetVersionEntry]: + data = yaml.safe_load(versions_file.read_text()) + return {name: DatasetVersionEntry.model_validate(entry) for name, entry in data.items()} + + +def load_verified_dataset_versions(versions_file: Path) -> dict[str, DatasetVersionEntry]: + """Load entries and verify every dataset file matches its declared checksum. + + Raises DatasetVersionError on any mismatch: content that does not match its + declared version must never be served. 
+ """ + entries = load_dataset_versions(versions_file) + data_dir = versions_file.parent + mismatches: list[str] = [] + for name, entry in entries.items(): + actual = compute_checksum(data_dir / entry.file) + if actual != entry.sha256: + mismatches.append(f"{name} ({entry.file}): declared {entry.sha256}, actual {actual}") + if mismatches: + raise DatasetVersionError( + "dataset content does not match dataset_versions.yaml — bump the version, then run " + "`python -m benchmark_service.dataset_versioning update <versions_file>`:\n " + "\n ".join(mismatches) + ) + return entries + + +def main(argv: list[str]) -> int: + if len(argv) != 2 or argv[0] not in ("check", "update"): + print("usage: python -m benchmark_service.dataset_versioning {check|update} <versions_file>") + return 2 + command, versions_file = argv[0], Path(argv[1]) + if command == "check": + try: + load_verified_dataset_versions(versions_file) + except DatasetVersionError as exc: + print(exc) + return 1 + print("dataset checksums OK") + return 0 + raw = yaml.safe_load(versions_file.read_text()) + for entry in raw.values(): + entry["sha256"] = compute_checksum(versions_file.parent / entry["file"]) + versions_file.write_text(yaml.safe_dump(raw, sort_keys=False)) + print(f"updated {versions_file}") + return 0 + + +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) diff --git a/src/benchmark_service/schemas.py b/src/benchmark_service/schemas.py index 56752be..45fb89f 100644 --- a/src/benchmark_service/schemas.py +++ b/src/benchmark_service/schemas.py @@ -167,6 +167,7 @@ class VersionResponse(BaseModel): framework_version: str service_name: str | None = None service_version: str | None = None + dataset_version: str | None = None class StreamMessageChunk(BaseModel): diff --git a/templates/pyproject.toml.jinja b/templates/pyproject.toml.jinja index 7eff3b9..187784c 100644 --- a/templates/pyproject.toml.jinja +++ b/templates/pyproject.toml.jinja @@ -1,6 +1,6 @@ [project] name = "{{ benchmark_name }}-benchmark-service" -version =
"0.1.0" +dynamic = ["version"] readme = "README.md" requires-python = ">=3.12" @@ -17,9 +17,12 @@ dev = [ ] [build-system] -requires = ["hatchling"] +requires = ["hatchling", "hatch-vcs"] build-backend = "hatchling.build" +[tool.hatch.version] +source = "vcs" + [tool.hatch.build.targets.wheel] packages = ["src/{{ benchmark_package }}"] diff --git a/tests/test_client.py b/tests/test_client.py index 005fec3..be0e61c 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -59,6 +59,7 @@ def test_sandbox_config_rejects_unknown_provider(monkeypatch: pytest.MonkeyPatch "framework_version": "0.7.4", "service_name": "legal-research-benchmark-service", "service_version": "1.2.3", + "dataset_version": "3.0.0", }, ), ( diff --git a/tests/test_dataset_versioning.py b/tests/test_dataset_versioning.py new file mode 100644 index 0000000..069c60d --- /dev/null +++ b/tests/test_dataset_versioning.py @@ -0,0 +1,76 @@ +"""Tests for hash-guarded dataset version tracking.""" + +from pathlib import Path + +import pytest +import yaml + +from benchmark_service.dataset_versioning import ( + DatasetVersionError, + compute_checksum, + load_verified_dataset_versions, + main, +) + +from tests.conftest import StubBenchmark + + +def _write_fixture(tmp_path: Path, content: bytes = b'{"tests": []}') -> Path: + data_file = tmp_path / "validation.json" + data_file.write_bytes(content) + versions_file = tmp_path / "dataset_versions.yaml" + versions_file.write_text( + yaml.safe_dump( + { + "validation": { + "file": "validation.json", + "version": "1.0.0", + "sha256": compute_checksum(data_file), + } + } + ) + ) + return versions_file + + +def test_load_verified_returns_entries_when_content_matches(tmp_path: Path) -> None: + versions_file = _write_fixture(tmp_path) + entries = load_verified_dataset_versions(versions_file) + assert entries["validation"].version == "1.0.0" + + +def test_load_verified_raises_on_content_mismatch(tmp_path: Path) -> None: + versions_file = _write_fixture(tmp_path) + 
(tmp_path / "validation.json").write_bytes(b'{"tests": [1]}') + with pytest.raises(DatasetVersionError, match="validation"): + load_verified_dataset_versions(versions_file) + + +def test_check_command_fails_on_mismatch_and_update_repairs(tmp_path: Path) -> None: + versions_file = _write_fixture(tmp_path) + (tmp_path / "validation.json").write_bytes(b'{"tests": [1]}') + assert main(["check", str(versions_file)]) == 1 + assert main(["update", str(versions_file)]) == 0 + assert main(["check", str(versions_file)]) == 0 + + +async def test_service_startup_verifies_and_serves_dataset_versions(tmp_path: Path) -> None: + versions_file = _write_fixture(tmp_path) + + class VersionedBenchmark(StubBenchmark): + dataset_versions_file = versions_file + + service = await VersionedBenchmark.create() + assert service.get_dataset_version("validation") == "1.0.0" + assert service.get_dataset_version("unknown") is None + + +async def test_service_startup_fails_on_checksum_mismatch(tmp_path: Path) -> None: + versions_file = _write_fixture(tmp_path) + (tmp_path / "validation.json").write_bytes(b"tampered") + + class VersionedBenchmark(StubBenchmark): + dataset_versions_file = versions_file + + with pytest.raises(DatasetVersionError): + await VersionedBenchmark.create() diff --git a/tests/test_generator.py b/tests/test_generator.py index c37992f..033b4e8 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -131,7 +131,7 @@ def test_generates_project_structure() -> None: assert (output_dir / ".github" / "workflows").is_dir() -def test_generated_project_excludes_framework_versioning_workflows(tmp_path: Path) -> None: +def test_generated_project_includes_release_workflows_but_not_cli_integration(tmp_path: Path) -> None: output_dir = tmp_path / "swebench-benchmark-service" generate_project("swebench", output_dir) @@ -139,9 +139,9 @@ def test_generated_project_excludes_framework_versioning_workflows(tmp_path: Pat assert (workflows_dir / "test.yaml").exists() assert 
(workflows_dir / "style.yaml").exists() assert (workflows_dir / "typecheck.yaml").exists() + assert (workflows_dir / "auto-tag-release.yaml").exists() + assert (workflows_dir / "check-pr-title.yaml").exists() assert not (workflows_dir / "cli-integration.yaml").exists() - assert not (workflows_dir / "auto-tag-release.yaml").exists() - assert not (workflows_dir / "check-pr-title.yaml").exists() def test_generated_benchmark_service_implements_task_listing(tmp_path: Path) -> None: diff --git a/tests/test_version.py b/tests/test_version.py index 3c661ba..fde6c90 100644 --- a/tests/test_version.py +++ b/tests/test_version.py @@ -2,6 +2,7 @@ import re from collections.abc import AsyncGenerator +from pathlib import Path from typing import Any from unittest.mock import patch @@ -18,6 +19,8 @@ StreamChunk, ) +from tests.test_dataset_versioning import _write_fixture # pyright: ignore[reportPrivateUsage] + def test_version_is_importable_and_well_formed() -> None: assert isinstance(benchmark_service.__version__, str) @@ -100,3 +103,26 @@ def test_version_endpoint_prefers_service_version_hook() -> None: assert response.status_code == 200 assert response.json()["service_version"] == "service-hook-1.2.3" + + +def test_version_endpoint_reports_dataset_version_key() -> None: + app = BenchmarkServiceApp(_FakeService) + with TestClient(app) as client: + response = client.get("/version", params={"dataset": "default"}) + + assert response.status_code == 200 + assert "dataset_version" in response.json() + + +def test_version_endpoint_reports_tracked_dataset_version(tmp_path: Path) -> None: + versions_file = _write_fixture(tmp_path) + + class _DatasetVersionedService(_FakeService): + dataset_versions_file = versions_file + + app = BenchmarkServiceApp(_DatasetVersionedService) + with TestClient(app) as client: + response = client.get("/version", params={"dataset": "validation"}) + + assert response.status_code == 200 + assert response.json()["dataset_version"] == "1.0.0" From 
102c1c3db714b11b8e2d3b379d55a78d5f12f782 Mon Sep 17 00:00:00 2001 From: BradenBug Date: Thu, 18 Jun 2026 15:20:23 -0700 Subject: [PATCH 2/4] fix: fallback version for scaffolded projects without git metadata --- templates/pyproject.toml.jinja | 3 +++ 1 file changed, 3 insertions(+) diff --git a/templates/pyproject.toml.jinja b/templates/pyproject.toml.jinja index 187784c..4bc48c8 100644 --- a/templates/pyproject.toml.jinja +++ b/templates/pyproject.toml.jinja @@ -22,6 +22,9 @@ build-backend = "hatchling.build" [tool.hatch.version] source = "vcs" +# A freshly scaffolded project has no git tags (or no repo yet); fall back so it +# still builds. Real versions come from tags once the repo is set up. +fallback-version = "0.0.0" [tool.hatch.build.targets.wheel] packages = ["src/{{ benchmark_package }}"] From fc04b776f2a73f94d291e7aac9a9ac1f61c1aa00 Mon Sep 17 00:00:00 2001 From: BradenBug Date: Thu, 18 Jun 2026 17:28:35 -0700 Subject: [PATCH 3/4] fix: clear error when dataset_versions.yaml is empty or not a mapping --- src/benchmark_service/dataset_versioning.py | 16 +++++++++++++--- tests/test_dataset_versioning.py | 8 ++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/benchmark_service/dataset_versioning.py b/src/benchmark_service/dataset_versioning.py index 3dd3493..fcd8f8c 100644 --- a/src/benchmark_service/dataset_versioning.py +++ b/src/benchmark_service/dataset_versioning.py @@ -8,6 +8,7 @@ import hashlib import sys from pathlib import Path +from typing import Any, cast import yaml from pydantic import BaseModel @@ -27,9 +28,18 @@ def compute_checksum(path: Path) -> str: return hashlib.sha256(path.read_bytes()).hexdigest() -def load_dataset_versions(versions_file: Path) -> dict[str, DatasetVersionEntry]: +def _load_versions_mapping(versions_file: Path) -> dict[str, Any]: + """Load the versions file as a mapping; an empty or non-mapping file is a clear error.""" data = yaml.safe_load(versions_file.read_text()) - return {name: 
DatasetVersionEntry.model_validate(entry) for name, entry in data.items()} + if not isinstance(data, dict): + raise DatasetVersionError( + f"{versions_file} must be a YAML mapping of dataset name to entry, got {type(data).__name__}" + ) + return cast("dict[str, Any]", data) + + +def load_dataset_versions(versions_file: Path) -> dict[str, DatasetVersionEntry]: + return {name: DatasetVersionEntry.model_validate(entry) for name, entry in _load_versions_mapping(versions_file).items()} def load_verified_dataset_versions(versions_file: Path) -> dict[str, DatasetVersionEntry]: @@ -66,7 +76,7 @@ def main(argv: list[str]) -> int: return 1 print("dataset checksums OK") return 0 - raw = yaml.safe_load(versions_file.read_text()) + raw = _load_versions_mapping(versions_file) for entry in raw.values(): entry["sha256"] = compute_checksum(versions_file.parent / entry["file"]) versions_file.write_text(yaml.safe_dump(raw, sort_keys=False)) diff --git a/tests/test_dataset_versioning.py b/tests/test_dataset_versioning.py index 069c60d..420e077 100644 --- a/tests/test_dataset_versioning.py +++ b/tests/test_dataset_versioning.py @@ -8,6 +8,7 @@ from benchmark_service.dataset_versioning import ( DatasetVersionError, compute_checksum, + load_dataset_versions, load_verified_dataset_versions, main, ) @@ -46,6 +47,13 @@ def test_load_verified_raises_on_content_mismatch(tmp_path: Path) -> None: load_verified_dataset_versions(versions_file) +def test_empty_versions_file_raises_clear_error(tmp_path: Path) -> None: + versions_file = tmp_path / "dataset_versions.yaml" + versions_file.write_text("# no entries yet\n") + with pytest.raises(DatasetVersionError, match="must be a YAML mapping"): + load_dataset_versions(versions_file) + + def test_check_command_fails_on_mismatch_and_update_repairs(tmp_path: Path) -> None: versions_file = _write_fixture(tmp_path) (tmp_path / "validation.json").write_bytes(b'{"tests": [1]}') From b05e033349006a064c54ce1d45a0b3a907c65e24 Mon Sep 17 00:00:00 2001 From: 
BradenBug Date: Thu, 18 Jun 2026 17:31:19 -0700 Subject: [PATCH 4/4] docs: document release setup (GH_PAT, PR-title bump) in scaffolded README --- templates/README.md.jinja | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/templates/README.md.jinja b/templates/README.md.jinja index 0cd67b7..d218341 100644 --- a/templates/README.md.jinja +++ b/templates/README.md.jinja @@ -23,3 +23,12 @@ make dev # run local server make test # run tests make help # list all commands ``` + +## Releasing + +Versions come from git tags (hatch-vcs). To enable the release flow in this repo: + +- Add a **`GH_PAT`** repository secret — a personal access token with permission to push tags (`contents: write`). `auto-tag-release` uses it to push the version tag on merge to `main`; the workflow fails without it. +- Every PR title must include **`#patch`**, **`#minor`**, or **`#major`** — `check-pr-title` enforces this and the tag is bumped accordingly on merge. + +Until the first tag exists, the package builds at version `0.0.0`.