PrimeIntellect-ai · rasdani · Apr 27, 2026 · Apr 27, 2026 · May 3, 2026 · May 3, 2026
diff --git a/tests/test_composable_env.py b/tests/test_composable_env.py
@@ -2,6 +2,7 @@
 
 import importlib
 import json
+import subprocess
 from types import SimpleNamespace
 from unittest.mock import AsyncMock, call
 
@@ -10,6 +11,7 @@
 import verifiers as vf
 from verifiers.envs.experimental.composable import (
     ComposableEnv,
+    GitPatchCollector,
     Harness,
     SandboxSpec,
     SandboxTaskSet,
@@ -72,6 +74,21 @@ def get_rubric(self):
         return MockMathRubric()
 
 
+class RecordingStateCollector:
+    def __init__(self):
+        self.events = []
+
+    async def post_sandbox_setup(self, env, state):
+        self.events.append(
+            ("post_sandbox_setup", env.taskset.get_workdir(state["info"]))
+        )
+        state["collector_setup"] = True
+
+    async def post_rollout(self, env, state):
+        self.events.append(("post_rollout", env.taskset.get_workdir(state["info"])))
+        state["collector_rollout"] = True
+
+
 def _make_dataset(n=3):
     from datasets import Dataset
 
@@ -263,6 +280,185 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
     assert state["agent_logs"] == "agent log"
 
 
+@pytest.mark.asyncio
+async def test_composable_env_runs_state_collectors():
+    taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
+    collector = RecordingStateCollector()
+    env = ComposableEnv(
+        taskset=taskset,
+        harness=Harness(run_command="true", state_collectors=[collector]),
+    )
+    env.sandbox_client = SimpleNamespace(
+        execute_command=AsyncMock(
+            return_value=SimpleNamespace(stdout="", stderr="", exit_code=0)
+        ),
+        teardown=lambda: None,
+    )
+    env.taskset.setup = AsyncMock()
+    env.upload_content = AsyncMock()
+
+    state = {"sandbox_id": "sbx", "info": {"id": 0}, "timing": {"total_ms": 0}}
+
+    await env.post_sandbox_setup(state)
+    await env.post_rollout(state)
+
+    assert state["collector_setup"] is True
+    assert state["collector_rollout"] is True
+    assert collector.events == [
+        ("post_sandbox_setup", "/testbed"),
+        ("post_rollout", "/testbed"),
+    ]
+
+
+@pytest.mark.asyncio
+async def test_git_patch_collector_snapshots_and_collects_patch_state():
+    taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
+    diff = """diff --git a/tests/test_bug.py b/tests/test_bug.py
+index 1111111..2222222 100644
+--- a/tests/test_bug.py
++++ b/tests/test_bug.py
+@@ -1 +1 @@
+-old
++new
+"""
+    collector = GitPatchCollector()
+    env = SimpleNamespace(
+        taskset=taskset,
+        sandbox_client=SimpleNamespace(
+            execute_command=AsyncMock(
+                side_effect=[
+                    SimpleNamespace(stdout="tree123\n", stderr="", exit_code=0),
+                    SimpleNamespace(stdout=diff, stderr="", exit_code=0),
+                ]
+            )
+        ),
+    )
+
+    state = {"sandbox_id": "sbx", "info": {"id": 0}}
+
+    await collector.post_sandbox_setup(env, state)
+    await collector.post_rollout(env, state)
+
+    assert state[collector._base_tree_key] == "tree123"
+    assert state[collector._workdir_key] == "/testbed"
+    assert state["agent_patch"] == diff
+    snapshot_call, patch_call = env.sandbox_client.execute_command.await_args_list
+    assert "GIT_INDEX_FILE" in snapshot_call.args[1]
+    assert "GIT_INDEX_FILE" in patch_call.args[1]
+    assert "commit" not in snapshot_call.args[1]
+    assert "commit" not in patch_call.args[1]
+    assert (
+        'diff --binary --full-index --text tree123 "$current_tree"'
+        in patch_call.args[1]
+    )
+    assert snapshot_call.kwargs == {"working_dir": "/testbed", "timeout": 120}
+    assert patch_call.kwargs == {"working_dir": "/testbed", "timeout": 120}
+
+
+def test_git_patch_collector_commands_diff_against_post_setup_tree(tmp_path):
+    collector = GitPatchCollector()
+    repo = tmp_path / "repo"
+    repo.mkdir()
+    (repo / "pkg.py").write_text("VALUE = 'base'\n")
+    (repo / "tests").mkdir()
+    (repo / "tests" / "test_pkg.py").write_text("def test_base():\n    assert True\n")
+
+    subprocess.run(["git", "init", "-b", "main"], cwd=repo, check=True)
+    subprocess.run(["git", "add", "."], cwd=repo, check=True)
+    subprocess.run(
+        [
+            "git",
+            "-c",
+            "user.name=Codex",
+            "-c",
+            "user.email=codex@example.com",
+            "commit",
+            "-m",
+            "base",
+        ],
+        cwd=repo,
+        check=True,
+    )
+    head_before = subprocess.run(
+        ["git", "rev-parse", "HEAD"],
+        cwd=repo,
+        check=True,
+        capture_output=True,
+        text=True,
+    ).stdout.strip()
+
+    # Task setup may apply benchmark tests before the agent starts. The
+    # baseline tree must include these so they do not appear as agent edits.
+    (repo / "tests" / "test_pkg.py").write_text(
+        "def test_base():\n    assert True\n\n"
+        "def test_setup_patch():\n    assert True\n"
+    )
+    base_tree = subprocess.run(
+        collector.snapshot_command(),
+        cwd=repo,
+        shell=True,
+        check=True,
+        capture_output=True,
+        text=True,
+    ).stdout.strip()
+
+    head_after = subprocess.run(
+        ["git", "rev-parse", "HEAD"],
+        cwd=repo,
+        check=True,
+        capture_output=True,
+        text=True,
+    ).stdout.strip()
+    assert head_after == head_before
+    subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=repo, check=True)
+
+    # Capture a committed source change, an unstaged test edit, and a new
+    # untracked test file. All are agent edits relative to the setup tree.
+    (repo / "pkg.py").write_text("VALUE = 'agent'\n")
+    subprocess.run(["git", "add", "pkg.py"], cwd=repo, check=True)
+    subprocess.run(
+        [
+            "git",
+            "-c",
+            "user.name=Codex",
+            "-c",
+            "user.email=codex@example.com",
+            "commit",
+            "-m",
+            "agent source edit",
+        ],
+        cwd=repo,
+        check=True,
+    )
+    (repo / "tests" / "test_pkg.py").write_text(
+        "def test_base():\n    assert True\n\n"
+        "def test_setup_patch():\n    assert True\n\n"
+        "def test_agent_patch():\n    assert False\n"
+    )
+    (repo / "tests" / "test_new.py").write_text(
+        "def test_new_agent_file():\n    assert False\n"
+    )
+
+    patch = subprocess.run(
+        collector.diff_command(base_tree),
+        cwd=repo,
+        shell=True,
+        check=True,
+        capture_output=True,
+        text=True,
+    ).stdout
+
+    assert "diff --git a/pkg.py b/pkg.py" in patch
+    assert "+VALUE = 'agent'" in patch
+    assert "diff --git a/tests/test_pkg.py b/tests/test_pkg.py" in patch
+    assert "+def test_agent_patch():" in patch
+    assert "diff --git a/tests/test_new.py b/tests/test_new.py" in patch
+    assert "+def test_new_agent_file():" in patch
+    assert "+def test_setup_patch():" not in patch
+
+    subprocess.run(["git", "diff", "--cached", "--quiet"], cwd=repo, check=True)
+
+
 # ── install_env ──────────────────────────────────────────────────────────
 
 

diff --git a/tests/test_rlm_composable_env.py b/tests/test_rlm_composable_env.py
@@ -20,6 +20,7 @@
 )
 from verifiers.envs.experimental.composable import (
     ComposableEnv,
+    GitPatchCollector,
     Harness,
     SandboxSpec,
     SandboxTaskSet,
@@ -206,6 +207,10 @@ def test_rlm_harness_uses_explicit_local_checkout(tmp_path):
     assert harness.metrics_key == "metrics"
     assert harness.metrics_prefix == "rlm_"
     assert harness.skills_path == "/task/rlm-skills"
+    assert harness.state_collectors is not None
+    assert len(harness.state_collectors) == 1
+    assert isinstance(harness.state_collectors[0], GitPatchCollector)
+    assert harness.state_collectors[0].state_key == "agent_patch"
 
 
 def test_resolve_local_checkout_rejects_missing_explicit_path(tmp_path):

diff --git a/verifiers/envs/experimental/composable/README.md b/verifiers/envs/experimental/composable/README.md
@@ -14,7 +14,7 @@ Separates **what to solve** (the task) from **how to solve it** (the agent) by r
 
 **Harness** — agent-side configuration. Declares how to install and run an agent binary, and where it expects to find task-provided content (instruction, system prompt).
 
-**ComposableEnv** — a `CliAgentEnv` subclass that wires a TaskSet + Harness. Inherits all interception machinery unchanged. Supports `install_env` for install-only environment variables, automatic upload of task-declared directories (via `TaskSet.get_upload_dirs()`), and harness-declared metrics collection (via `Harness.metrics_path`).
+**ComposableEnv** — a `CliAgentEnv` subclass that wires a TaskSet + Harness. Inherits all interception machinery unchanged. Supports `install_env` for install-only environment variables, automatic upload of task-declared directories (via `TaskSet.get_upload_dirs()`), harness-declared metrics collection (via `Harness.metrics_path`), and harness-owned state collectors for rollout artifacts (via `Harness.state_collectors`).
 
 **Skills** are first-class: any taskset with a sibling `skills/` directory gets automatic upload for free. `TaskSet.get_skills_dir()` auto-discovers it, and `get_upload_dirs()` includes it under the `"skills"` key by default. The harness's `upload_dir_mapping` decides where skills land in the sandbox (e.g. RLM puts them at `/task/rlm-skills`).
 
@@ -167,7 +167,7 @@ ComposableEnv subclasses `CliAgentEnv` without modifying it. It overrides these
 - **`get_sandbox_resources(state)`** — reads CPU, memory, GPU from `SandboxSpec`
 - **`build_env_vars(state)`** — merges task env vars and exports `AGENT_WORKDIR`
 - **`post_sandbox_setup(state)`** — runs task setup, uploads instruction + system prompt, installs agent
-- **`post_rollout(state)`** — collects agent logs (scoring is done by the rubric)
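+- **`post_rollout(state)`** — runs each harness state collector's `post_rollout` hook, then collects agent logs (scoring is done by the rubric). The collectors' `post_sandbox_setup` hooks run at the end of `post_sandbox_setup(state)`.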
 
 Everything else — tunnel, HTTP interception, background job polling, and streaming — is inherited from `CliAgentEnv` unchanged.
 

diff --git a/verifiers/envs/experimental/composable/__init__.py b/verifiers/envs/experimental/composable/__init__.py
@@ -5,7 +5,8 @@
     SandboxTaskSet,
     discover_sibling_dir,
 )
-from verifiers.envs.experimental.composable.harness import Harness
+from verifiers.envs.experimental.composable.harness import Harness, StateCollector
+from verifiers.envs.experimental.composable.state_collectors import GitPatchCollector
 from verifiers.envs.experimental.composable.composable_env import ComposableEnv
 
 __all__ = [
@@ -14,6 +15,8 @@
     "TaskSet",
     "SandboxTaskSet",
     "Harness",
+    "StateCollector",
+    "GitPatchCollector",
     "ComposableEnv",
     "discover_sibling_dir",
 ]
diff --git a/verifiers/envs/experimental/composable/composable_env.py b/verifiers/envs/experimental/composable/composable_env.py
@@ -256,6 +256,7 @@ async def post_sandbox_setup(self, state: State) -> None:
         await self._after_harness_inputs_uploaded(state)
         await self._install_agent(sandbox_id)
         await self._run_post_install(sandbox_id)
+        await self._run_state_collectors("post_sandbox_setup", state)
 
     async def post_rollout(self, state: State) -> None:
         """Collect agent logs and harness metrics after the agent finishes.
@@ -265,6 +266,9 @@ async def post_rollout(self, state: State) -> None:
         stays alive for the rubric to run tests / read files.
         """
         sandbox_id = state.get("sandbox_id")
+        if sandbox_id:
+            await self._run_state_collectors("post_rollout", state)
+
         if sandbox_id and self.harness.log_path and "agent_logs" not in state:
             try:
                 log_path = shlex.quote(self.harness.log_path)
@@ -282,6 +286,15 @@ async def post_rollout(self, state: State) -> None:
 
         await super().post_rollout(state)
 
+    async def _run_state_collectors(self, hook: str, state: State) -> None:
+        collectors = self.harness.state_collectors or []
+        for collector in collectors:
+            try:
+                await getattr(collector, hook)(self, state)
+            except Exception as e:
+                name = type(collector).__name__
+                self.logger.warning(f"{name}.{hook} failed: {e}")
+
     async def _populate_sandbox_context(self, state: State) -> None:
         """Populate sandbox-specific context used by setup/evaluate hooks."""
         state["sandbox_client"] = self.sandbox_client

diff --git a/verifiers/envs/experimental/composable/harness.py b/verifiers/envs/experimental/composable/harness.py
@@ -19,13 +19,22 @@
 from dataclasses import dataclass
 from importlib.abc import Traversable
 from pathlib import Path
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Callable, Protocol
 
 if TYPE_CHECKING:
+    from verifiers.envs.experimental.composable.composable_env import ComposableEnv
     from verifiers.envs.experimental.composable.task import SandboxSpec
     from verifiers.types import State, TrajectoryStep
 
 
+class StateCollector(Protocol):
+    """Harness-owned lifecycle hook for writing rollout artifacts into state."""
+
+    async def post_sandbox_setup(self, env: ComposableEnv, state: State) -> None: ...
+
+    async def post_rollout(self, env: ComposableEnv, state: State) -> None: ...
+
+
 @dataclass
 class Harness:
     """Agent-side configuration.
@@ -123,6 +132,10 @@ class Harness:
         trajectory the trainer sees — e.g. rlm_harness uses it to drop
         sub-agent calls (``X-RLM-Depth`` header > 0) so only the
         parent agent's turns contribute to the policy gradient.
+    state_collectors:
+        Optional harness-owned artifact collectors. ``ComposableEnv`` runs
+        each one, in order, after sandbox setup and after rollout (before
+        scoring mutates the task sandbox); failures are logged, not raised.
     """
 
     install_script: str | None = None
@@ -147,6 +160,7 @@ class Harness:
     keep_trajectory_step: (
         Callable[[TrajectoryStep, State, dict[str, str]], bool] | None
     ) = None
+    state_collectors: list[StateCollector] | None = None
 
     def get_effective_upload_dir_mapping(self) -> dict[str, str] | None:
         """Return the merged upload mapping (skills_path + upload_dir_mapping)."""

diff --git a/verifiers/envs/experimental/composable/harnesses/rlm.py b/verifiers/envs/experimental/composable/harnesses/rlm.py
@@ -9,7 +9,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, Callable
 
-from verifiers.envs.experimental.composable import Harness
+from verifiers.envs.experimental.composable import GitPatchCollector, Harness
 from verifiers.envs.experimental.utils.git_checkout_cache import (
     resolve_git_checkout,
     validate_git_checkout,
@@ -203,6 +203,7 @@ def env_vars_for_rollout(state: State) -> dict[str, str]:
         tool_names=tool_names,
         environment_vars=env_vars_for_rollout,
         keep_trajectory_step=keep_trajectory_step,
+        state_collectors=[GitPatchCollector()],
     )