Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 196 additions & 0 deletions tests/test_composable_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import importlib
import json
import subprocess
from types import SimpleNamespace
from unittest.mock import AsyncMock, call

Expand All @@ -10,6 +11,7 @@
import verifiers as vf
from verifiers.envs.experimental.composable import (
ComposableEnv,
GitPatchCollector,
Harness,
SandboxSpec,
SandboxTaskSet,
Expand Down Expand Up @@ -72,6 +74,21 @@ def get_rubric(self):
return MockMathRubric()


class RecordingStateCollector:
    """Test double that logs every lifecycle hook it receives.

    Each hook appends a ``(hook_name, workdir)`` pair to ``events`` and sets
    a marker flag in ``state`` so tests can check both the call order and
    that the collector was handed the live rollout state.
    """

    def __init__(self):
        # Ordered log of (hook name, workdir) pairs, one per hook call.
        self.events = []

    def _record(self, hook_name, flag_key, env, state):
        """Log one hook invocation, then flag it in ``state``."""
        # Resolve the workdir first: if the lookup raises, nothing is
        # recorded and the flag stays unset.
        workdir = env.taskset.get_workdir(state["info"])
        self.events.append((hook_name, workdir))
        state[flag_key] = True

    async def post_sandbox_setup(self, env, state):
        """Record the post-setup hook and set ``state["collector_setup"]``."""
        self._record("post_sandbox_setup", "collector_setup", env, state)

    async def post_rollout(self, env, state):
        """Record the post-rollout hook and set ``state["collector_rollout"]``."""
        self._record("post_rollout", "collector_rollout", env, state)


def _make_dataset(n=3):
from datasets import Dataset

Expand Down Expand Up @@ -263,6 +280,185 @@ async def test_composable_env_quotes_log_path_when_collecting_logs():
assert state["agent_logs"] == "agent log"


@pytest.mark.asyncio
async def test_composable_env_runs_state_collectors():
    """ComposableEnv forwards both lifecycle hooks to harness collectors."""
    taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
    recorder = RecordingStateCollector()
    harness = Harness(run_command="true", state_collectors=[recorder])
    env = ComposableEnv(taskset=taskset, harness=harness)

    # Stub out everything that would touch a real sandbox: every command
    # "succeeds" with empty output, and setup/upload are no-op mocks.
    ok_result = SimpleNamespace(stdout="", stderr="", exit_code=0)

    def _noop_teardown():
        return None

    env.sandbox_client = SimpleNamespace(
        execute_command=AsyncMock(return_value=ok_result),
        teardown=_noop_teardown,
    )
    env.taskset.setup = AsyncMock()
    env.upload_content = AsyncMock()

    state = {"sandbox_id": "sbx", "info": {"id": 0}, "timing": {"total_ms": 0}}

    await env.post_sandbox_setup(state)
    await env.post_rollout(state)

    # Both hooks ran, in lifecycle order, and saw the taskset's workdir.
    expected_events = [
        ("post_sandbox_setup", "/testbed"),
        ("post_rollout", "/testbed"),
    ]
    assert state["collector_setup"] is True
    assert state["collector_rollout"] is True
    assert recorder.events == expected_events


@pytest.mark.asyncio
async def test_git_patch_collector_snapshots_and_collects_patch_state():
    """GitPatchCollector stores the base tree, then the diff, in state.

    The sandbox is mocked: the first command returns a tree id, the second
    returns a canned diff. The test checks what lands in ``state`` and the
    shape of the two shell commands the collector issued.
    """
    taskset = MockSandboxTaskSet(dataset=_make_dataset(), name="test")
    diff = """diff --git a/tests/test_bug.py b/tests/test_bug.py
index 1111111..2222222 100644
--- a/tests/test_bug.py
+++ b/tests/test_bug.py
@@ -1 +1 @@
-old
+new
"""
    collector = GitPatchCollector()

    # Scripted sandbox replies, consumed in order: snapshot, then diff.
    scripted_results = [
        SimpleNamespace(stdout="tree123\n", stderr="", exit_code=0),
        SimpleNamespace(stdout=diff, stderr="", exit_code=0),
    ]
    execute_command = AsyncMock(side_effect=scripted_results)
    env = SimpleNamespace(
        taskset=taskset,
        sandbox_client=SimpleNamespace(execute_command=execute_command),
    )

    state = {"sandbox_id": "sbx", "info": {"id": 0}}

    await collector.post_sandbox_setup(env, state)
    await collector.post_rollout(env, state)

    assert state[collector._base_tree_key] == "tree123"
    assert state[collector._workdir_key] == "/testbed"
    assert state["agent_patch"] == diff

    snapshot_call, patch_call = execute_command.await_args_list
    expected_kwargs = {"working_dir": "/testbed", "timeout": 120}
    for recorded in (snapshot_call, patch_call):
        script = recorded.args[1]
        # Both commands mention GIT_INDEX_FILE and never run a commit.
        assert "GIT_INDEX_FILE" in script
        assert "commit" not in script
        assert recorded.kwargs == expected_kwargs

    # The patch is a tree-to-tree diff against the recorded base tree.
    assert (
        'diff --binary --full-index --text tree123 "$current_tree"'
        in patch_call.args[1]
    )


def test_git_patch_collector_commands_diff_against_post_setup_tree(tmp_path):
    """Run the collector's shell commands against a real git repository.

    Checks that the snapshot command leaves HEAD and the index untouched,
    and that the diff command reports only edits made after the snapshot
    (committed, unstaged and untracked), not setup-time changes.
    """
    collector = GitPatchCollector()
    repo = tmp_path / "repo"
    repo.mkdir()

    # Per-command identity so commits work without global git config.
    identity = ["-c", "user.name=Codex", "-c", "user.email=codex@example.com"]

    def git(*argv):
        """Run a git subcommand in the repo; raise on non-zero exit."""
        subprocess.run(["git", *argv], cwd=repo, check=True)

    def captured(command, **extra):
        """Run ``command`` in the repo and return its stdout as text."""
        finished = subprocess.run(
            command,
            cwd=repo,
            check=True,
            capture_output=True,
            text=True,
            **extra,
        )
        return finished.stdout

    def head_sha():
        """Return the commit id HEAD currently points at."""
        return captured(["git", "rev-parse", "HEAD"]).strip()

    # Test-file snippets, stacked up as setup and the agent "edit" the file.
    base_test = "def test_base():\n assert True\n"
    setup_test = "def test_setup_patch():\n assert True\n"
    agent_test = "def test_agent_patch():\n assert False\n"

    source_file = repo / "pkg.py"
    source_file.write_text("VALUE = 'base'\n")
    (repo / "tests").mkdir()
    test_file = repo / "tests" / "test_pkg.py"
    test_file.write_text(base_test)

    git("init", "-b", "main")
    git("add", ".")
    git(*identity, "commit", "-m", "base")
    head_before = head_sha()

    # Task setup may apply benchmark tests before the agent starts. The
    # baseline tree must include these so they do not appear as agent edits.
    test_file.write_text("\n".join([base_test, setup_test]))
    base_tree = captured(collector.snapshot_command(), shell=True).strip()

    # Snapshotting must move neither HEAD nor the real index.
    head_after = head_sha()
    assert head_after == head_before
    git("diff", "--cached", "--quiet")

    # Capture a committed source change, an unstaged test edit, and a new
    # untracked test file. All are agent edits relative to the setup tree.
    source_file.write_text("VALUE = 'agent'\n")
    git("add", "pkg.py")
    git(*identity, "commit", "-m", "agent source edit")
    test_file.write_text("\n".join([base_test, setup_test, agent_test]))
    (repo / "tests" / "test_new.py").write_text(
        "def test_new_agent_file():\n assert False\n"
    )

    patch = captured(collector.diff_command(base_tree), shell=True)

    agent_fragments = (
        "diff --git a/pkg.py b/pkg.py",
        "+VALUE = 'agent'",
        "diff --git a/tests/test_pkg.py b/tests/test_pkg.py",
        "+def test_agent_patch():",
        "diff --git a/tests/test_new.py b/tests/test_new.py",
        "+def test_new_agent_file():",
    )
    for fragment in agent_fragments:
        assert fragment in patch
    # Setup-time additions belong to the baseline, not to the agent patch.
    assert "+def test_setup_patch():" not in patch

    # Collecting the diff must leave the real index clean as well.
    git("diff", "--cached", "--quiet")


# ── install_env ──────────────────────────────────────────────────────────


Expand Down
5 changes: 5 additions & 0 deletions tests/test_rlm_composable_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
)
from verifiers.envs.experimental.composable import (
ComposableEnv,
GitPatchCollector,
Harness,
SandboxSpec,
SandboxTaskSet,
Expand Down Expand Up @@ -206,6 +207,10 @@ def test_rlm_harness_uses_explicit_local_checkout(tmp_path):
assert harness.metrics_key == "metrics"
assert harness.metrics_prefix == "rlm_"
assert harness.skills_path == "/task/rlm-skills"
assert harness.state_collectors is not None
assert len(harness.state_collectors) == 1
assert isinstance(harness.state_collectors[0], GitPatchCollector)
assert harness.state_collectors[0].state_key == "agent_patch"


def test_resolve_local_checkout_rejects_missing_explicit_path(tmp_path):
Expand Down
4 changes: 2 additions & 2 deletions verifiers/envs/experimental/composable/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Separates **what to solve** (the task) from **how to solve it** (the agent) by r

**Harness** — agent-side configuration. Declares how to install and run an agent binary, and where it expects to find task-provided content (instruction, system prompt).

**ComposableEnv** — a `CliAgentEnv` subclass that wires a TaskSet + Harness. Inherits all interception machinery unchanged. Supports `install_env` for install-only environment variables, automatic upload of task-declared directories (via `TaskSet.get_upload_dirs()`), and harness-declared metrics collection (via `Harness.metrics_path`).
**ComposableEnv** — a `CliAgentEnv` subclass that wires a TaskSet + Harness. Inherits all interception machinery unchanged. Supports `install_env` for install-only environment variables, automatic upload of task-declared directories (via `TaskSet.get_upload_dirs()`), harness-declared metrics collection (via `Harness.metrics_path`), and harness-owned state collectors for rollout artifacts.

**Skills** are first-class: any taskset with a sibling `skills/` directory gets automatic upload for free. `TaskSet.get_skills_dir()` auto-discovers it, and `get_upload_dirs()` includes it under the `"skills"` key by default. The harness's `upload_dir_mapping` decides where skills land in the sandbox (e.g. RLM puts them at `/task/rlm-skills`).

Expand Down Expand Up @@ -167,7 +167,7 @@ ComposableEnv subclasses `CliAgentEnv` without modifying it. It overrides these
- **`get_sandbox_resources(state)`** — reads CPU, memory, GPU from `SandboxSpec`
- **`build_env_vars(state)`** — merges task env vars and exports `AGENT_WORKDIR`
- **`post_sandbox_setup(state)`** — runs task setup, uploads instruction + system prompt, installs agent
- **`post_rollout(state)`** — collects agent logs (scoring is done by the rubric)
- **`post_rollout(state)`** — runs harness state collectors and collects agent logs (scoring is done by the rubric)

Everything else — tunnel, HTTP interception, background job polling, and streaming — is inherited from `CliAgentEnv` unchanged.

Expand Down
5 changes: 4 additions & 1 deletion verifiers/envs/experimental/composable/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
SandboxTaskSet,
discover_sibling_dir,
)
from verifiers.envs.experimental.composable.harness import Harness
from verifiers.envs.experimental.composable.harness import Harness, StateCollector
from verifiers.envs.experimental.composable.state_collectors import GitPatchCollector
from verifiers.envs.experimental.composable.composable_env import ComposableEnv

__all__ = [
Expand All @@ -14,6 +15,8 @@
"TaskSet",
"SandboxTaskSet",
"Harness",
"StateCollector",
"GitPatchCollector",
"ComposableEnv",
"discover_sibling_dir",
]
13 changes: 13 additions & 0 deletions verifiers/envs/experimental/composable/composable_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ async def post_sandbox_setup(self, state: State) -> None:
await self._after_harness_inputs_uploaded(state)
await self._install_agent(sandbox_id)
await self._run_post_install(sandbox_id)
await self._run_state_collectors("post_sandbox_setup", state)

async def post_rollout(self, state: State) -> None:
"""Collect agent logs and harness metrics after the agent finishes.
Expand All @@ -265,6 +266,9 @@ async def post_rollout(self, state: State) -> None:
stays alive for the rubric to run tests / read files.
"""
sandbox_id = state.get("sandbox_id")
if sandbox_id:
await self._run_state_collectors("post_rollout", state)

if sandbox_id and self.harness.log_path and "agent_logs" not in state:
try:
log_path = shlex.quote(self.harness.log_path)
Expand All @@ -282,6 +286,15 @@ async def post_rollout(self, state: State) -> None:

await super().post_rollout(state)

async def _run_state_collectors(self, hook: str, state: State) -> None:
collectors = self.harness.state_collectors or []
for collector in collectors:
try:
await getattr(collector, hook)(self, state)
except Exception as e:
name = type(collector).__name__
self.logger.warning(f"{name}.{hook} failed: {e}")

async def _populate_sandbox_context(self, state: State) -> None:
"""Populate sandbox-specific context used by setup/evaluate hooks."""
state["sandbox_client"] = self.sandbox_client
Expand Down
16 changes: 15 additions & 1 deletion verifiers/envs/experimental/composable/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,22 @@
from dataclasses import dataclass
from importlib.abc import Traversable
from pathlib import Path
from typing import TYPE_CHECKING, Callable
from typing import TYPE_CHECKING, Callable, Protocol

if TYPE_CHECKING:
from verifiers.envs.experimental.composable.composable_env import ComposableEnv
from verifiers.envs.experimental.composable.task import SandboxSpec
from verifiers.types import State, TrajectoryStep


class StateCollector(Protocol):
    """Structural interface for harness-owned rollout artifact collectors.

    An implementation provides two coroutine hooks. Each receives the
    running environment and the mutable rollout ``state`` and is expected
    to write whatever artifacts it gathers into ``state``.
    """

    async def post_sandbox_setup(self, env: ComposableEnv, state: State) -> None:
        """Hook run after sandbox setup.

        Args:
            env: The environment driving the rollout.
            state: Rollout state the collector may read and write.
        """
        ...

    async def post_rollout(self, env: ComposableEnv, state: State) -> None:
        """Hook run after the rollout.

        Args:
            env: The environment driving the rollout.
            state: Rollout state the collector may read and write.
        """
        ...


@dataclass
class Harness:
"""Agent-side configuration.
Expand Down Expand Up @@ -123,6 +132,10 @@ class Harness:
trajectory the trainer sees — e.g. rlm_harness uses it to drop
sub-agent calls (``X-RLM-Depth`` header > 0) so only the
parent agent's turns contribute to the policy gradient.
state_collectors:
Optional harness-owned artifact collectors. ``ComposableEnv``
runs each collector after sandbox setup and after rollout, before
scoring mutates the task sandbox.
"""

install_script: str | None = None
Expand All @@ -147,6 +160,7 @@ class Harness:
keep_trajectory_step: (
Callable[[TrajectoryStep, State, dict[str, str]], bool] | None
) = None
state_collectors: list[StateCollector] | None = None

def get_effective_upload_dir_mapping(self) -> dict[str, str] | None:
"""Return the merged upload mapping (skills_path + upload_dir_mapping)."""
Expand Down
3 changes: 2 additions & 1 deletion verifiers/envs/experimental/composable/harnesses/rlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Callable

from verifiers.envs.experimental.composable import Harness
from verifiers.envs.experimental.composable import GitPatchCollector, Harness
from verifiers.envs.experimental.utils.git_checkout_cache import (
resolve_git_checkout,
validate_git_checkout,
Expand Down Expand Up @@ -203,6 +203,7 @@ def env_vars_for_rollout(state: State) -> dict[str, str]:
tool_names=tool_names,
environment_vars=env_vars_for_rollout,
keep_trajectory_step=keep_trajectory_step,
state_collectors=[GitPatchCollector()],
)


Expand Down
Loading
Loading