diff --git a/assets/lab/environments/AGENTS.md b/assets/lab/environments/AGENTS.md
index c606b5798..14095d742 100644
--- a/assets/lab/environments/AGENTS.md
+++ b/assets/lab/environments/AGENTS.md
@@ -911,6 +911,7 @@ Newer and more experimental environment classes include:
     )
     ```
 - **V1 `vf.Env` / `vf.Taskset` / `vf.Harness`** — preferred taskset/harness pattern for composing task data and program execution without subclassing. Use this for new environments that need reusable tasksets, reusable harnesses, config-driven metrics, rewards, toolsets, users, endpoint interception, or sandboxed Python/command programs. `vf.Taskset` owns train/eval rows, prompt shaping, setup/update/reward hooks, and toolsets. `vf.Harness` owns the framework program, endpoint proxy, model controls, sandbox options, and runtime hooks. `vf.Env` wires them into the standard evaluation and training surface.
+- **`SWEDebugEnv`** — no-agent debugger for SWE-style `SandboxTaskSet` instances. It creates the task sandbox, optionally runs `taskset.setup(state)`, performs one debug step (`none`, `gold_patch`, `command`, or `script`), and optionally runs the task tests and scorer. It records setup, sandbox creation, gold patch, debug command, and test timings in state for validation and timing investigations.
 - **`HarborEnv`** — loads Harbor-format agent benchmark tasks
 - **`RLMEnv`** — implements [Recursive Language Models](https://alexzhang13.github.io/blog/2025/rlm/) for unbounded context processing via REPL-based decomposition and recursive sub-LLM calls
 - **`OpenCodeEnv`** — runs [OpenCode](https://opencode.ai) CLI agents inside sandboxes with API call interception
diff --git a/docs/environments.md b/docs/environments.md
index 3a6c4dce5..00247f98b 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -905,6 +905,7 @@ Newer and more experimental environment classes include:
     )
     ```
 - **V1 `vf.Env` / `vf.Taskset` / `vf.Harness`** — preferred taskset/harness pattern for composing task data and program execution without subclassing. Use this for new environments that need reusable tasksets, reusable harnesses, config-driven metrics, rewards, toolsets, users, endpoint interception, or sandboxed Python/command programs. `vf.Taskset` owns train/eval rows, prompt shaping, setup/update/reward hooks, and toolsets. `vf.Harness` owns the framework program, endpoint proxy, model controls, sandbox options, and runtime hooks. `vf.Env` wires them into the standard evaluation and training surface.
+- **`SWEDebugEnv`** — no-agent debugger for SWE-style `SandboxTaskSet` instances. It creates the task sandbox, optionally runs `taskset.setup(state)`, performs one debug step (`none`, `gold_patch`, `command`, or `script`), and optionally runs the task tests and scorer. It records setup, sandbox creation, gold patch, debug command, and test timings in state for validation and timing investigations.
 - **`HarborEnv`** — loads Harbor-format agent benchmark tasks
 - **`RLMEnv`** — implements [Recursive Language Models](https://alexzhang13.github.io/blog/2025/rlm/) for unbounded context processing via REPL-based decomposition and recursive sub-LLM calls
 - **`OpenCodeEnv`** — runs [OpenCode](https://opencode.ai) CLI agents inside sandboxes with API call interception
diff --git a/docs/reference.md b/docs/reference.md
index 1e41786b2..8ad34c0ff 100644
--- a/docs/reference.md
+++ b/docs/reference.md
@@ -516,6 +516,50 @@ class OpenEnvEnv(MultiTurnEnv):
 
 OpenEnv integration that runs OpenEnv projects in Prime Sandboxes using a prebuilt image manifest (`.build.json`), supports both gym and MCP contracts, and requires a `prompt_renderer` to convert observations into chat messages.
 
+#### SWEDebugEnv
+
+```python
+class SWEDebugEnv(SandboxMixin, MultiTurnEnv):
+    def __init__(
+        self,
+        taskset: SandboxTaskSet,
+        dataset: Any = None,
+        *,
+        run_setup: bool = True,
+        debug_step: Literal["none", "gold_patch", "command", "script"] = "gold_patch",
+        run_tests: bool = True,
+        debug_command: str | None = None,
+        debug_script: str | None = None,
+        debug_script_path: str | None = None,
+        debug_timeout: int | None = None,
+        test_timeout: int = 900,
+        cpu_cores: int | None = None,
+        memory_gb: int | None = None,
+        disk_size_gb: int | None = None,
+        labels: list[str] | None = None,
+        timeout_seconds: float = 1800.0,
+        output_tail_chars: int = 2000,
+        **sandbox_kwargs,
+    ): ...
+```
+
+No-agent debugger for SWE-style `SandboxTaskSet` instances. It creates the task sandbox, optionally runs task setup, runs one debug step (`none`, `gold_patch`, `command`, or `script`), and optionally runs tests and scores the result.
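+
+A minimal construction sketch (illustrative: any `SandboxTaskSet` works here, and `SWERebenchV2TaskSet` is just one option):
+
+```python
+from verifiers.envs.experimental.composable import SWEDebugEnv
+from verifiers.envs.experimental.composable.tasksets.swe.swe_rebench_v2 import (
+    SWERebenchV2TaskSet,
+)
+
+# Validate gold patches end to end: sandbox + setup + gold patch + tests.
+env = SWEDebugEnv(taskset=SWERebenchV2TaskSet(), debug_step="gold_patch", run_tests=True)
+
+# Or run an ad-hoc command in the prepared sandbox instead of patching:
+# env = SWEDebugEnv(taskset=..., debug_step="command", debug_command="git log -1", run_tests=False)
+```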
+
 #### EnvGroup
 
 ```python
diff --git a/environments/AGENTS.md b/environments/AGENTS.md
index 846eae8fb..2c6a3b814 100644
--- a/environments/AGENTS.md
+++ b/environments/AGENTS.md
@@ -911,6 +911,7 @@ Newer and more experimental environment classes include:
     )
     ```
 - **V1 `vf.Env` / `vf.Taskset` / `vf.Harness`** — preferred taskset/harness pattern for composing task data and program execution without subclassing. Use this for new environments that need reusable tasksets, reusable harnesses, config-driven metrics, rewards, toolsets, users, endpoint interception, or sandboxed Python/command programs. `vf.Taskset` owns train/eval rows, prompt shaping, setup/update/reward hooks, and toolsets. `vf.Harness` owns the framework program, endpoint proxy, model controls, sandbox options, and runtime hooks. `vf.Env` wires them into the standard evaluation and training surface.
+- **`SWEDebugEnv`** — no-agent debugger for SWE-style `SandboxTaskSet` instances. It creates the task sandbox, optionally runs `taskset.setup(state)`, performs one debug step (`none`, `gold_patch`, `command`, or `script`), and optionally runs the task tests and scorer. It records setup, sandbox creation, gold patch, debug command, and test timings in state for validation and timing investigations.
 - **`HarborEnv`** — loads Harbor-format agent benchmark tasks
 - **`RLMEnv`** — implements [Recursive Language Models](https://alexzhang13.github.io/blog/2025/rlm/) for unbounded context processing via REPL-based decomposition and recursive sub-LLM calls
 - **`OpenCodeEnv`** — runs [OpenCode](https://opencode.ai) CLI agents inside sandboxes with API call interception
diff --git a/verifiers/envs/experimental/__init__.py b/verifiers/envs/experimental/__init__.py
index 37278b8ee..b3f954858 100644
--- a/verifiers/envs/experimental/__init__.py
+++ b/verifiers/envs/experimental/__init__.py
@@ -8,6 +8,7 @@
     "TaskSet",
     "Harness",
     "ComposableEnv",
+    "SWEDebugEnv",
 ]
 
 
@@ -19,6 +20,7 @@ def __getattr__(name: str):
         "TaskSet": "verifiers.envs.experimental.composable:TaskSet",
         "Harness": "verifiers.envs.experimental.composable:Harness",
         "ComposableEnv": "verifiers.envs.experimental.composable:ComposableEnv",
+        "SWEDebugEnv": "verifiers.envs.experimental.composable:SWEDebugEnv",
     }
     if name in _lazy:
         import importlib
diff --git a/verifiers/envs/experimental/composable/__init__.py b/verifiers/envs/experimental/composable/__init__.py
index 35afc0858..6b537f1bc 100644
--- a/verifiers/envs/experimental/composable/__init__.py
+++ b/verifiers/envs/experimental/composable/__init__.py
@@ -7,6 +7,7 @@
 )
 from verifiers.envs.experimental.composable.harness import Harness
 from verifiers.envs.experimental.composable.composable_env import ComposableEnv
+from verifiers.envs.experimental.composable.swe_debug_env import SWEDebugEnv
 
 __all__ = [
     "SandboxSpec",
@@ -15,5 +16,6 @@
     "SandboxTaskSet",
     "Harness",
     "ComposableEnv",
+    "SWEDebugEnv",
     "discover_sibling_dir",
 ]
diff --git a/verifiers/envs/experimental/composable/swe_debug_env.py b/verifiers/envs/experimental/composable/swe_debug_env.py
new file mode 100644
index 000000000..766e64b10
--- /dev/null
+++ b/verifiers/envs/experimental/composable/swe_debug_env.py
@@ -0,0 +1,335 @@
+"""No-agent debugger for SWE-style SandboxTaskSet instances."""
+
+from __future__ import annotations
+
+import shlex
+import time
+from typing import Any, Literal
+
+import verifiers as vf
+from prime_sandboxes import CreateSandboxRequest
+from verifiers.envs.experimental.sandbox_mixin import (
+    SandboxMixin,
+    SandboxMonitorRubric,
+    SandboxSetupError,
+)
+from verifiers.types import Messages, State
+
+from .task import SandboxTaskSet
+
+DebugStep = Literal["none", "gold_patch", "command", "script"]
+
+
+class SWEDebugRubric(SandboxMonitorRubric):
+    """Reads the reward set by SWEDebugEnv during setup."""
+
+    def __init__(self, **kwargs: Any):
+        super().__init__(**kwargs)
+        self.add_reward_func(self.debug_reward, weight=1.0)
+
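+    # SWEDebugEnv computes the reward in setup_state (there are no agent turns),
+    # so this rubric just reads the value back out of rollout state.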
+    async def debug_reward(self, state: vf.State, **kwargs: Any) -> float:
+        return float(state.get("reward") or 0.0)
+
+
+class SWEDebugEnv(SandboxMixin, vf.MultiTurnEnv):
+    """Create a task sandbox, optionally mutate it, optionally run tests.
+
+    Pipeline:
+    - entry: create sandbox and optionally run ``taskset.setup(state)``
+    - debug step: ``none``, ``gold_patch``, ``command``, or ``script``
+    - exit: optionally run task tests and score them
+    """
+
+    def __init__(
+        self,
+        taskset: SandboxTaskSet,
+        dataset: Any = None,
+        *,
+        run_setup: bool = True,
+        debug_step: DebugStep = "gold_patch",
+        run_tests: bool = True,
+        debug_command: str | None = None,
+        debug_script: str | None = None,
+        debug_script_path: str | None = None,
+        debug_timeout: int | None = None,
+        test_timeout: int = 900,
+        cpu_cores: int | None = None,
+        memory_gb: int | None = None,
+        disk_size_gb: int | None = None,
+        labels: list[str] | None = None,
+        timeout_seconds: float = 1800.0,
+        output_tail_chars: int = 2000,
+        **sandbox_kwargs: Any,
+    ):
+        if debug_step not in ("none", "gold_patch", "command", "script"):
+            raise ValueError(f"Unsupported debug_step: {debug_step!r}")
+        if debug_step == "command" and not debug_command:
+            raise ValueError("debug_command is required when debug_step='command'")
+        if debug_step == "script" and not (debug_script or debug_script_path):
+            raise ValueError(
+                "debug_script or debug_script_path is required when debug_step='script'"
+            )
+
+        self.taskset = taskset
+        self.run_setup = run_setup
+        self.debug_step = debug_step
+        self.run_tests = run_tests
+        self.debug_command = debug_command
+        self.debug_script = debug_script
+        self.debug_script_path = debug_script_path
+        self.debug_timeout = debug_timeout
+        self.test_timeout = test_timeout
+        self._cpu_cores = cpu_cores
+        self._memory_gb = memory_gb
+        self._disk_size_gb = disk_size_gb
+        self.labels = labels or ["swe-debug"]
+        self.timeout_seconds = timeout_seconds
+        self.output_tail_chars = output_tail_chars
+
+        super().__init__(
+            dataset=dataset or taskset.get_dataset,
+            rubric=SWEDebugRubric(),
+            timeout_seconds=timeout_seconds,
+        )
+        self.init_sandbox_client(**sandbox_kwargs)
+
+    async def env_response(
+        self, messages: Messages, state: State, **kwargs: Any
+    ) -> Messages:
+        raise NotImplementedError("SWEDebugEnv does not use multi-turn interaction")
+
+    async def setup_state(self, state: State) -> None:
+        await super().setup_state(state)
+        state["attempts"] = state.get("attempts", 0) + 1
+        state["debug_step"] = self.debug_step
+        state["run_setup"] = self.run_setup
+        state["run_tests"] = self.run_tests
+
+        t0 = time.perf_counter()
+        valid = False
+        exc: BaseException | None = None
+        try:
+            await self._create_task_sandbox(state)
+            valid = await self._run_debug_pipeline(state)
+        except Exception as e:  # noqa: BLE001
+            exc = e
+            state["error"] = vf.SandboxError(f"SWE debug failed: {repr(e)}")
+            state["reward"] = 0.0
+        finally:
+            state["elapsed_s"] = time.perf_counter() - t0
+
+        reason, tail = self._classify_outcome(valid, exc, state)
+        state.setdefault("reason", reason)
+        if tail:
+            state["test_output_tail"] = tail
+
+    async def _create_task_sandbox(self, state: State) -> None:
+        info = state["info"]
+        spec = self.taskset.get_sandbox_spec(info)
+        timeout_minutes = (
+            spec.timeout_minutes
+            if spec.timeout_minutes is not None
+            else self.compute_sandbox_timeout_minutes()
+        )
+        request = CreateSandboxRequest(
+            name=f"swe-debug-{state.get('example_id', 'unknown')}",
+            docker_image=spec.image,
+            cpu_cores=spec.cpu_cores if self._cpu_cores is None else self._cpu_cores,
+            memory_gb=spec.memory_gb if self._memory_gb is None else self._memory_gb,
+            disk_size_gb=spec.disk_size_gb
+            if self._disk_size_gb is None
+            else self._disk_size_gb,
+            gpu_count=spec.gpu_count,
+            gpu_type=spec.gpu_type,
+            vm=spec.gpu_count > 0,
+            timeout_minutes=timeout_minutes,
+            environment_vars=self.taskset.get_env_vars() or None,
+            labels=self.labels,
+        )
+        t0 = time.perf_counter()
+        await self.create_sandbox(state, request)
+        state["sandbox_create_s"] = time.perf_counter() - t0
+
+    async def post_sandbox_setup(self, state: State) -> None:
+        state["sandbox_client"] = self.sandbox_client
+        state["test_timeout"] = self.test_timeout
+        state["run_background_job"] = self.run_background_job
+        if not self.run_setup:
+            state["setup_s"] = 0.0
+            return
+        t0 = time.perf_counter()
+        await self.taskset.setup(state)
+        state["setup_s"] = time.perf_counter() - t0
+
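+    # Timing keys recorded in state: sandbox_create_s (sandbox boot), setup_s
+    # (taskset setup), body_s (debug step), gold_apply_s, debug_run_s,
+    # test_run_s, and elapsed_s (the whole pipeline).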
+    async def _run_debug_pipeline(self, state: State) -> bool:
+        t0 = time.perf_counter()
+        if self.debug_step == "gold_patch":
+            await self._apply_gold_patch(state)
+        elif self.debug_step == "command":
+            valid = await self._run_debug_command(state, self.debug_command or "")
+            if not valid:
+                state["body_s"] = time.perf_counter() - t0
+                return False
+        elif self.debug_step == "script":
+            valid = await self._run_debug_script(state)
+            if not valid:
+                state["body_s"] = time.perf_counter() - t0
+                return False
+        state["body_s"] = time.perf_counter() - t0
+
+        if self.run_tests:
+            return await self._run_tests(state)
+
+        state["reward"] = 1.0
+        state["reason"] = "pass"
+        return True
+
+    async def _apply_gold_patch(self, state: State) -> None:
+        apply_gold_patch = getattr(self.taskset, "_apply_gold_patch", None)
+        if apply_gold_patch is None:
+            raise RuntimeError("Taskset does not support gold patch application")
+        t0 = time.perf_counter()
+        await apply_gold_patch(state["sandbox_client"], state["sandbox_id"], state)
+        state["gold_apply_s"] = time.perf_counter() - t0
+
+    async def _run_debug_command(self, state: State, command: str) -> bool:
+        return await self._execute_debug_command(state, command)
+
+    async def _run_debug_script(self, state: State) -> bool:
+        sandbox_id = state["sandbox_id"]
+        remote_path = "/tmp/swe_debug_script.sh"
+        if self.debug_script_path:
+            await self.upload_file(sandbox_id, remote_path, self.debug_script_path)
+        else:
+            await self.upload_content(sandbox_id, self.debug_script or "", remote_path)
+        command = f"chmod +x {remote_path} && {shlex.quote(remote_path)}"
+        return await self._execute_debug_command(state, command)
+
+    async def _execute_debug_command(self, state: State, command: str) -> bool:
+        t0 = time.perf_counter()
+        result = await self.sandbox_client.execute_command(
+            state["sandbox_id"],
+            command,
+            working_dir=self._workdir(state),
+            timeout=(
+                self.debug_timeout
+                if self.debug_timeout is not None
+                else self.test_timeout
+            ),
+        )
+        state["debug_run_s"] = time.perf_counter() - t0
+        state["debug_exit_code"] = result.exit_code
+        stdout = result.stdout or ""
+        stderr = result.stderr or ""
+        if stdout:
+            state["debug_stdout_tail"] = stdout[-self.output_tail_chars :]
+        if stderr:
+            state["debug_stderr_tail"] = stderr[-self.output_tail_chars :]
+        if result.exit_code == 0:
+            return True
+        state["reward"] = 0.0
+        state["reason"] = "debug_command_failed"
+        return False
+
+    async def _run_tests(self, state: State) -> bool:
+        run_tests = getattr(self.taskset, "_run_tests", None)
+        calculate_reward = getattr(self.taskset, "_calculate_reward", None)
+        if run_tests is None or calculate_reward is None:
+            raise RuntimeError("Taskset does not support direct test execution")
+        t0 = time.perf_counter()
+        test_output = await run_tests(
+            state["sandbox_client"],
+            state["sandbox_id"],
+            state,
+            state.get("test_timeout", self.test_timeout),
+        )
+        state["test_run_s"] = time.perf_counter() - t0
+        state["test_output"] = test_output
+        reward = float(calculate_reward(test_output, state.get("info") or {}))
+        state["reward"] = reward
+        valid = reward > 0
+        state["reason"] = "pass" if valid else "test_failed"
+        return valid
+
+    def _workdir(self, state: State) -> str:
+        return self.taskset.get_workdir(state.get("info") or {})
+
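+    # Outcome reasons: pass, test_failed, debug_command_failed, timeout,
+    # billing_error, setup_failed, sandbox_error, gold_apply_failed, and
+    # unsupported_action. A reason already set by an earlier step wins.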
+    def _classify_outcome(
+        self, valid: bool, exc: BaseException | None, state: State
+    ) -> tuple[str, str | None]:
+        test_output = state.get("test_output")
+        tail = (
+            test_output[-self.output_tail_chars :]
+            if isinstance(test_output, str) and test_output
+            else None
+        )
+        if valid:
+            return "pass", tail
+        if state.get("reason"):
+            return str(state["reason"]), tail
+        if exc is None:
+            return "test_failed", tail
+
+        from prime_sandboxes import (
+            APIError,
+            APITimeoutError,
+            CommandTimeoutError,
+            DownloadTimeoutError,
+            PaymentRequiredError,
+            SandboxImagePullError,
+            SandboxNotRunningError,
+            SandboxTimeoutError,
+            UploadTimeoutError,
+        )
+
+        if isinstance(exc, PaymentRequiredError):
+            return "billing_error", tail
+        if isinstance(
+            exc,
+            (
+                TimeoutError,
+                APITimeoutError,
+                CommandTimeoutError,
+                DownloadTimeoutError,
+                SandboxTimeoutError,
+                UploadTimeoutError,
+            ),
+        ):
+            return "timeout", tail
+        if isinstance(exc, SandboxSetupError):
+            return "setup_failed", tail
+        if isinstance(
+            exc,
+            (
+                vf.InfraError,
+                APIError,
+                SandboxImagePullError,
+                SandboxNotRunningError,
+            ),
+        ):
+            return "sandbox_error", tail
+
+        msg = str(exc).lower()
+        if "apply failed" in msg or "patch failed" in msg or "no gold patch" in msg:
+            return "gold_apply_failed", tail
+        if "does not support" in msg:
+            return "unsupported_action", tail
+        return "setup_failed", tail
+
+    @vf.stop
+    async def debug_completed(self, state: State) -> bool:
+        return True
+
+    @vf.cleanup
+    async def destroy_sandbox(self, state: State) -> None:
+        sandbox_id = state.get("sandbox_id")
+        if sandbox_id:
+            await self.delete_sandbox(sandbox_id)
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py b/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py
index 68f9e1e89..b57c73e0e 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/multi_swe.py
@@ -177,8 +177,6 @@ def __init__(
         self,
         dataset_name: str = "PrimeIntellect/Multi-SWE-RL",
         split: str = "train",
-        exclude_langs: tuple[str, ...] = ("c", "cpp"),
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
@@ -194,8 +192,6 @@
         """
         self.dataset_name = dataset_name
         self.split = split
-        self.exclude_langs = tuple(exclude_langs)
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
@@ -219,12 +215,6 @@ def _build_dataset(self) -> Any:
             keep_in_memory=self.ds_keep_in_memory,
             num_proc=self.ds_num_proc,
         )
-        if self.exclude_langs:
-            excluded = frozenset(self.exclude_langs)
-            dataset = dataset.filter(lambda x: x.get("lang") not in excluded, **_kw)
-        if self.filter_repos:
-            filter_set = frozenset(self.filter_repos)
-            dataset = dataset.filter(lambda x: x.get("repo") not in filter_set, **_kw)
         # Use num_proc=1 for map: the output nests original rows inside an "info" dict,
         # and multiprocess map re-infers types per shard. Shards where all list columns
         # (e.g. skipped_tests) are empty get List(null) instead of List(string), causing
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/openswe.py b/verifiers/envs/experimental/composable/tasksets/swe/openswe.py
index d179ebc4e..0acf54393 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/openswe.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/openswe.py
@@ -80,7 +80,6 @@ def __init__(
         self,
         dataset_name: str = "GAIR/OpenSWE",
         config: str = "openswe_oss",
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
@@ -96,7 +95,6 @@
         """
         self.dataset_name = dataset_name
         self.config = config
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
@@ -119,9 +117,6 @@ def _build_dataset(self) -> Any:
             keep_in_memory=self.ds_keep_in_memory,
             num_proc=self.ds_num_proc,
         )
-        if self.filter_repos:
-            filter_set = frozenset(self.filter_repos)
-            dataset = dataset.filter(lambda x: x.get("repo") not in filter_set, **_kw)
         return dataset.map(_process_example, remove_columns=dataset.column_names, **_kw)
 
     def get_instruction(self, info: dict) -> str:
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py b/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py
index 4564bf4d3..bfe3fec41 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/r2e_gym.py
@@ -179,7 +179,6 @@ def __init__(
         dataset_name: str = "R2E-Gym/R2E-Gym-Subset",
         repo_path: str = "/testbed",
         alt_path: str = "/root",
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
@@ -206,7 +205,6 @@
         self.dataset_name = dataset_name
         self.repo_path = repo_path
         self.alt_path = alt_path
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
@@ -231,11 +229,6 @@ def _build_dataset(self) -> Any:
             keep_in_memory=self.ds_keep_in_memory,
             num_proc=self.ds_num_proc,
         )
-        if self.filter_repos:
-            filter_set = frozenset(self.filter_repos)
-            dataset = dataset.filter(
-                lambda x: x.get("repo_name") not in filter_set, **_kw
-            )
         return dataset.map(_process_example, remove_columns=dataset.column_names, **_kw)
 
     def get_instruction(self, info: dict) -> str:
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py b/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py
index 25166642e..df4101018 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/swe_bench.py
@@ -351,7 +351,6 @@ def __init__(
         self,
         dataset_name: str = "princeton-nlp/SWE-bench_Verified",
         skip_install: bool = True,
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
@@ -367,7 +366,6 @@
         """
         self.dataset_name = dataset_name
         self.skip_install = skip_install
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
@@ -392,12 +390,6 @@ def _build_dataset(self) -> Any:
             keep_in_memory=self.ds_keep_in_memory,
             num_proc=self.ds_num_proc,
         )
-        if self.filter_repos:
-            filter_set = set(self.filter_repos)
-            dataset = dataset.filter(
-                lambda x: filter_set.isdisjoint((x.get("repo"), x.get("repo_name"))),
-                **_kw,
-            )
         return dataset.map(_process_example, remove_columns=dataset.column_names, **_kw)
 
     def get_instruction(self, info: dict) -> str:
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py b/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py
index 0499fdc2d..ce1ebdef6 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/swe_lego.py
@@ -193,7 +193,6 @@ def __init__(
         self,
         dataset_name: str = "PrimeIntellect/SWE-Lego-Real-Data",
         split: str = "resolved",
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
@@ -209,7 +208,6 @@
         """
         self.dataset_name = dataset_name
         self.split = split
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
@@ -231,9 +229,6 @@ def _build_dataset(self) -> Any:
             keep_in_memory=self.ds_keep_in_memory,
             num_proc=self.ds_num_proc,
         )
-        if self.filter_repos:
-            filter_set = frozenset(self.filter_repos)
-            dataset = dataset.filter(lambda x: x.get("repo") not in filter_set, **_kw)
         # Some datasets (e.g. Real-Data) have struct columns with variable sub-keys
         # (e.g. install_config.JUPYTER_PLATFORM_DIRS). Arrow can't infer a consistent
         # schema across batches, so pre-serialize them to JSON strings.
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py b/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py
index 64c96d17a..a88df4a6a 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/swe_rebench_v2.py
@@ -225,26 +225,21 @@ class SWERebenchV2TaskSet(SandboxTaskSet):
 
     def __init__(
         self,
-        language: str | None = None,
         dataset_name: str = DATASET_NAME,
         split: str = "train",
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
         timeout_minutes: int | None = None,
     ):
-        self.language = language
         self.dataset_name = dataset_name
         self.split = split
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
-        suffix = f"-{language}" if language else ""
         super().__init__(
             dataset=self._build_dataset,
-            name=f"swe/swerebench-v2{suffix}",
+            name="swe/swerebench-v2",
             filter_fn=filter_fn,
         )
 
@@ -260,12 +255,6 @@ def _build_dataset(self) -> Any:
             keep_in_memory=self.ds_keep_in_memory,
             num_proc=self.ds_num_proc,
         )
-        if self.language:
-            lang = self.language
-            dataset = dataset.filter(lambda x: x.get("language") == lang, **_kw)
-        if self.filter_repos:
-            filter_set = frozenset(self.filter_repos)
-            dataset = dataset.filter(lambda x: x.get("repo") not in filter_set, **_kw)
         # Row-level dicts (``install_config``, ``meta``) have variable
         # sub-schemas; Arrow can't infer a consistent struct type across
         # batches, so pre-serialize them to JSON strings. Same trick as
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py b/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py
index d40684f00..ea2cdb822 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/swe_smith.py
@@ -185,7 +185,6 @@ def __init__(
         language: str = "py",
         dataset_name: str | None = None,
         split: str = "train",
-        filter_repos: list[str] | None = None,
         filter_fn: str | None = None,
         ds_num_proc: int | None = None,
         ds_keep_in_memory: bool = True,
@@ -199,7 +198,6 @@
         self.language = language
         self.dataset_name = dataset_name or LANGUAGE_TO_DATASET[language]
         self.split = split
-        self.filter_repos = filter_repos
         self.ds_num_proc = ds_num_proc
         self.ds_keep_in_memory = ds_keep_in_memory
         self.timeout_minutes = timeout_minutes
@@ -236,10 +234,6 @@ def _has_profile(x: dict) -> bool:
 
         dataset = dataset.filter(_has_profile, **_kw)
 
-        if self.filter_repos:
-            filter_set = frozenset(self.filter_repos)
-            dataset = dataset.filter(lambda x: x.get("repo") not in filter_set, **_kw)
-
         return dataset.map(_process_example, remove_columns=dataset.column_names, **_kw)
 
     def get_instruction(self, info: dict) -> str:
diff --git a/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py b/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py
index 49c47ff6a..b18b8fe56 100644
--- a/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py
+++ b/verifiers/envs/experimental/composable/tasksets/swe/swe_tasksets.py
@@ -94,8 +94,10 @@ def make_swelego_real_taskset(**kwargs: Any) -> TaskSet:
 def make_swerebench_v2_taskset(**kwargs: Any) -> TaskSet:
     """SWE-rebench-V2 TaskSet (nebius/SWE-rebench-V2, 32k rows, 20 languages).
 
-    Pass ``language=`` to filter to a single
-    language; omit for the full cross-language mix.
+    Use ``filter_fn`` to filter rows, for example
+    ``"lambda x: x['info']['language'] == 'python'"`` for one language.
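+    Repo filtering works the same way, e.g.
+    ``"lambda x: x['info']['repo'] not in {'owner/repo'}"``.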
     """
     from verifiers.envs.experimental.composable.tasksets.swe.swe_rebench_v2 import (
         SWERebenchV2TaskSet,