vals-ai · tthuwng · Jun 10, 2026 · Jun 25, 2026 · Jun 25, 2026 · JarettForzano
diff --git a/README.md b/README.md
@@ -116,6 +116,29 @@ Subclass `BenchmarkService` and implement its abstract methods. On instantiation
 
 Sandbox setup and live sandbox evaluation require request-scoped `sandbox_provider` config so one hosted service can use different sandbox providers per request. Eval-only retry uses `/ws/evaluate-response` with `{"task_id": "…", "eval_resume_state": {...}, "dataset": "…"}` and does not require a sandbox provider.
 
+#### Sandbox providers
+
+`sandbox_provider` is selected per setup/evaluate-instance request:
+
+```json
+{"type": "daytona", "api_key": "...", "api_url": "...", "target": "..."}
+```
+
+or:
+
+```json
+{"type": "modal", "MODAL_TOKEN_ID": "...", "MODAL_TOKEN_SECRET": "...", "MODAL_ENVIRONMENT": "..."}
+```
+
+Modal credentials are required and resolved per request, exactly like Daytona's `DAYTONA_API_KEY`: the caller carries them in the `sandbox_provider` config so the process that creates and talks to the sandbox (the Valkyrie tracker) has them. The tracker resolves this config from AWS Secrets Manager via `sandbox_provider_secret_name`, so the Modal secret holds `MODAL_TOKEN_ID`, `MODAL_TOKEN_SECRET`, and optional `MODAL_ENVIRONMENT`. Only set `MODAL_ENVIRONMENT` when that Modal environment exists; otherwise Modal uses the active/default environment for the token profile.
+
+Provider compatibility notes:
+
+- Modal currently supports `ImageSource` only. `SnapshotSource` is rejected because Daytona snapshots do not have a Modal equivalent.
+- Modal sandboxes do not expose a disk-size parameter; `Resources.disk` is accepted for schema compatibility but not enforced.
+- Docker-in-sandbox is **not** enabled by default. Benchmarks that need nested Docker can set `resources.enable_docker=true` on their `RetrieveTaskResponse`; this only asks the sandbox provider for Docker support. The benchmark service still owns the Docker-capable image, dockerd startup flags, compose workflow, and cleanup.
+- Transient Modal connection errors are retried with a fixed 2-second wait, for at most three attempts in total, matching the Daytona adapter's provider-level retry shape. Non-zero command exits are not retried: streamed commands still surface them as `SandboxCommandError` carrying the command exit code.
+
 Benchmark services can send `eval_resume_state` updates to the tracker while evaluation is running. The tracker stores the latest value and sends it back on eval-only retry, so the benchmark service can continue evaluation without recreating the original agent sandbox.
 
 Eval-only retry flow:
@@ -198,9 +221,9 @@ Pydantic models used across requests and responses:
 
 - **`RetrieveTaskResponse`** — `source`, `problem_path`, `cwd`, `agent_timeout`, `Resources`
 - **`SandboxSource`** — `ImageSource(type="image", image=...)` or `SnapshotSource(type="snapshot", snapshot=...)`
-- **`SandboxProviderConfig`** — request-scoped provider config selected by `type`; currently `DaytonaProviderConfig(type="daytona", api_key, api_url, target)` or `ModalProviderConfig(type="modal")` (adapter not implemented yet)
-- **`Resources`** — `vcpu`, `memory`, `disk`
-- **`SetupTaskRequest`** / **`EvaluateInstanceRequest`** — `task_id`, `instance_id`, required `sandbox_provider`, `dataset`
+- **`SandboxProviderConfig`** — request-scoped provider config selected by `type`; currently `DaytonaProviderConfig(type="daytona", DAYTONA_API_KEY, DAYTONA_API_URL, DAYTONA_TARGET)` or `ModalProviderConfig(type="modal", MODAL_TOKEN_ID, MODAL_TOKEN_SECRET, MODAL_ENVIRONMENT?)`
+- **`Resources`** — `vcpu`, `memory`, `disk`, `enable_docker` (default `false`; requests provider-level nested Docker support while benchmark code owns dockerd/image/runtime setup)
+- **`SetupTaskRequest`** / **`EvaluateInstanceRequest`** — `task_id`, `instance_id`, optional `sandbox_provider` with Daytona header fallback, `dataset`
 - **`EvaluateResponseRequest`** — `task_id`, `response` or `eval_resume_state`, `dataset`
 - **`FinalScoreResult`** / **`FinalScoreResponse`** — `score` (float), `metadata`, `tasks_evaluated`
 - **`TaskFilter`** — `task_ids` list or `slice_str`; `parse_slice()` converts `"start:stop:step"` to a Python `slice`

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
 
 dependencies = [
     "daytona>=0.189.0",
+    "modal>=1.4.2",
     "fastapi[standard]>=0.135.3",
     "click>=8.3.2",
     "jinja2>=3.1.6",

diff --git a/src/benchmark_service/sandbox/modal.py b/src/benchmark_service/sandbox/modal.py
@@ -1,46 +1,246 @@
 from __future__ import annotations
 
-from collections.abc import AsyncGenerator, Mapping
-from typing import Literal
+import asyncio
+import shlex
+from collections.abc import AsyncGenerator
+from typing import Any, Literal
 
+from modal import App, Client, Image
+from modal import Sandbox as ModalSdkSandbox
+from modal.exception import ConnectionError as ModalConnectionError
+from modal.exception import Error as ModalError
+from modal.exception import NotFoundError as ModalNotFoundError
 from pydantic import BaseModel
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
 
 from benchmark_service.sandbox.types import (
+    ExecResult,
+    ImageSource,
     Sandbox,
+    SandboxCommandError,
+    SandboxConnectionError,
     SandboxCreateRequest,
     SandboxError,
+    SandboxNotFoundError,
     SandboxProvider,
     SandboxQuery,
 )
 
+_APP_NAME = "benchmark-service"
+# Modal terminates a sandbox when its entrypoint exits, so keep a long-lived
+# entrypoint and run task commands through exec.
+_KEEPALIVE = ("/bin/sh", "-lc", "while true; do sleep 3600; done")
+# Adapter-owned max sandbox lifetime; Modal defaults to 5 minutes otherwise.
+_MAX_LIFETIME_SECONDS = 24 * 60 * 60
+
+
+_PROVIDER_RETRY = retry(
+    retry=retry_if_exception_type(SandboxConnectionError),
+    stop=stop_after_attempt(3),
+    wait=wait_fixed(2),
+    reraise=True,
+)
+
 
 class ModalProviderConfig(BaseModel):
     type: Literal["modal"] = "modal"
-
-    @classmethod
-    def from_headers(cls, headers: Mapping[str, str]) -> "ModalProviderConfig":
-        return cls()
+    MODAL_TOKEN_ID: str
+    MODAL_TOKEN_SECRET: str
+    MODAL_ENVIRONMENT: str | None = None
 
     def create_provider(self) -> SandboxProvider:
         return ModalSandboxProvider(self)
 
 
+def _sandbox_error(exc: ModalError) -> SandboxError:
+    """Translate a Modal SDK error into the matching `SandboxError` subclass (generic `SandboxError` otherwise)."""
+    message = str(exc)
+    if isinstance(exc, ModalNotFoundError):
+        return SandboxNotFoundError(message)
+    return SandboxConnectionError(message) if isinstance(exc, ModalConnectionError) else SandboxError(message)
+
+
+def _command(command: str, cwd: str | None, timeout: float | None) -> str:
+    # Mirrors the Daytona adapter: a timed-out command exits with code 124 instead of raising, cwd wraps the
+    # timeout prefix, and stderr is merged into stdout to match the Daytona adapter's combined PTY output.
+    # The group closes on its own line (not "; }") so a trailing comment, "&", or heredoc cannot break it.
+    if timeout is not None:
+        command = f"timeout {timeout:g} {command}"
+    if cwd:
+        command = f"cd {shlex.quote(cwd)} && {command}"
+    return f"{{ {command}\n}} 2>&1"
+
+
+class ModalSandbox(Sandbox):
+    """Sandbox handle wrapping a Modal SDK sandbox; Modal SDK errors are re-raised as SandboxError types."""
+
+    def __init__(self, sandbox: ModalSdkSandbox, name: str | None = None) -> None:
+        self._sandbox = sandbox
+        self._name = name
+
+    @property
+    def id(self) -> str:
+        return self._sandbox.object_id
+
+    @property
+    def name(self) -> str:
+        return self._name or self._sandbox.object_id
+
+    @property
+    def state(self) -> str:
+        # Modal does not expose a cached lifecycle state on the sandbox handle.
+        return "unknown"
+
+    async def exec(self, command: str, *, cwd: str | None = None, timeout: float | None = None) -> ExecResult:
+        """Run `command` to completion and return its exit code together with its output.
+
+        A non-zero exit code is returned in the result rather than raised.
+        """
+        proc = await self._start_process(command, cwd=cwd, timeout=timeout)
+        try:
+            text = "".join([str(piece) async for piece in proc.stdout])
+            status = await proc.wait.aio()
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+        return ExecResult(exit_code=status, output=text)
+
+    @_PROVIDER_RETRY
+    async def _start_process(self, command: str, *, cwd: str | None, timeout: float | None) -> Any:
+        # Retried on SandboxConnectionError; only process start-up is covered, not reading its output.
+        script = _command(command, cwd, timeout)
+        try:
+            return await self._sandbox.exec.aio("/bin/sh", "-lc", script, text=True)
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+
+    async def command(
+        self, command: str, *, cwd: str | None = None, timeout: float | None = None
+    ) -> AsyncGenerator[str, None]:
+        """Yield the command's output chunk by chunk as it is produced.
+
+        Raises SandboxCommandError with the exit code once the stream ends if that code is non-zero.
+        """
+        proc = await self._start_process(command, cwd=cwd, timeout=timeout)
+        try:
+            async for piece in proc.stdout:
+                yield str(piece)
+            status = await proc.wait.aio()
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+        if status != 0:
+            raise SandboxCommandError(status)
+
+    @_PROVIDER_RETRY
+    async def upload_file(self, remote_path: str, content: bytes) -> None:
+        try:
+            # The synchronous filesystem call is pushed to a worker thread via asyncio.to_thread.
+            await asyncio.to_thread(self._sandbox.filesystem.write_bytes, content, remote_path)
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+
+    @_PROVIDER_RETRY
+    async def download_file(self, remote_path: str) -> bytes:
+        try:
+            return bytes(await asyncio.to_thread(self._sandbox.filesystem.read_bytes, remote_path))
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+
+
 class ModalSandboxProvider(SandboxProvider):
     def __init__(self, config: ModalProviderConfig) -> None:
         self._config = config
+        self._client: Client | None = None
+        self._app: App | None = None
 
+    async def _connect(self) -> tuple[Client, App]:
+        if self._client is None or self._app is None:
+            try:
+                client = await Client.from_credentials.aio(self._config.MODAL_TOKEN_ID, self._config.MODAL_TOKEN_SECRET)
+                self._app = await App.lookup.aio(
+                    _APP_NAME,
+                    client=client,
+                    environment_name=self._config.MODAL_ENVIRONMENT,
+                    create_if_missing=True,
+                )
+                self._client = client
+            except ModalError as exc:
+                raise _sandbox_error(exc) from exc
+        return self._client, self._app
+
+    @_PROVIDER_RETRY
     async def create_sandbox(self, request: SandboxCreateRequest) -> Sandbox:
-        raise SandboxError("Modal sandbox provider is not implemented")
+        if not isinstance(request.source, ImageSource):
+            raise SandboxError(f"Modal sandbox provider does not support source type {request.source.type!r}")
+        client, app = await self._connect()
+        image = Image.from_registry(request.source.image)  # pyright: ignore[reportUnknownMemberType]
+        create_kwargs: dict[str, Any] = {
+            "app": app,
+            "name": request.name,
+            "image": image,
+            "env": dict(request.env_vars),
+            "tags": request.labels,
+            "cpu": float(request.resources.vcpu),
+            "memory": request.resources.memory * 1024,
+            "idle_timeout": request.auto_stop_interval * 60 if request.auto_stop_interval else None,
+            "timeout": _MAX_LIFETIME_SECONDS,
+            "client": client,
+        }
+        if request.resources.enable_docker:
+            create_kwargs["experimental_options"] = {"enable_docker": True}
+
+        try:
+            # Modal sandboxes have no disk parameter; request.resources.disk is
+            # accepted but not enforced. memory is MiB, cpu is fractional cores.
+            inner = await asyncio.wait_for(
+                ModalSdkSandbox.create.aio(  # pyright: ignore[reportUnknownMemberType]
+                    *_KEEPALIVE,
+                    **create_kwargs,
+                ),
+                timeout=request.create_timeout,
+            )
+        except TimeoutError as exc:
+            raise SandboxError(f"Failed to create Modal sandbox within {request.create_timeout}s") from exc
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+        return ModalSandbox(inner, name=request.name)
 
+    @_PROVIDER_RETRY
     async def get_sandbox(self, instance_id: str) -> Sandbox:
-        raise SandboxError("Modal sandbox provider is not implemented")
+        client, _ = await self._connect()
+        try:
+            inner = await ModalSdkSandbox.from_id.aio(instance_id, client=client)
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
+        return ModalSandbox(inner)
 
+    @_PROVIDER_RETRY
     async def delete_sandbox(self, instance_id: str) -> None:
-        raise SandboxError("Modal sandbox provider is not implemented")
+        client, _ = await self._connect()
+        try:
+            inner = await ModalSdkSandbox.from_id.aio(instance_id, client=client)
+            await inner.terminate.aio()
+        except ModalNotFoundError:
+            return
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
 
     async def list_sandboxes(self, query: SandboxQuery) -> AsyncGenerator[Sandbox, None]:
-        raise SandboxError("Modal sandbox provider is not implemented")
-        yield
+        for inner in await self._list_sandboxes(query):
+            yield ModalSandbox(inner)
+
+    @_PROVIDER_RETRY
+    async def _list_sandboxes(self, query: SandboxQuery) -> list[ModalSdkSandbox]:
+        client, app = await self._connect()
+        try:
+            return [
+                inner
+                async for inner in ModalSdkSandbox.list.aio(app_id=app.app_id, tags=query.labels or None, client=client)
+            ]
+        except ModalError as exc:
+            raise _sandbox_error(exc) from exc
 
     async def close(self) -> None:
-        pass
+        # Drop the cached handles first so a failing close cannot leave a half-closed client to be reused.
+        client, self._client, self._app = self._client, None, None
+        if client is not None:
+            await client._close.aio()  # pyright: ignore[reportPrivateUsage, reportUnknownMemberType]
diff --git a/src/benchmark_service/sandbox/types.py b/src/benchmark_service/sandbox/types.py
@@ -24,6 +24,15 @@ class Resources(BaseModel):
     vcpu: int = Field(description="Logical sandbox CPU count")
     memory: int = Field(description="Sandbox memory")
     disk: int = Field(description="Sandbox ephemeral disk")
+    enable_docker: bool = Field(
+        default=False,
+        description=(
+            "Request a sandbox that permits nested Docker (Docker-in-Docker). "
+            "The provider grants the capability; starting dockerd and running "
+            "containers is the benchmark service's job. Providers without "
+            "nested-Docker support ignore it."
+        ),
+    )
 
 
 class SandboxCreateRequest(BaseModel):

diff --git a/tests/test_app.py b/tests/test_app.py
@@ -248,6 +248,7 @@ def test_websocket_setup_task_resolves_sandbox_provider(
     Test cases:
     - A provider config object in the request body creates that provider directly.
     """
+
     def create_provider(_config: ModalProviderConfig | DaytonaProviderConfig) -> SandboxProvider:
         return ProviderSelectionProvider()
 
@@ -260,7 +261,17 @@ async def setup_task(self, task_id: str, sandbox: Sandbox, dataset: str | None =
 
     with TestClient(BenchmarkServiceApp(RuntimeProviderBenchmark)) as c:
         with c.websocket_connect("/ws/setup-task") as ws:
-            ws.send_json({"task_id": "task-1", "instance_id": "i-1", "sandbox_provider": {"type": "modal"}})
+            ws.send_json(
+                {
+                    "task_id": "task-1",
+                    "instance_id": "i-1",
+                    "sandbox_provider": {
+                        "type": "modal",
+                        "MODAL_TOKEN_ID": "id",
+                        "MODAL_TOKEN_SECRET": "secret",
+                    },
+                }
+            )
             assert ws.receive_json() == {
                 "type": "result",
                 "data": {"task_id": "task-1", "sandbox_name": "selected-sandbox-name"},
@@ -275,6 +286,7 @@ def test_websocket_setup_task_falls_back_to_header_provider_config(
     Test cases:
     - Daytona headers create the provider when the request body has no provider config.
     """
+
     def create_provider(_config: DaytonaProviderConfig) -> SandboxProvider:
         return ProviderSelectionProvider()
 
@@ -513,7 +525,16 @@ def test_setup_task_ws_close_for_disallowed_dataset(auth_client: TestClient) ->
                 headers={"x-descope-api-key": "key-acme"},
             ) as ws:
                 ws.send_json(
-                    {"task_id": "task-1", "instance_id": "i-1", "sandbox_provider": {"type": "modal"}, "dataset": "alt"}
+                    {
+                        "task_id": "task-1",
+                        "instance_id": "i-1",
+                        "sandbox_provider": {
+                            "type": "modal",
+                            "MODAL_TOKEN_ID": "id",
+                            "MODAL_TOKEN_SECRET": "secret",
+                        },
+                        "dataset": "alt",
+                    }
                 )
                 ws.receive_json()
     assert exc_info.value.code == 1008