diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 3bf8bd7..9c1d7b1 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -37,6 +37,7 @@ jobs: outputs: changed-apps: ${{ steps.changed-apps.outputs.changed_files }} changed-bases: ${{ steps.changed-bases.outputs.changed_files }} + changed-bases-runtime: ${{ steps.filter-runtime.outputs.bases }} steps: - name: Get Changed Apps id: changed-apps @@ -54,10 +55,39 @@ jobs: include_only_directories: true max_depth: 1 + - name: Checkout (for runtime filter) + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false + + - name: Filter Bases With Runtime Variant + id: filter-runtime + env: + CHANGED_BASES: ${{ steps.changed-bases.outputs.changed_files }} + DISPATCH_TYPE: ${{ github.event_name == 'workflow_dispatch' && inputs.type || '' }} + DISPATCH_IMAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.image || '' }} + shell: bash + run: | + set -euo pipefail + if [[ "$DISPATCH_TYPE" == "base" ]]; then + candidates=$(jq -nc --arg b "$DISPATCH_IMAGE" '[$b]') + else + candidates="${CHANGED_BASES:-[]}" + fi + bases='[]' + for b in $(echo "$candidates" | jq -r '.[]'); do + if [[ -f "base/$b/Dockerfile.runtime" ]]; then + bases=$(echo "$bases" | jq --arg b "$b" '. + [$b]') + fi + done + echo "Runtime-variant bases: $bases" + echo "bases=$bases" >> "$GITHUB_OUTPUT" + # Build base images: devel first, then runtime (runtime pulls from published devel) - # Runtime variant - depends on devel being published first + # Runtime variant - depends on devel being published first. + # Only runs for bases that have a Dockerfile.runtime; devel-only bases are skipped. build-bases: - if: ${{ always() && !failure() && !cancelled() && (needs.prepare.outputs.changed-bases != '[]' || (github.event_name == 'workflow_dispatch' && inputs.type == 'base')) }} + if: ${{ always() && !failure() && !cancelled() && needs.prepare.outputs.changed-bases-runtime != '[]' }} name: Build Base ${{ matrix.base }} needs: ["prepare", "build-bases-devel"] uses: ./.github/workflows/image-builder.yaml @@ -70,7 +100,7 @@ jobs: secrets: inherit strategy: matrix: - base: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.image)) || fromJSON(needs.prepare.outputs.changed-bases) }} + base: ${{ fromJSON(needs.prepare.outputs.changed-bases-runtime) }} fail-fast: false with: image: ${{ matrix.base }} diff --git a/base/pytorch/.dockerignore b/base/pytorch/.dockerignore new file mode 100644 index 0000000..1bb765c --- /dev/null +++ b/base/pytorch/.dockerignore @@ -0,0 +1,5 @@ +# Ignore everything except the files we explicitly need in the build context. +* + +!Dockerfile +!docker-bake.hcl diff --git a/base/pytorch/Dockerfile b/base/pytorch/Dockerfile new file mode 100644 index 0000000..ce6d47d --- /dev/null +++ b/base/pytorch/Dockerfile @@ -0,0 +1,80 @@ +# CUDA 13.0 + PyTorch 2.11 + Python 3.13 base image. +# +# Slim PyTorch foundation. uv-managed Python; venv at /opt/venv. +# Devel variant only — provides nvcc, NVRTC, CUPTI, cuDNN headers for +# downstream apps that compile CUDA extensions. 
+# +# Build locally: +# docker buildx bake image-devel-local +# +# Tags: pytorch:cuda13.0-torch2.11-devel, pytorch:devel, pytorch:latest + +ARG CUDA_VERSION="13.0.3" +ARG CUDA_DISTRO="ubuntu24.04" +ARG UV_VERSION="0.11.8" + +FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${CUDA_DISTRO} + +ARG PYTHON_VERSION="3.13" +ARG TORCH_VERSION="2.11.0" +ARG TORCHVISION_VERSION="0.26.0" +ARG TORCHAUDIO_VERSION="2.11.0" +ARG XFORMERS_VERSION="0.0.35" +ARG TRITON_VERSION="3.6.0" +ARG TORCH_INDEX="https://download.pytorch.org/whl/cu130" + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + CUDA_HOME=/usr/local/cuda \ + CPATH=/usr/local/cuda/include \ + TORCH_CUDA_ARCH_LIST="12.0" \ + UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + UV_HTTP_TIMEOUT=300 \ + UV_PYTHON_INSTALL_DIR=/opt/python \ + UV_PYTHON_PREFERENCE=only-managed \ + VIRTUAL_ENV=/opt/venv \ + PATH=/usr/local/cuda/bin:/opt/venv/bin:${PATH} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential ninja-build git curl ca-certificates tini && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +COPY --from=uv /uv /uvx /usr/local/bin/ + +RUN uv python install ${PYTHON_VERSION} && \ + uv venv /opt/venv --python ${PYTHON_VERSION} + +# Single resolve across all packages — internally consistent pins. +# `--index-strategy unsafe-best-match` makes resolution deterministic across +# the cu130 index + PyPI mix. +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install \ + --index-url ${TORCH_INDEX} \ + --extra-index-url https://pypi.org/simple \ + --index-strategy unsafe-best-match \ + torch==${TORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + xformers==${XFORMERS_VERSION} \ + triton==${TRITON_VERSION} \ + accelerate numpy safetensors nvidia-ml-py \ + sympy packaging pybind11 ninja psutil wheel + +# Constraints file for downstream apps to inherit pins (torch ecosystem +# plus the nvidia-* transitive deps and the cuda-toolkit umbrella). +RUN uv pip freeze | grep -E \ + "^(torch|torchvision|torchaudio|xformers|triton|numpy|safetensors|accelerate|nvidia-|cuda-toolkit)==" \ + > /constraints.txt && \ + echo "Constraints:" && cat /constraints.txt + +# Build-time validation. torch.cuda.is_available() requires --gpus all and is +# intentionally not checked here — only that torch was built against CUDA and +# that xformers' py3-none wheel imports against torch's C++ ABI on cp313. 
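+# A GPU-backed check can still be run manually after the build (not at build
+# time), assuming a host with the NVIDIA container toolkit:
+#   docker run --rm --gpus all pytorch:latest \
+#     python -c "import torch; assert torch.cuda.is_available()"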
+RUN python -c "import torch; print(f'PyTorch {torch.__version__} CUDA {torch.version.cuda}'); assert torch.version.cuda is not None" && \ + python -c "import xformers; print(f'xformers {xformers.__version__}')" + +ENTRYPOINT ["/usr/bin/tini", "--"] diff --git a/base/pytorch/docker-bake.hcl b/base/pytorch/docker-bake.hcl new file mode 100644 index 0000000..eb3b09a --- /dev/null +++ b/base/pytorch/docker-bake.hcl @@ -0,0 +1,41 @@ +target "docker-metadata-action" {} + +variable "APP" { + default = "pytorch" +} + +variable "VERSION" { + // Format: cuda{CUDA_VERSION}-torch{TORCH_VERSION} + default = "cuda13.0-torch2.11" +} + +variable "SOURCE" { + default = "https://github.com/arsac/containers" +} + +variable "VENDOR" { + default = "arsac" +} + +group "default" { + targets = ["image-devel-local"] +} + +target "image-devel" { + inherits = ["docker-metadata-action"] + dockerfile = "Dockerfile" + labels = { + "org.opencontainers.image.source" = "${SOURCE}" + } +} + +target "image-devel-local" { + inherits = ["image-devel"] + output = ["type=docker"] + tags = ["${APP}:${VERSION}-devel", "${APP}:devel", "${APP}:latest"] +} + +target "image-devel-all" { + inherits = ["image-devel"] + platforms = ["linux/amd64"] +} diff --git a/docs/superpowers/plans/2026-04-28-pytorch-base-image.md b/docs/superpowers/plans/2026-04-28-pytorch-base-image.md new file mode 100644 index 0000000..8b0351b --- /dev/null +++ b/docs/superpowers/plans/2026-04-28-pytorch-base-image.md @@ -0,0 +1,716 @@ +# `base/pytorch` Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Create a new slim PyTorch base image at `base/pytorch/` (CUDA 13.0, Python 3.13, PyTorch 2.11.0, uv-managed Python and venv), parallel to the existing `base/cuda-ml/` but devel-only, smaller, and minimal. + +**Architecture:** Single-stage Dockerfile `FROM nvidia/cuda:13.0.3-cudnn-devel-ubuntu24.04`. uv installs Python 3.13 (managed standalone build) to `/opt/python`, creates a venv at `/opt/venv`, and runs a single resolve installing torch + torchvision + torchaudio + xformers + triton from the cu130 PyTorch index plus PyPI for transitive `nvidia-*-cu13` deps. The existing CI workflow assumes every base image has both a devel and a runtime variant; we modify `release.yaml` to gate the runtime build job on the existence of a `Dockerfile.runtime` so that this devel-only image doesn't break the matrix. + +**Tech Stack:** Docker buildx + bake (HCL), uv 0.11.8, Python 3.13, PyTorch 2.11.0+cu130, NVIDIA CUDA 13.0.3 cudnn-devel base image, GitHub Actions. 
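+A usage sketch (not one of this plan's tasks): downstream app layers consume the image and its `/constraints.txt` roughly as below. The `transformers` install is purely illustrative, and the tag assumes the local bake build from Task 3.
+
+```bash
+# hypothetical downstream layer: extend the image and inherit its torch pins
+docker build -t myapp - <<'EOF'
+FROM pytorch:cuda13.0-torch2.11-devel
+RUN uv pip install -c /constraints.txt transformers
+EOF
+```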
+ +**Spec:** `docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md` + +--- + +## File structure + +**New files (under `base/pytorch/`):** +- `Dockerfile` — single-stage devel image +- `docker-bake.hcl` — buildx bake config with devel-only targets +- `.dockerignore` — minimal, ignore everything but Dockerfile and bake file + +**Modified files:** +- `.github/workflows/release.yaml` — add a `changed-bases-runtime` output to the `prepare` job that filters `changed-bases` to those that have a `Dockerfile.runtime`; switch the `build-bases` (runtime) job to use that filtered list + +**Files NOT touched:** +- `base/cuda-ml/*` — sibling image stays untouched +- `.github/workflows/image-builder.yaml` — no changes needed; the gate happens in `release.yaml` +- `.renovaterc.json5` — no annotations on the new bake/Dockerfile (matches `cuda-ml` sibling pattern) +- `build-push-local.sh` — does not work for this image (assumes a `builder` target that doesn't exist in a single-stage Dockerfile); local builds use `docker buildx bake image-devel-local` instead. Out of scope for this plan. + +--- + +### Task 1: Scaffold `base/pytorch/` directory with `.dockerignore` and `docker-bake.hcl` + +**Files:** +- Create: `base/pytorch/.dockerignore` +- Create: `base/pytorch/docker-bake.hcl` + +- [ ] **Step 1: Create the directory and `.dockerignore`** + +```bash +mkdir -p base/pytorch +``` + +Write `base/pytorch/.dockerignore` with: + +``` +# Ignore everything except the files we explicitly need in the build context. +* + +!Dockerfile +!docker-bake.hcl +``` + +- [ ] **Step 2: Create `base/pytorch/docker-bake.hcl`** + +Write `base/pytorch/docker-bake.hcl`: + +```hcl +target "docker-metadata-action" {} + +variable "APP" { + default = "pytorch" +} + +variable "VERSION" { + // Format: cuda{CUDA_VERSION}-torch{TORCH_VERSION} + default = "cuda13.0-torch2.11" +} + +variable "SOURCE" { + default = "https://github.com/arsac/containers" +} + +variable "VENDOR" { + default = "arsac" +} + +group "default" { + targets = ["image-devel-local"] +} + +target "image-devel" { + inherits = ["docker-metadata-action"] + dockerfile = "Dockerfile" + labels = { + "org.opencontainers.image.source" = "${SOURCE}" + } +} + +target "image-devel-local" { + inherits = ["image-devel"] + output = ["type=docker"] + tags = ["${APP}:${VERSION}-devel", "${APP}:devel", "${APP}:latest"] +} + +target "image-devel-all" { + inherits = ["image-devel"] + platforms = ["linux/amd64"] +} +``` + +- [ ] **Step 3: Verify bake config parses** + +Run: + +```bash +cd base/pytorch && docker buildx bake image-devel-local --print +``` + +Expected: prints a JSON structure containing the `image-devel-local` target with `tags` `["pytorch:cuda13.0-torch2.11-devel","pytorch:devel","pytorch:latest"]` and `dockerfile` `"Dockerfile"`. No error. + +If you see `target "image-devel-local" does not exist` or HCL parse errors, fix the bake file before continuing. + +- [ ] **Step 4: Verify `app-options` GitHub action would parse `VERSION` and `SOURCE`** + +Run from inside `base/pytorch/`: + +```bash +docker buildx bake --list type=variables,format=json --progress=quiet | jq -r '.[] | select(.name == "VERSION" or .name == "SOURCE") | "\(.name)=\(.value)"' +``` + +Expected output: + +``` +VERSION=cuda13.0-torch2.11 +SOURCE=https://github.com/arsac/containers +``` + +This is the exact extraction the CI's `.github/actions/app-options/action.yaml` does. If it fails, the CI integration will fail later. 
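+Task 1 Step 3's tag check can also be made mechanical rather than eyeballed (a sketch; the `.target` key layout matches `docker buildx bake --print` JSON output):
+
+```bash
+cd base/pytorch && docker buildx bake image-devel-local --print --progress=quiet | \
+  jq -e '.target."image-devel-local".tags == ["pytorch:cuda13.0-torch2.11-devel","pytorch:devel","pytorch:latest"]'
+```
+
+`jq -e` sets the exit code from the comparison, so the check fails loudly in scripts.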
+
+---
+
+### Task 2: Write `base/pytorch/Dockerfile`
+
+**Files:**
+- Create: `base/pytorch/Dockerfile`
+
+- [ ] **Step 1: Write the Dockerfile**
+
+Write `base/pytorch/Dockerfile`:
+
+```dockerfile
+# CUDA 13.0 + PyTorch 2.11 + Python 3.13 base image.
+#
+# Slim PyTorch foundation. uv-managed Python; venv at /opt/venv.
+# Devel variant only — provides nvcc, NVRTC, CUPTI, cuDNN headers for
+# downstream apps that compile CUDA extensions.
+#
+# Build locally:
+#   docker buildx bake image-devel-local
+#
+# Tags: pytorch:cuda13.0-torch2.11-devel, pytorch:devel, pytorch:latest
+
+ARG CUDA_VERSION="13.0.3"
+ARG CUDA_DISTRO="ubuntu24.04"
+ARG UV_VERSION="0.11.8"
+
+# Named uv stage so ${UV_VERSION} can be expanded — `COPY --from=<image>:${VAR}`
+# does not expand ARGs even when declared in global scope; only `FROM` does.
+FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv
+
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${CUDA_DISTRO}
+
+ARG PYTHON_VERSION="3.13"
+ARG TORCH_VERSION="2.11.0"
+ARG TORCHVISION_VERSION="0.26.0"
+ARG TORCHAUDIO_VERSION="2.11.0"
+ARG XFORMERS_VERSION="0.0.35"
+ARG TRITON_VERSION="3.6.0"
+ARG TORCH_INDEX="https://download.pytorch.org/whl/cu130"
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    CUDA_HOME=/usr/local/cuda \
+    CPATH=/usr/local/cuda/include \
+    TORCH_CUDA_ARCH_LIST="12.0" \
+    UV_COMPILE_BYTECODE=1 \
+    UV_LINK_MODE=copy \
+    UV_HTTP_TIMEOUT=300 \
+    UV_PYTHON_INSTALL_DIR=/opt/python \
+    UV_PYTHON_PREFERENCE=only-managed \
+    VIRTUAL_ENV=/opt/venv \
+    PATH=/usr/local/cuda/bin:/opt/venv/bin:${PATH}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential ninja-build git curl ca-certificates tini && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+COPY --from=uv /uv /uvx /usr/local/bin/
+
+RUN uv python install ${PYTHON_VERSION} && \
+    uv venv /opt/venv --python ${PYTHON_VERSION}
+
+# Single resolve across all packages — internally consistent pins.
+# `--index-strategy unsafe-best-match` makes resolution deterministic across
+# the cu130 index + PyPI mix.
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install \
+    --index-url ${TORCH_INDEX} \
+    --extra-index-url https://pypi.org/simple \
+    --index-strategy unsafe-best-match \
+    torch==${TORCH_VERSION} \
+    torchvision==${TORCHVISION_VERSION} \
+    torchaudio==${TORCHAUDIO_VERSION} \
+    xformers==${XFORMERS_VERSION} \
+    triton==${TRITON_VERSION} \
+    accelerate numpy safetensors nvidia-ml-py \
+    sympy packaging pybind11 ninja psutil wheel
+
+# Constraints file for downstream apps to inherit pins (torch ecosystem
+# plus the nvidia-* transitive deps and the cuda-toolkit umbrella).
+RUN uv pip freeze | grep -E \
+    "^(torch|torchvision|torchaudio|xformers|triton|numpy|safetensors|accelerate|nvidia-|cuda-toolkit)==" \
+    > /constraints.txt && \
+    echo "Constraints:" && cat /constraints.txt
+
+# Build-time validation. torch.cuda.is_available() requires --gpus all and is
+# intentionally not checked here — only that torch was built against CUDA and
+# that xformers' py3-none wheel imports against torch's C++ ABI on cp313.
+RUN python -c "import torch; print(f'PyTorch {torch.__version__} CUDA {torch.version.cuda}'); assert torch.version.cuda is not None" && \ + python -c "import xformers; print(f'xformers {xformers.__version__}')" + +ENTRYPOINT ["/usr/bin/tini", "--"] +``` + +- [ ] **Step 2: Lint the Dockerfile syntactically** + +Run: + +```bash +docker buildx build --check base/pytorch/ +``` + +Expected: no warnings other than possibly a hint about pinning the uv image to a digest. If you see syntax errors (e.g., bad heredoc, malformed `ENV`), fix them before building. + +--- + +### Task 3: Build the image locally and run smoke tests + +This task pulls ~6 GB of base image and downloads ~3 GB of wheels. First build can take 10-20 minutes; subsequent builds are cache-hot. + +**Files:** +- None (verification only) + +- [ ] **Step 1: Build the image** + +Run: + +```bash +cd base/pytorch && docker buildx bake image-devel-local +``` + +Expected: build completes, ending with three tags applied: +``` +=> => naming to docker.io/library/pytorch:cuda13.0-torch2.11-devel +=> => naming to docker.io/library/pytorch:devel +=> => naming to docker.io/library/pytorch:latest +``` + +If any of the in-build `RUN python -c "import torch"` or `import xformers` smoke tests fail, the build will halt with the failing assertion. Read the error and fix the spec/Dockerfile before proceeding — do **not** patch around a real ABI mismatch. + +- [ ] **Step 2: Verify Python version** + +Run: + +```bash +docker run --rm pytorch:latest python --version +``` + +Expected: `Python 3.13.x` (some patch version). + +- [ ] **Step 3: Verify tini entrypoint** + +Run: + +```bash +docker run --rm pytorch:latest /bin/sh -c 'echo $$; ps -o pid,comm 1' +``` + +Expected: PID 1 is `tini` (or `tini-static`); the shell is PID 2 or higher. This confirms `ENTRYPOINT ["/usr/bin/tini", "--"]` is wired up. + +- [ ] **Step 4: Verify nvcc and NVRTC are available** + +Run: + +```bash +docker run --rm pytorch:latest /bin/sh -c 'nvcc --version && ls /usr/local/cuda/lib64/libnvrtc.so*' +``` + +Expected: prints CUDA 13.0.x release info and lists at least `libnvrtc.so` and `libnvrtc.so.13`. + +- [ ] **Step 5: Verify torch imports and CUDA libs are wired up** + +Run: + +```bash +docker run --rm pytorch:latest python -c " +import torch +print(f'torch {torch.__version__} CUDA {torch.version.cuda} cuDNN {torch.backends.cudnn.version()}') +import ctypes, os +nvidia_dir = os.path.join(os.path.dirname(torch.__file__), '..', 'nvidia') +print(f'nvidia/ libs dir: {os.path.realpath(nvidia_dir)}') +print(f'cusparselt present: {os.path.isdir(os.path.join(nvidia_dir, \"cusparselt\"))}') +print(f'nvshmem present: {os.path.isdir(os.path.join(nvidia_dir, \"nvshmem\"))}') +" +``` + +Expected: +- `torch 2.11.0+cu130 CUDA 13.0` (or similar) +- `cuDNN ...` prints a non-`None` integer (e.g., `91900` for 9.19.0) +- `cusparselt present: True` +- `nvshmem present: True` + +If any of these are False or `None`, torch is missing a transitive nvidia-* package; re-check the cu130 wheel METADATA against the spec's "Validated requirements" table. 
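+To cross-check those declared dependencies from inside the image, `importlib.metadata` can list torch's `Requires-Dist` entries (a sketch; filtered to the `nvidia-` prefix):
+
+```bash
+docker run --rm pytorch:latest python -c "
+from importlib.metadata import requires
+print('\n'.join(r for r in requires('torch') if r.startswith('nvidia-')))"
+```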
+ +- [ ] **Step 6: Verify the constraints file** + +Run: + +```bash +docker run --rm pytorch:latest cat /constraints.txt +``` + +Expected: lines for at least: +``` +torch==2.11.0+cu130 +torchvision==0.26.0+cu130 +torchaudio==2.11.0+cu130 +xformers==0.0.35 +triton==3.6.0 +nvidia-cudnn-cu13==9.19.0.56 +nvidia-cusparselt-cu13==0.8.0 +nvidia-nvshmem-cu13==3.4.5 +nvidia-nccl-cu13==2.28.9 +cuda-toolkit==13.0.2 +numpy== +safetensors== +accelerate== +``` + +Plus other `nvidia-*-cu13` transitive deps. If any of the explicit pins above are missing, the build's regex grep is wrong. + +- [ ] **Step 7: Verify image size is in the expected range** + +Run: + +```bash +docker images pytorch:latest --format '{{.Size}}' +``` + +Expected: roughly 12-14 GB. If it's under 8 GB you've likely lost the cuDNN base or the torch wheel; if it's over 20 GB, an apt cache layer wasn't cleaned. + +- [ ] **Step 8: Tag with the ghcr-ready name and verify** + +Run: + +```bash +docker tag pytorch:latest ghcr.io/arsac/pytorch:cuda13.0-torch2.11 +docker images ghcr.io/arsac/pytorch +``` + +Expected: shows the new tag pointing at the same image ID as `pytorch:latest`. (Don't push yet — that's CI's job once the PR merges.) + +--- + +### Task 4: Modify `.github/workflows/release.yaml` to gate runtime builds on `Dockerfile.runtime` existence + +**Files:** +- Modify: `.github/workflows/release.yaml` + +The current `prepare` job emits `changed-bases` (a JSON array of changed base directory names). The `build-bases` job (which builds runtime variants) iterates that list as a matrix. For `base/pytorch/`, no `Dockerfile.runtime` exists, so the runtime build would fail. + +**Fix:** add a new step in `prepare` that filters `changed-bases` to only those with a `Dockerfile.runtime`, exposed as `changed-bases-runtime`. Switch `build-bases` to consume that filtered output. + +- [ ] **Step 1: Read the current `prepare` job** + +Open `.github/workflows/release.yaml` and locate the `prepare` job (around line 30-55 in the current file). Note its existing `outputs` block: + +```yaml +outputs: + changed-apps: ${{ steps.changed-apps.outputs.changed_files }} + changed-bases: ${{ steps.changed-bases.outputs.changed_files }} +``` + +- [ ] **Step 2: Add a `changed-bases-runtime` output and a filter step** + +Edit the `prepare` job. After the `Get Changed Bases` step, add a checkout step (so the filter can read `base/*/Dockerfile.runtime`) and a filter step. Update the `outputs` block. 
+ +Replace: + +```yaml + prepare: + name: Prepare + runs-on: ubuntu-latest + outputs: + changed-apps: ${{ steps.changed-apps.outputs.changed_files }} + changed-bases: ${{ steps.changed-bases.outputs.changed_files }} + steps: + - name: Get Changed Apps + id: changed-apps + uses: bjw-s-labs/action-changed-files@930cef8463348e168cab7235c47fe95a7a235f65 # v0.3.3 + with: + path: apps + include_only_directories: true + max_depth: 1 + + - name: Get Changed Bases + id: changed-bases + uses: bjw-s-labs/action-changed-files@930cef8463348e168cab7235c47fe95a7a235f65 # v0.3.3 + with: + path: base + include_only_directories: true + max_depth: 1 +``` + +with: + +```yaml + prepare: + name: Prepare + runs-on: ubuntu-latest + outputs: + changed-apps: ${{ steps.changed-apps.outputs.changed_files }} + changed-bases: ${{ steps.changed-bases.outputs.changed_files }} + changed-bases-runtime: ${{ steps.filter-runtime.outputs.bases }} + steps: + - name: Get Changed Apps + id: changed-apps + uses: bjw-s-labs/action-changed-files@930cef8463348e168cab7235c47fe95a7a235f65 # v0.3.3 + with: + path: apps + include_only_directories: true + max_depth: 1 + + - name: Get Changed Bases + id: changed-bases + uses: bjw-s-labs/action-changed-files@930cef8463348e168cab7235c47fe95a7a235f65 # v0.3.3 + with: + path: base + include_only_directories: true + max_depth: 1 + + - name: Checkout (for runtime filter) + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false + + - name: Filter Bases With Runtime Variant + id: filter-runtime + env: + CHANGED_BASES: ${{ steps.changed-bases.outputs.changed_files }} + DISPATCH_TYPE: ${{ github.event_name == 'workflow_dispatch' && inputs.type || '' }} + DISPATCH_IMAGE: ${{ github.event_name == 'workflow_dispatch' && inputs.image || '' }} + shell: bash + run: | + set -euo pipefail + if [[ "$DISPATCH_TYPE" == "base" ]]; then + candidates=$(jq -nc --arg b "$DISPATCH_IMAGE" '[$b]') + else + candidates="${CHANGED_BASES:-[]}" + fi + bases='[]' + for b in $(echo "$candidates" | jq -r '.[]'); do + if [[ -f "base/$b/Dockerfile.runtime" ]]; then + bases=$(echo "$bases" | jq --arg b "$b" '. + [$b]') + fi + done + echo "Runtime-variant bases: $bases" + echo "bases=$bases" >> "$GITHUB_OUTPUT" +``` + +- [ ] **Step 3: Switch `build-bases` to consume the filtered output** + +Locate the `build-bases` job. 
Replace its `if:` and `strategy.matrix.base`: + +Replace: + +```yaml + build-bases: + if: ${{ always() && !failure() && !cancelled() && (needs.prepare.outputs.changed-bases != '[]' || (github.event_name == 'workflow_dispatch' && inputs.type == 'base')) }} + name: Build Base ${{ matrix.base }} + needs: ["prepare", "build-bases-devel"] + uses: ./.github/workflows/image-builder.yaml + permissions: + attestations: write + contents: write + id-token: write + packages: write + security-events: write + secrets: inherit + strategy: + matrix: + base: ${{ github.event_name == 'workflow_dispatch' && fromJSON(format('["{0}"]', inputs.image)) || fromJSON(needs.prepare.outputs.changed-bases) }} + fail-fast: false + with: + image: ${{ matrix.base }} + path: base + release: ${{ github.event_name == 'workflow_dispatch' && inputs.release || github.event_name == 'push' }} +``` + +with: + +```yaml + build-bases: + if: ${{ always() && !failure() && !cancelled() && needs.prepare.outputs.changed-bases-runtime != '[]' }} + name: Build Base ${{ matrix.base }} + needs: ["prepare", "build-bases-devel"] + uses: ./.github/workflows/image-builder.yaml + permissions: + attestations: write + contents: write + id-token: write + packages: write + security-events: write + secrets: inherit + strategy: + matrix: + base: ${{ fromJSON(needs.prepare.outputs.changed-bases-runtime) }} + fail-fast: false + with: + image: ${{ matrix.base }} + path: base + release: ${{ github.event_name == 'workflow_dispatch' && inputs.release || github.event_name == 'push' }} +``` + +The `build-bases-devel` job stays unchanged — every base still needs a devel build. + +The downstream `build-apps` and `status` jobs both reference `build-bases` in `needs:`. When the runtime matrix is empty, `build-bases` resolves to `skipped`. The existing `status` job's checks (`contains(needs.*.result, 'failure')`) treat `skipped` as not-failure, so it still passes. No further changes required to those jobs. + +- [ ] **Step 4: Verify YAML parses** + +Run: + +```bash +python3 -c "import yaml; yaml.safe_load(open('.github/workflows/release.yaml'))" +``` + +Expected: no output (parses cleanly). If it errors with a `yaml.scanner.ScannerError` or `yaml.parser.ParserError`, the indentation or quoting is wrong. + +- [ ] **Step 5: Sanity-check the matrix-filter logic locally with mock data** + +Run: + +```bash +# Simulate: changed-bases=["cuda-ml","pytorch"], DISPATCH_TYPE empty +CHANGED_BASES='["cuda-ml","pytorch"]' +candidates="$CHANGED_BASES" +bases='[]' +for b in $(echo "$candidates" | jq -r '.[]'); do + if [[ -f "base/$b/Dockerfile.runtime" ]]; then + bases=$(echo "$bases" | jq --arg b "$b" '. + [$b]') + fi +done +echo "$bases" +``` + +Expected: `["cuda-ml"]` — `cuda-ml` has a `Dockerfile.runtime`, `pytorch` does not. + +If the output is `[]` or `["cuda-ml","pytorch"]`, the filter logic is wrong; re-check the `[[ -f ... ]]` test. 
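+The `workflow_dispatch` branch of the filter can be mocked the same way (a sketch; for the devel-only image the expected output is an empty list):
+
+```bash
+# Simulate: workflow_dispatch with type=base, image=pytorch
+DISPATCH_IMAGE='pytorch'
+candidates=$(jq -nc --arg b "$DISPATCH_IMAGE" '[$b]')
+bases='[]'
+for b in $(echo "$candidates" | jq -r '.[]'); do
+  if [[ -f "base/$b/Dockerfile.runtime" ]]; then
+    bases=$(echo "$bases" | jq --arg b "$b" '. + [$b]')
+  fi
+done
+echo "$bases" # expected: [] (pytorch has no Dockerfile.runtime)
+```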
+ +--- + +### Task 5: Commit all changes + +**Files:** +- All files from tasks 1, 2, and 4 + +- [ ] **Step 1: Review the diff** + +Run: + +```bash +git status +git diff --stat +``` + +Expected file changes: + +- `base/pytorch/.dockerignore` (new) +- `base/pytorch/Dockerfile` (new) +- `base/pytorch/docker-bake.hcl` (new) +- `.github/workflows/release.yaml` (modified) +- `docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md` (new — already exists from brainstorming) +- `docs/superpowers/plans/2026-04-28-pytorch-base-image.md` (new — this file) + +- [ ] **Step 2: Stage and commit** + +Run: + +```bash +git checkout -b feat/base-pytorch +git add base/pytorch/Dockerfile base/pytorch/docker-bake.hcl base/pytorch/.dockerignore \ + .github/workflows/release.yaml \ + docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md \ + docs/superpowers/plans/2026-04-28-pytorch-base-image.md +git commit -m "$(cat <<'EOF' +feat(base): add slim pytorch image (cuda13.0, py3.13, torch2.11) + +New base/pytorch/ image: CUDA 13.0.3 + cuDNN devel + Python 3.13 (uv-managed) + +PyTorch 2.11.0+cu130 + xformers 0.0.35 + triton 3.6.0, plus the nvidia-* runtime +libraries torch dlopens at startup (cuSPARSELt, NVSHMEM, cuDNN, NCCL). +Devel-only — no Dockerfile.runtime, since downstream apps need nvcc/NVRTC/CUPTI. + +Also gates the release.yaml `build-bases` (runtime) job on the existence of a +Dockerfile.runtime in each changed base directory, so the new devel-only image +doesn't break the matrix. +EOF +)" +``` + +- [ ] **Step 3: Verify the commit looks right** + +Run: + +```bash +git log -1 --stat +``` + +Expected: shows the six file changes listed above and the commit message. + +--- + +### Task 6: Open the PR and verify CI + +**Files:** +- None (CI verification only) + +- [ ] **Step 1: Push the branch** + +Run: + +```bash +git push -u origin feat/base-pytorch +``` + +- [ ] **Step 2: Open the PR** + +Run: + +```bash +gh pr create --title "feat(base): add slim pytorch image (cuda13.0, py3.13, torch2.11)" --body "$(cat <<'EOF' +## Summary +- New `base/pytorch/` slim PyTorch base image: CUDA 13.0.3 + Python 3.13 (uv-managed) + PyTorch 2.11.0+cu130 + xformers + triton + utility deps. Devel-only (deployment target), no Dockerfile.runtime. +- Gates `release.yaml` runtime-build matrix on `Dockerfile.runtime` existence so devel-only images don't break the matrix. + +## Test plan +- [ ] `docker buildx bake image-devel-local --print` succeeds locally +- [ ] `docker buildx bake image-devel-local` builds successfully locally +- [ ] In-image: `python -c "import torch; assert torch.version.cuda is not None"` passes +- [ ] In-image: `python -c "import xformers"` passes +- [ ] In-image: `cat /constraints.txt` shows expected pins +- [ ] CI `Build Base pytorch (devel)` job succeeds and pushes `ghcr.io/arsac/pytorch:cuda13.0-torch2.11-devel` etc. +- [ ] CI `Build Base pytorch` (runtime) is skipped (no `Dockerfile.runtime`) +- [ ] CI `Build Base cuda-ml` (runtime) still runs and succeeds (regression check on the workflow change) + +Spec: `docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md` +Plan: `docs/superpowers/plans/2026-04-28-pytorch-base-image.md` +EOF +)" +``` + +- [ ] **Step 3: Watch CI** + +Run: + +```bash +gh pr checks --watch +``` + +Expected: + +- `Build Base pytorch (devel)` job runs and succeeds. This pushes `ghcr.io/arsac/pytorch:cuda13.0-torch2.11-devel`, `:devel`, plus semver-derived tags. 
+- `Build Base pytorch` (the runtime matrix) is **skipped** because `pytorch` is not in `changed-bases-runtime`. +- If `cuda-ml` is in the changed-bases list (it shouldn't be unless its files were touched), its `Build Base cuda-ml` runtime job should still run normally — confirms the workflow change didn't break the existing image. + +If the devel build fails: read the job logs, fix locally, push again. Do **not** disable the smoke tests inside the Dockerfile to "make CI pass" — the smoke tests catch real ABI/install issues. + +If the runtime job is *not* skipped for `pytorch`: the `changed-bases-runtime` filter or the `build-bases` `if:` conditional is wrong; re-check Task 4 Step 2 and Step 3. + +- [ ] **Step 4: Verify the published image is reachable** + +Once CI is green: + +```bash +docker pull ghcr.io/arsac/pytorch:cuda13.0-torch2.11-devel +docker run --rm ghcr.io/arsac/pytorch:cuda13.0-torch2.11-devel python -c "import torch, xformers; print(torch.__version__, xformers.__version__)" +``` + +Expected: `2.11.0+cu130 0.0.35`. + +- [ ] **Step 5: Merge** + +Run: + +```bash +gh pr merge --merge --auto +``` + +Or merge via GitHub UI per repo policy. Once merged, the published `:latest` tag will be pinned to the new build. + +--- + +## Self-review notes + +**Spec coverage:** +- Goals 1-9 (Python 3.13, CUDA 13.0, PyTorch 2.11 cu130, cuDNN, NVRTC, cuSPARSELt, NVSHMEM, uv, devel variant) — all satisfied by Task 2's Dockerfile and verified in Task 3 Steps 4-6. +- Architecture (single-stage, uv-managed Python, venv at /opt/venv, constraints file, ENTRYPOINT tini) — Task 2. +- File-by-file (Dockerfile + docker-bake.hcl + .dockerignore) — Tasks 1 and 2. +- Build & CI section's runtime-gating fix — Task 4. +- All five Risks have mitigations baked into the Dockerfile (in-build smoke tests for ABI; UV_VERSION pinned; UV_LINK_MODE=copy; constraints regex includes nvidia- and cuda-toolkit). + +**No placeholders detected.** Every step has concrete commands, code, or file paths. + +**Type/name consistency:** target names (`image-devel`, `image-devel-local`, `image-devel-all`), env var names (`UV_*`, `VIRTUAL_ENV`, `CPATH`), and ARG names match across Tasks 1, 2, and the spec. Output names (`changed-bases-runtime`) match between Task 4 Steps 2 and 3. diff --git a/docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md b/docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md new file mode 100644 index 0000000..32710c7 --- /dev/null +++ b/docs/superpowers/specs/2026-04-28-pytorch-base-image-design.md @@ -0,0 +1,278 @@ +# `base/pytorch` — slim PyTorch base image (CUDA 13.0, Python 3.13) + +**Date:** 2026-04-28 +**Status:** Design approved, ready for implementation plan + +## Overview + +A new sibling base image at `base/pytorch/`, parallel to the existing `base/cuda-ml/`. Provides a slim PyTorch foundation on CUDA 13.0 + Python 3.13 + PyTorch 2.11.0, intended as the deployment target for downstream apps that need a CUDA-13 dev environment with PyTorch and uv preinstalled. Apps build on top of this image rather than reinstalling torch. + +## Goals + +- Standalone, deployable image (devel variant only — see non-goals). +- uv-managed Python 3.13 (`UV_PYTHON_PREFERENCE=only-managed`). +- PyTorch 2.11.0 + torchvision 0.26.0 + torchaudio 2.11.0 + xformers 0.0.35 + triton 3.6.0, all from the cu130 PyTorch index. +- All NVIDIA runtime libraries that torch dlopens at startup are present (cuDNN, cuSPARSELt, NVSHMEM, NCCL, CUDA toolkit pip umbrella). 
+- Full devel toolchain available for downstream apps that compile CUDA extensions (nvcc, NVRTC, CUPTI, cuDNN headers). +- Common build/utility helpers (`accelerate`, `numpy`, `safetensors`, `nvidia-ml-py`, `sympy`, `packaging`, `pybind11`, `ninja`, `psutil`, `wheel`). +- Constraints file at `/constraints.txt` for downstream apps to pin against. + +## Non-goals + +- **No runtime (slim) variant.** `nvidia/cuda:*-runtime-*` images deliberately omit nvcc / NVRTC / CUPTI, and these are required for `torch.compile` / Triton JIT and for downstream apps that compile CUDA extensions. Ship only the devel image as the deployment target. +- **No flash-attn.** v2.8.x has no upstream cu13 wheels and multiple open build-failure issues against CUDA 13. v4 is alpha and on consumer Blackwell (SM 12.0) it falls back to SM80 kernels (~5% slower than FA2 in the only published benchmark, with open crash reports). Apps that need flash attention install per-app or use `torch.nn.functional.scaled_dot_product_attention` (cuDNN-backed FA-style kernels via PyTorch SDPA on Blackwell). +- **No heavy ML extras.** No xformers-source-builds, hunyuan3d, diso, nvdiffrast, sageattention, opencv, librosa, ffmpeg, etc. Those stay in `base/cuda-ml/` (cu128/py312/full stack) and apps that need them either use `cuda-ml` directly or install on top of `base/pytorch/`. +- **No `pyproject.toml` + `uv.lock`.** A base image isn't a project; the lockfile pretense (`package = false`) adds two committed files for what amounts to a version-pin list. Pins live in `Dockerfile` `ARG`s, matching the existing `base/cuda-ml/` pattern. +- **No `--generate-hashes` / wheel-hash reproducibility.** Not conventional for ML/AI base images at this scale (`pytorch/pytorch`, `nvidia/cuda`, the existing `cuda-ml` all use plain version pins). Version pin + immutable index covers the realistic threat model. + +## Validated requirements + +| # | Requirement | How satisfied | +|---|---|---| +| 1 | Python 3.13 | `uv python install 3.13` to `/opt/python`, managed standalone build | +| 2 | CUDA 13.0 | `nvidia/cuda:13.0.3-cudnn-devel-ubuntu24.04` (13.0.0 tag does not exist on Docker Hub; .3 is the latest patch) | +| 3 | PyTorch 2.11 cu130 | `torch==2.11.0` from `https://download.pytorch.org/whl/cu130`, plus matching torchvision 0.26.0 + torchaudio 2.11.0 + xformers 0.0.35 + triton 3.6.0 | +| 4 | cuDNN | `cudnn-devel` base image variant ships system cuDNN at `/usr/lib/x86_64-linux-gnu/`. Torch additionally pulls `nvidia-cudnn-cu13==9.19.0.56` as a transitive dep — torch uses the wheel-bundled version at runtime | +| 5 | NVRTC | Present in `cuda-13-0` toolkit installed by the `*-devel-*` image. Torch also transitively pulls `cuda-toolkit[...nvrtc]==13.0.2` (PyPI umbrella) | +| 6 | cuSPARSELt | Torch declares `Requires-Dist: nvidia-cusparselt-cu13==0.8.0; platform_system == "Linux"`, auto-installed. Ships `libcusparseLt.so.0` at `site-packages/nvidia/cusparselt/lib/`. Torch's `__init__.py` adds the path to the loader | +| 7 | NVSHMEM | Same pattern: `Requires-Dist: nvidia-nvshmem-cu13==3.4.5; platform_system == "Linux"`. Ships `libnvshmem_host.so.3` at `site-packages/nvidia/nvshmem/lib/` | +| 8 | uv | `COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /usr/local/bin/`. Python managed by uv (`UV_PYTHON_PREFERENCE=only-managed`, `UV_PYTHON_INSTALL_DIR=/opt/python`); venv created by uv at `/opt/venv`; packages installed via `uv pip install --python /opt/venv/bin/python` | +| 9 | devel variant | `cudnn-devel` base includes nvcc, NVRTC, CUPTI, cuDNN headers, full toolkit. 
No runtime variant ships |
+
+## Architecture
+
+```
+base/pytorch/
+├── Dockerfile        # devel image, deployment target
+├── docker-bake.hcl   # build config + tags
+└── .dockerignore     # ignore everything but the Dockerfile and bake file
+```
+
+**Single-stage Dockerfile** built `FROM nvidia/cuda:13.0.3-cudnn-devel-ubuntu24.04`. uv installs Python 3.13 to `/opt/python`, creates a venv at `/opt/venv`, and installs torch + ecosystem from the cu130 index plus PyPI for the nvidia-* transitive deps. `/opt/venv/bin` is at the front of `PATH` so `python`, `pip` (via uv), and tools resolve correctly without explicit activation in `RUN`/`CMD` layers or `docker run`. Constraints file written to `/constraints.txt` for downstream apps to pin against.
+
+### Image contents
+
+**System packages (apt):** `build-essential`, `ninja-build`, `git`, `curl`, `ca-certificates`, `tini`. Plus the `cuda-13-0` toolkit + `libcudnn9-cuda-13` already provided by the base image.
+
+**Python packages (cu130 index):**
+- `torch==2.11.0`
+- `torchvision==0.26.0`
+- `torchaudio==2.11.0`
+- `xformers==0.0.35` (py3-none wheel on the cu130 index, see Risks; requires torch≥2.10 ✓)
+- `triton==3.6.0` (only on the PyTorch index — not on PyPI)
+
+**Python packages (PyPI, pulled transitively by torch):**
+- `nvidia-cudnn-cu13==9.19.0.56`
+- `nvidia-cusparselt-cu13==0.8.0`
+- `nvidia-nvshmem-cu13==3.4.5`
+- `nvidia-nccl-cu13==2.28.9`
+- `cuda-toolkit==13.0.2` (umbrella package: cublas, cudart, cufft, cufile, cupti, curand, cusolver, cusparse, nvjitlink, nvrtc, nvtx)
+
+**Python packages (PyPI, explicit utility deps):**
+- `accelerate`, `numpy`, `safetensors`, `nvidia-ml-py`, `sympy`, `packaging`, `pybind11`, `ninja`, `psutil`, `wheel`
+
+### Environment
+
+```
+DEBIAN_FRONTEND=noninteractive
+PYTHONDONTWRITEBYTECODE=1
+PYTHONUNBUFFERED=1
+CUDA_HOME=/usr/local/cuda
+CPATH=/usr/local/cuda/include   # broader than CPLUS_INCLUDE_PATH; covers C and C++
+TORCH_CUDA_ARCH_LIST="12.0"     # consumer Blackwell (RTX 5090, RTX PRO 6000 Workstation)
+UV_COMPILE_BYTECODE=1
+UV_LINK_MODE=copy               # load-bearing — see Risks
+UV_HTTP_TIMEOUT=300
+UV_PYTHON_INSTALL_DIR=/opt/python
+UV_PYTHON_PREFERENCE=only-managed
+VIRTUAL_ENV=/opt/venv           # directs uv pip to the venv without --python
+PATH=/usr/local/cuda/bin:/opt/venv/bin:${PATH}
+```
+
+### Constraints file
+
+Written by `uv pip freeze` (auto-targets `/opt/venv` via `VIRTUAL_ENV`) filtered through `grep -E` for the regex:
+
+```
+^(torch|torchvision|torchaudio|xformers|triton|numpy|safetensors|accelerate|nvidia-|cuda-toolkit)==
+```
+
+Wider than the existing `cuda-ml/constraints.txt` regex — adds `nvidia-` and `cuda-toolkit` so downstream apps inherit the locked NVIDIA library pins and don't accidentally upgrade `nvidia-cusparselt-cu13` or `cuda-toolkit` out from under torch.
+
+## File-by-file design
+
+### `base/pytorch/Dockerfile`
+
+```dockerfile
+ARG CUDA_VERSION="13.0.3"
+ARG CUDA_DISTRO="ubuntu24.04"
+ARG UV_VERSION="0.11.8"
+
+# Named uv stage so ${UV_VERSION} can be expanded — `COPY --from=<image>:${VAR}`
+# does not expand ARGs even when they're declared in global scope; only `FROM`
+# does. The named-stage indirection is the canonical Docker workaround.
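+# Concretely: `COPY --from=uv` (the named stage declared below) works, while
+# `COPY --from=ghcr.io/astral-sh/uv:${UV_VERSION}` would see the ARG unexpanded.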
+FROM ghcr.io/astral-sh/uv:${UV_VERSION} AS uv + +FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-${CUDA_DISTRO} + +ARG PYTHON_VERSION="3.13" +ARG TORCH_VERSION="2.11.0" +ARG TORCHVISION_VERSION="0.26.0" +ARG TORCHAUDIO_VERSION="2.11.0" +ARG XFORMERS_VERSION="0.0.35" +ARG TRITON_VERSION="3.6.0" +ARG TORCH_INDEX="https://download.pytorch.org/whl/cu130" + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + CUDA_HOME=/usr/local/cuda \ + CPATH=/usr/local/cuda/include \ + TORCH_CUDA_ARCH_LIST="12.0" \ + UV_COMPILE_BYTECODE=1 \ + UV_LINK_MODE=copy \ + UV_HTTP_TIMEOUT=300 \ + UV_PYTHON_INSTALL_DIR=/opt/python \ + UV_PYTHON_PREFERENCE=only-managed \ + VIRTUAL_ENV=/opt/venv \ + PATH=/usr/local/cuda/bin:/opt/venv/bin:${PATH} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential ninja-build git curl ca-certificates tini && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +COPY --from=uv /uv /uvx /usr/local/bin/ + +RUN uv python install ${PYTHON_VERSION} && \ + uv venv /opt/venv --python ${PYTHON_VERSION} + +# Single resolve across all packages — internally consistent pins. +# `--index-strategy unsafe-best-match` makes resolution deterministic across the +# cu130 index + PyPI mix (default `first-index` stops at the first index that has +# a candidate, which is brittle when transitive deps live on PyPI only). +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install \ + --index-url ${TORCH_INDEX} \ + --extra-index-url https://pypi.org/simple \ + --index-strategy unsafe-best-match \ + torch==${TORCH_VERSION} \ + torchvision==${TORCHVISION_VERSION} \ + torchaudio==${TORCHAUDIO_VERSION} \ + xformers==${XFORMERS_VERSION} \ + triton==${TRITON_VERSION} \ + accelerate numpy safetensors nvidia-ml-py \ + sympy packaging pybind11 ninja psutil wheel + +RUN uv pip freeze | grep -E \ + "^(torch|torchvision|torchaudio|xformers|triton|numpy|safetensors|accelerate|nvidia-|cuda-toolkit)==" \ + > /constraints.txt && \ + echo "Constraints:" && cat /constraints.txt + +# Build-time validation. `torch.cuda.is_available()` requires `--gpus all` and is +# intentionally not checked here — only that torch was built against CUDA and that +# xformers' py3-none wheel imports against torch's C++ ABI on cp313. +RUN python -c "import torch; print(f'PyTorch {torch.__version__} CUDA {torch.version.cuda}'); assert torch.version.cuda is not None" && \ + python -c "import xformers; print(f'xformers {xformers.__version__}')" + +ENTRYPOINT ["/usr/bin/tini", "--"] +``` + +### `base/pytorch/docker-bake.hcl` + +```hcl +target "docker-metadata-action" {} + +variable "APP" { + default = "pytorch" +} + +variable "VERSION" { + // Format: cuda{CUDA_VERSION}-torch{TORCH_VERSION} + default = "cuda13.0-torch2.11" +} + +variable "SOURCE" { + default = "https://github.com/arsac/containers" +} + +variable "VENDOR" { + default = "arsac" +} + +group "default" { + targets = ["image-devel-local"] +} + +target "image-devel" { + inherits = ["docker-metadata-action"] + dockerfile = "Dockerfile" + labels = { + "org.opencontainers.image.source" = "${SOURCE}" + } +} + +target "image-devel-local" { + inherits = ["image-devel"] + output = ["type=docker"] + tags = ["${APP}:${VERSION}-devel", "${APP}:devel", "${APP}:latest"] +} + +target "image-devel-all" { + inherits = ["image-devel"] + platforms = ["linux/amd64"] +} +``` + +Notes: +- Only devel targets exist (no `image` / `image-local` / `image-all` runtime targets). 
+- `image-devel-local` adds `${APP}:latest` so a freshly-built local image is reachable as `pytorch:latest` for downstream-app local builds.
+- `cudnn-devel` adds ~3.5 GB to the base layer vs. plain `devel`. Estimated final image ~12-14 GB. Acceptable since this is the deployment target and the CUDA libs are required.
+
+## Build & CI
+
+**Local build:**
+```
+cd base/pytorch
+docker buildx bake image-devel-local
+```
+
+**Root `build-push-local.sh`:** does not work for this image; it assumes a `builder` target that a single-stage Dockerfile doesn't define. Use `docker buildx bake image-devel-local` for local builds instead (see the implementation plan).
+
+**GitHub Actions:** `.github/workflows/release.yaml` auto-detects new directories under `base/**` via `bjw-s-labs/action-changed-files` and dispatches to `image-builder.yaml`. The existing `build-bases-devel` job builds devel targets; the `build-bases` job builds runtime targets.
+
+**Implication for CI:** the existing `release.yaml` calls `build-bases` (runtime) for every changed base. Since `base/pytorch/`'s bake file has no runtime targets, that job will fail at `docker buildx bake --print` for the missing target.
+
+**Resolution:** modify `.github/workflows/release.yaml` to gate the `build-bases` (runtime) job on the existence of a `Dockerfile.runtime` in the base directory. A small composite-action or inline check at the matrix level — e.g., a `runtime-bases` job output that filters `changed-bases` by whether `base/<name>/Dockerfile.runtime` exists — keeps the workflow declarative and avoids publishing duplicate-tag aliases (which would be misleading: `:runtime` consumers would receive the full 12-14 GB devel image). The earlier alternative (stub `image` targets in the bake file aliasing `:devel`) is rejected because publishing identical content under different tag suffixes hides the fact that this image has no runtime variant.
+
+**Renovate:** existing `.renovaterc.json5` regex manager scans for `# datasource=X depName=Y` annotations next to bake variables. The existing `base/cuda-ml/` doesn't use these (manual bumps). Match the pattern: no annotations on the new bake file — manual bumps via PR.
+
+## Risks & open questions
+
+**`cuda-toolkit` PyPI umbrella + system toolkit duplication.**
+The base image installs the CUDA 13 toolkit via apt (in `/usr/local/cuda`), and torch's transitive deps install the `cuda-toolkit==13.0.2` PyPI umbrella into `site-packages/nvidia/`. They coexist; torch loads from `site-packages/nvidia/`, while `nvcc` and CUDA headers come from `/usr/local/cuda`. Disk overhead ~1-2 GB. Eliminating it would require either (a) skipping the apt toolkit and relying entirely on the pip-installed one (loses nvcc, headers — fails req #5/#9), or (b) excluding the pip umbrella from torch's deps (would break torch). Accept the duplication; document in the Dockerfile.
+
+**Triton 3.6.0 not on PyPI.**
+PyPI's latest is triton 3.5.x; 3.6.0 only exists on the PyTorch index. Resolved deterministically by `--index-strategy unsafe-best-match` on `uv pip install`: uv considers all configured indexes and selects the best version match for each name, regardless of which index it was found on first. (Default `first-index` strategy stops at the first index with any candidate, which works coincidentally today because triton is on the torch index but is brittle.)
+
+**xformers 0.0.35 wheel tag vs cp313.**
+The cu130 index ships xformers as a `py3-none-manylinux_2_28_x86_64.whl` (a generic py3 wheel that declares no CPython ABI tag), not a cp313-tagged one.
py3-none wheels still link against torch's C++ ABI, which is cp-version-sensitive — if xformers' published wheel was built against a different torch ABI hash than 2.11+cu130/cp313 ships, you get import-time `_C` symbol errors. Mitigated by the `import xformers` smoke test in the Dockerfile build, which fails the image build if the ABI doesn't line up. + +**`cuda-toolkit` PyPI umbrella patch-version skew.** +The apt-installed system toolkit is 13.0.3 (in `/usr/local/cuda`), the wheel-installed toolkit is `cuda-toolkit==13.0.2` (in `site-packages/nvidia/`). Downstream extensions built against `/usr/local/cuda/include` and then dlopening from `site-packages/nvidia/` at runtime can hit subtle ABI skew in `nvjitlink`/`nvrtc` minor versions. Low impact in practice (NVIDIA maintains intra-13.0.x ABI), but worth knowing when debugging extension build failures. Future cu130 patch updates may close the gap. + +**`LD_LIBRARY_PATH` not set; wheel-side libs found only via torch's loader.** +`libcusparseLt.so.0` (in `site-packages/nvidia/cusparselt/lib/`) and `libnvshmem_host.so.3` (in `site-packages/nvidia/nvshmem/lib/`) are added to the dlopen path by `torch/__init__.py` at import time. Code that calls `ctypes.CDLL("libcusparseLt.so.0")` *before* importing torch — uncommon but possible in profiling/diagnostic code — will not find these libraries. If a downstream app needs this guarantee, set `LD_LIBRARY_PATH` to include both directories in its own image layer. + +**`UV_LINK_MODE=copy` is load-bearing.** +`uv venv` and `uv pip install` default to `hardlink`, which fails when the venv (`/opt/venv`) and the uv cache (`/root/.cache/uv`) are on different filesystems — common in Docker layered storage. `UV_LINK_MODE=copy` is set explicitly to avoid this. A future "drop it for speed" change would silently break the build; do not remove without re-verifying both build paths. + +**Image-rebuild cache invalidation when uv version bumps.** +The `UV_VERSION` ARG pin to `0.11.8` (vs `:latest`) means uv version is part of the layer cache key — bumping it invalidates the apt layer below. This is the right tradeoff: silent bumps via `:latest` defeat reproducibility and can change resolver behavior under your feet. Bump `UV_VERSION` deliberately via PR; expect a full rebuild. + +## Out of scope / future work + +- **`base/cuda-ml/` cu130 migration.** Currently cu128/py312 with a heavy stack. A future PR could either bump it in place or layer it `FROM ghcr.io/arsac/pytorch:cuda13.0-torch2.11` once both share cu130/py313, eliminating duplicate torch installs across the base family. +- **Adding flash-attn back** when v4 has functional SM 12.0 kernels (PRs #2349, #2406, #2499 merged) or when v2.x publishes confirmed cu13 wheels. +- **Multi-arch CUDA support.** Currently `TORCH_CUDA_ARCH_LIST="12.0"`. If the cluster adds Hopper (H100, SM 9.0) or Ada (L40, SM 8.9) nodes, expand the list and rebuild. +- **Renovate annotations** on the bake file and Dockerfile `ARG`s once added to `base/cuda-ml/` too — keep siblings consistent. +- **`--torch-backend=cu130` flag.** uv's modern torch-CUDA selection flag (`uv pip install torch --torch-backend=cu130`) is declined here because it only handles `torch` itself; we still need `torchvision`, `torchaudio`, `xformers`, and `triton` from the same cu130 index. Sticking with `--index-url` + `--index-strategy unsafe-best-match` covers all five names uniformly. Re-evaluate if uv extends `--torch-backend` to cover the broader ecosystem.
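+
+For the `LD_LIBRARY_PATH` risk above, a downstream layer that wants the pre-torch `dlopen` guarantee could compute the two wheel lib directories and bake them into its own `ENV` (a sketch, assuming this image's `/opt/venv` layout):
+
+```bash
+docker run --rm pytorch:latest python -c "
+import os, sysconfig
+sp = sysconfig.get_paths()['purelib']
+print(':'.join(os.path.join(sp, 'nvidia', d, 'lib') for d in ('cusparselt', 'nvshmem')))"
+# put the printed value into the downstream image, e.g.
+#   ENV LD_LIBRARY_PATH=<printed-paths>:${LD_LIBRARY_PATH}
+```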