Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
09ce350
Add ART GPU image build script
bradhilton Jun 9, 2026
60ccf59
Add GPU image build workflow.
bradhilton Jun 10, 2026
8352323
Harden GPU image workflow defaults.
bradhilton Jun 10, 2026
b72b2e3
Run smoke test for push-triggered workflow.
bradhilton Jun 10, 2026
98f57d7
Initialize SkyPilot before workflow smoke launch.
bradhilton Jun 10, 2026
d82eedb
Install socat for workflow SkyPilot smoke test.
bradhilton Jun 10, 2026
ba05518
Run workflow SkyPilot smoke test on Python 3.10.
bradhilton Jun 10, 2026
f900b81
Use registry auth for GPU image prewarm pulls.
bradhilton Jun 10, 2026
04a0fd9
Run GPU image workflow on main pushes.
bradhilton Jun 10, 2026
41feb79
Pull GPU images through CoreWeave cache.
bradhilton Jun 10, 2026
78197cf
Bump workflow smoke test SkyPilot patch.
bradhilton Jun 10, 2026
d369a11
Log workflow smoke diagnostics on failure.
bradhilton Jun 10, 2026
9272258
Use SkyPilot 0.12 for workflow smoke launch.
bradhilton Jun 10, 2026
c72e003
Use persistent SkyPilot venv in workflow smoke test.
bradhilton Jun 10, 2026
fc3d943
Pin workflow smoke Kubernetes client.
bradhilton Jun 10, 2026
ad64f0b
Prewarm full GPU dependency graph.
bradhilton Jun 10, 2026
08b55e5
Include vLLM runtime locks in GPU build context.
bradhilton Jun 10, 2026
8464254
Seed prebaked ART workdir environment.
bradhilton Jun 11, 2026
2b9f34b
Keep prebaked ART venv outside synced workdir.
bradhilton Jun 11, 2026
ec0e5b5
Link workdir venv to prebaked ART environment.
bradhilton Jun 11, 2026
b7d553a
chore: Update dependency exclusions and add megatron-core to build de…
bradhilton Jun 11, 2026
8378312
fix: Enhance registry host validation and improve environment variabl…
bradhilton Jun 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
234 changes: 234 additions & 0 deletions .github/workflows/build-gpu-image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
name: Build GPU Image

# Runs on every push to main and on manual dispatch. Push-triggered runs carry
# no inputs, so the job's steps substitute their own fallback values.
on:
  push:
    branches:
      - main
  workflow_dispatch:
    inputs:
      image_repo:
        description: 'Image repository to push'
        required: true
        default: 'docker.io/bradhiltonnw/art-gpu'
        type: string
      pull_image_repo:
        description: 'Image repository for cluster pulls/prewarm'
        required: true
        default: 'images.coreweave.com/cluster-images/bradhiltonnw/art-gpu'
        type: string
      tag:
        description: 'Image tag to push'
        required: true
        default: 'latest'
        type: string
      no_cache:
        description: 'Disable registry-backed BuildKit cache'
        required: true
        default: false
        type: boolean
      prewarm_nodes:
        description: 'Pre-pull the pushed image on GPU nodes'
        required: true
        default: true
        type: boolean
      prewarm_timeout:
        description: 'Timeout for GPU node prewarm rollout'
        required: true
        default: '30m'
        type: string
      smoke_launch:
        description: 'Launch a minimal SkyPilot task with the pushed image'
        required: true
        default: true
        type: boolean
      smoke_gpus:
        description: 'GPU request for the smoke launch'
        required: true
        default: 'H200:1'
        type: string

permissions:
  contents: read
  packages: write

# All runs share one concurrency group; a new run never cancels the run that
# is already in progress.
concurrency:
  group: build-gpu-image-cks-wb3
  cancel-in-progress: false

jobs:
  build-gpu-image:
    runs-on: ubuntu-latest
    # 360 minutes = 6 hours for the whole job.
    timeout-minutes: 360

    env:
      # Handed to the build script and to `sky launch` through --infra.
      SKY_INFRA: 'k8s/cks-wb3'

    steps:
      - uses: actions/checkout@v4

- name: Install uv
  # With an explicit `shell: bash` the runner invokes bash with
  # `-eo pipefail`. Under the implicit default (`bash -e`) a failed download
  # is hidden by the pipe into `sh` and the step would still report success.
  shell: bash
  run: |
    curl -LsSf https://astral.sh/uv/install.sh | sh
    # Put these two bin directories on PATH for every later step.
    echo "${HOME}/.local/bin" >> "${GITHUB_PATH}"
    echo "${HOME}/.cargo/bin" >> "${GITHUB_PATH}"

- name: Install kubectl
  run: |
    # socat is required by the SkyPilot smoke test that runs later in the job.
    sudo apt-get update
    sudo apt-get install -y --no-install-recommends socat

    # Download into a scratch directory instead of the checked-out workspace
    # so the binary can never end up in a later build context.
    download_dir="$(mktemp -d "${RUNNER_TEMP:-/tmp}/kubectl.XXXXXX")"
    kubectl_version="$(curl -LsSf https://dl.k8s.io/release/stable.txt)"
    kubectl_url="https://dl.k8s.io/release/${kubectl_version}/bin/linux/amd64/kubectl"
    curl -LsSf -o "${download_dir}/kubectl" "${kubectl_url}"
    curl -LsSf -o "${download_dir}/kubectl.sha256" "${kubectl_url}.sha256"

    # Refuse to install (with sudo) a binary that does not match the
    # checksum published next to it. sha256sum is the last command of the
    # pipeline, so a mismatch fails the step.
    echo "$(cat "${download_dir}/kubectl.sha256")  ${download_dir}/kubectl" | sha256sum --check

    sudo install -m 0755 "${download_dir}/kubectl" /usr/local/bin/kubectl
    rm -rf "${download_dir}"
    kubectl version --client=true

- name: Configure cks-wb3 kubeconfig
  env:
    CKS_WB3_KUBECONFIG: ${{ secrets.CKS_WB3_KUBECONFIG }}
  run: |
    if [ -z "${CKS_WB3_KUBECONFIG}" ]; then
      echo "::error::Missing required CKS_WB3_KUBECONFIG secret."
      exit 1
    fi

    # Create the directory and the credentials file owner-only from the
    # start, rather than tightening permissions after the secret is on disk.
    umask 077
    mkdir -p "${HOME}/.kube"
    export KUBECONFIG_PATH="${HOME}/.kube/config"

    # The secret may hold either the kubeconfig text itself or a base64
    # encoding of it; write the plain kubeconfig in both cases.
    python3 - <<'PY'
    import base64
    import binascii
    import os
    from pathlib import Path

    raw = os.environ["CKS_WB3_KUBECONFIG"].encode()
    try:
        # Strict decoding rejects plain YAML/JSON text, because characters
        # such as ":" or "{" are outside the base64 alphabet.
        decoded = base64.b64decode(b"".join(raw.split()), validate=True)
    except binascii.Error:
        decoded = b""

    # Accept the decoded form whenever it mentions apiVersion anywhere, so a
    # kubeconfig that starts with another key, a comment or "---" is not
    # written out as raw base64.
    data = decoded if b"apiVersion" in decoded else raw

    Path(os.environ["KUBECONFIG_PATH"]).write_bytes(data)
    PY
    # Still needed when the file already existed with looser permissions.
    chmod 600 "${KUBECONFIG_PATH}"
    # Fail early if the context is missing or the cluster is unreachable.
    kubectl --context cks-wb3 get nodes >/dev/null

- name: Configure registry auth
  env:
    IMAGE_REPO: ${{ inputs.image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
    REGISTRY_AUTH_JSON_B64_SECRET: ${{ secrets.REGISTRY_AUTH_JSON_B64 }}
    DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
    DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
    DOCKERHUB_PASSWORD: ${{ secrets.DOCKERHUB_PASSWORD }}
  run: |
    # A ready-made auth blob wins over every other source of credentials.
    if [ -n "${REGISTRY_AUTH_JSON_B64_SECRET}" ]; then
      echo "REGISTRY_AUTH_JSON_B64=${REGISTRY_AUTH_JSON_B64_SECRET}" >> "${GITHUB_ENV}"
      exit 0
    fi

    registry_host="${IMAGE_REPO%%/*}"
    if [ -z "${registry_host}" ] || [ "${registry_host}" = "${IMAGE_REPO}" ]; then
      registry_host="docker.io"
    fi
    # Docker treats the first path component as a registry host only when it
    # contains "." or ":" or is "localhost". A bare "namespace/repo" lives on
    # Docker Hub and must not skip the login below.
    case "${registry_host}" in
      localhost | *.* | *:*) ;;
      *) registry_host="docker.io" ;;
    esac

    if [ "${registry_host}" != "docker.io" ]; then
      echo "Using script-managed auth for ${registry_host}."
      exit 0
    fi

    # Prefer the access token; fall back to the account password.
    dockerhub_password="${DOCKERHUB_TOKEN:-${DOCKERHUB_PASSWORD:-}}"
    if [ -z "${DOCKERHUB_USERNAME}" ] || [ -z "${dockerhub_password}" ]; then
      echo "::error::Docker Hub pushes require REGISTRY_AUTH_JSON_B64 or DOCKERHUB_USERNAME plus DOCKERHUB_TOKEN."
      exit 1
    fi

    printf '%s' "${dockerhub_password}" | docker login docker.io \
      --username "${DOCKERHUB_USERNAME}" \
      --password-stdin

    # The encoded Docker config is a new value derived from the secrets, so
    # the runner does not redact it automatically. Register it as a mask
    # before exporting it to the following steps.
    registry_auth_b64="$(base64 -w 0 "${HOME}/.docker/config.json")"
    echo "::add-mask::${registry_auth_b64}"
    echo "REGISTRY_AUTH_JSON_B64=${registry_auth_b64}" >> "${GITHUB_ENV}"

- name: Build, push, and prewarm GPU image
  env:
    GH_TOKEN: ${{ github.token }}
    IMAGE_REPO: ${{ inputs.image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
    PULL_IMAGE_REPO: ${{ inputs.pull_image_repo || 'images.coreweave.com/cluster-images/bradhiltonnw/art-gpu' }}
    IMAGE_TAG: ${{ inputs.tag }}
    NO_CACHE: ${{ inputs.no_cache }}
    PREWARM_NODES: ${{ inputs.prewarm_nodes }}
    PREWARM_TIMEOUT: ${{ inputs.prewarm_timeout }}
  run: |
    # Inputs are empty on push-triggered runs; fill in the fallback values.
    : "${IMAGE_TAG:=latest}"
    : "${NO_CACHE:=false}"
    : "${PREWARM_NODES:=true}"
    : "${PREWARM_TIMEOUT:=30m}"

    # Options that are always passed to the build script.
    build_args=(
      --infra "${SKY_INFRA}"
      --image-repo "${IMAGE_REPO}"
      --pull-image-repo "${PULL_IMAGE_REPO}"
      --tag "${IMAGE_TAG}"
      --prewarm-timeout "${PREWARM_TIMEOUT}"
    )

    # Optional switches: caching is on and prewarming is on unless the
    # corresponding value says otherwise.
    if [[ "${NO_CACHE}" == "true" ]]; then build_args+=(--no-cache); fi
    if [[ "${PREWARM_NODES}" != "true" ]]; then build_args+=(--no-prewarm-nodes); fi

    bash scripts/build-gpu-image.sh "${build_args[@]}"

- name: Smoke launch pushed image
  # Always runs for push events; manual runs can opt out via smoke_launch.
  if: ${{ github.event_name != 'workflow_dispatch' || inputs.smoke_launch }}
  env:
    PULL_IMAGE_REPO: ${{ inputs.pull_image_repo || 'images.coreweave.com/cluster-images/bradhiltonnw/art-gpu' }}
    IMAGE_TAG: ${{ inputs.tag }}
    SMOKE_GPUS: ${{ inputs.smoke_gpus }}
  run: |
    # Inputs are empty on push-triggered runs; fill in the fallback values.
    : "${IMAGE_TAG:=latest}"
    : "${SMOKE_GPUS:=H200:1}"

    # One cluster name per run attempt, so a re-run gets a fresh name.
    smoke_cluster="art-gpu-smoke-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"

    # Dedicated virtualenv with pinned SkyPilot and Kubernetes client.
    venv_dir="${RUNNER_TEMP}/skypilot-smoke"
    uv venv --python 3.10 "${venv_dir}"
    uv pip install --python "${venv_dir}/bin/python" 'skypilot[kubernetes,remote]==0.12.0' 'kubernetes==34.1.0'
    sky="${venv_dir}/bin/sky"

    # Best-effort dump of runner memory, related processes and the SkyPilot
    # API server log; no command in here may fail the step.
    print_diagnostics() {
      local server_log="${HOME}/.sky/api_server/server.log"
      echo "::group::Smoke diagnostics"
      free -h || true
      ps -eo pid,ppid,stat,rss,comm,args | grep -E 'SkyPilot|sky|python' || true
      "${sky}" api status || true
      if [[ -f "${server_log}" ]]; then
        tail -n 240 "${server_log}" || true
      fi
      echo "::endgroup::"
    }

    # EXIT handler: print diagnostics on failure, always tear the cluster
    # down, and keep the script's original exit status.
    on_exit() {
      local exit_code=$?
      if (( exit_code != 0 )); then
        print_diagnostics
      fi
      "${sky}" down -y "${smoke_cluster}" || true
      exit "${exit_code}"
    }
    trap on_exit EXIT

    "${sky}" check kubernetes

    # Command run inside the image: prints a marker line, the paths SkyPilot
    # recorded under ~/.sky, and the uv version.
    smoke_command='python -c "import pathlib, subprocess; print(\"ART_IMAGE_SMOKE_OK\"); print(\"SKY_PYTHON_PATH\", pathlib.Path.home().joinpath(\".sky/python_path\").read_text().strip()); print(\"RAY_PATH\", pathlib.Path.home().joinpath(\".sky/ray_path\").read_text().strip()); print(\"UV\", subprocess.check_output([\"uv\", \"--version\"], text=True).strip())"'

    # `time -p` reports how long the launch took.
    /usr/bin/time -p "${sky}" launch -y \
      -c "${smoke_cluster}" \
      --infra "${SKY_INFRA}" \
      --gpus "${SMOKE_GPUS}" \
      --image-id "docker:${PULL_IMAGE_REPO}:${IMAGE_TAG}" \
      --config kubernetes.pod_config.spec.schedulerName=binpack-scheduler \
      --config kubernetes.pod_config.spec.activeDeadlineSeconds=604800 \
      "${smoke_command}"

    # Print each pod of the smoke cluster with the image it is running.
    kubectl --context cks-wb3 get pods -n default \
      -l "skypilot-cluster=${smoke_cluster}" \
      -o jsonpath='{range .items[*]}{.metadata.name} {.spec.containers[0].image}{"\n"}{end}'
6 changes: 4 additions & 2 deletions dev/yes-no-maybe-fork-pipeline.sky.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ workdir: .

resources:
accelerators: ["H200:2", "H100-SXM:2", "H100:2", "A100-80GB:2"]
image_id: docker:pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
image_id: docker:images.coreweave.com/cluster-images/bradhiltonnw/art-gpu:latest
ports:
- 7999 # main ART server
- 8000 # vLLM server

envs:
GIT_RESET_CLEAN: "false"
INSTALL_EXTRAS: "false"
PYTHONUTF8: "1"
GIT_USER_NAME: "Your Name"
GIT_USER_EMAIL: "your.email@example.com"
Expand All @@ -27,7 +28,8 @@ config:
kubernetes:
pod_config:
spec:
schedulerName: gpu-binpack
schedulerName: binpack-scheduler
activeDeadlineSeconds: 604800
containers:
- name: ray-node
env:
Expand Down
6 changes: 4 additions & 2 deletions dev/yes-no-maybe-fork.sky.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,14 @@ workdir: .

resources:
accelerators: ["H200:1", "H100-SXM:1", "H100:1", "A100-80GB:1"]
image_id: docker:pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
image_id: docker:images.coreweave.com/cluster-images/bradhiltonnw/art-gpu:latest
ports:
- 7999 # main ART server
- 8000 # vLLM server

envs:
GIT_RESET_CLEAN: "false" # preserve workdir files synced by SkyPilot
INSTALL_EXTRAS: "false"
PYTHONUTF8: "1"
GIT_USER_NAME: "Your Name"
GIT_USER_EMAIL: "your.email@example.com"
Expand All @@ -27,7 +28,8 @@ config:
kubernetes:
pod_config:
spec:
schedulerName: gpu-binpack
schedulerName: binpack-scheduler
activeDeadlineSeconds: 604800
containers:
- name: ray-node
env:
Expand Down
Loading
Loading