OpenPipe · bradhilton · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
diff --git a/.github/workflows/build-gpu-image.yml b/.github/workflows/build-gpu-image.yml
@@ -13,7 +13,7 @@ on:
       pull_image_repo:
         description: "Image repository for cluster pulls/prewarm"
         required: true
-        default: "images.coreweave.com/cluster-images/bradhiltonnw/art-gpu"
+        default: "docker.io/bradhiltonnw/art-gpu"
         type: string
       tag:
         description: "Image tag to push"
@@ -152,7 +152,7 @@ jobs:
         env:
           GH_TOKEN: ${{ github.token }}
           IMAGE_REPO: ${{ inputs.image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
-          PULL_IMAGE_REPO: ${{ inputs.pull_image_repo || 'images.coreweave.com/cluster-images/bradhiltonnw/art-gpu' }}
+          PULL_IMAGE_REPO: ${{ inputs.pull_image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
           IMAGE_TAG: ${{ inputs.tag }}
           NO_CACHE: ${{ inputs.no_cache }}
           PREWARM_NODES: ${{ inputs.prewarm_nodes }}
@@ -181,10 +181,10 @@ jobs:
 
           bash scripts/build-gpu-image.sh "${args[@]}"
 
-      - name: Smoke launch pushed image
+      - name: Smoke launch prewarmed image
         if: ${{ github.event_name != 'workflow_dispatch' || inputs.smoke_launch }}
         env:
-          SMOKE_IMAGE_REPO: ${{ inputs.image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
+          SMOKE_IMAGE_REPO: ${{ inputs.pull_image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
           IMAGE_TAG: ${{ inputs.tag }}
           SMOKE_GPUS: ${{ inputs.smoke_gpus }}
         run: |
@@ -220,14 +220,16 @@ jobs:
 
           "${sky_cmd[@]}" check kubernetes
 
+          smoke_code='import os, pathlib, subprocess, sys; expected = os.environ["EXPECTED_ART_IMAGE_REVISION"]; actual = os.environ.get("ART_IMAGE_REVISION"); print("ART_IMAGE_REVISION", actual); sys.exit(f"ART_IMAGE_REVISION mismatch: expected {expected}, got {actual}") if actual != expected else None; print("ART_IMAGE_SMOKE_OK"); print("SKY_PYTHON_PATH", pathlib.Path.home().joinpath(".sky/python_path").read_text().strip()); print("RAY_PATH", pathlib.Path.home().joinpath(".sky/ray_path").read_text().strip()); print("UV", subprocess.check_output(["uv", "--version"], text=True).strip())'
+
           /usr/bin/time -p "${sky_cmd[@]}" launch -y \
               -c "${cluster}" \
               --infra "${SKY_INFRA}" \
               --gpus "${SMOKE_GPUS}" \
               --image-id "docker:${SMOKE_IMAGE_REPO}:${IMAGE_TAG}" \
               --config kubernetes.pod_config.spec.schedulerName=binpack-scheduler \
               --config kubernetes.pod_config.spec.activeDeadlineSeconds=604800 \
-              'python -c "import pathlib, subprocess; print(\"ART_IMAGE_SMOKE_OK\"); print(\"SKY_PYTHON_PATH\", pathlib.Path.home().joinpath(\".sky/python_path\").read_text().strip()); print(\"RAY_PATH\", pathlib.Path.home().joinpath(\".sky/ray_path\").read_text().strip()); print(\"UV\", subprocess.check_output([\"uv\", \"--version\"], text=True).strip())"'
+              "EXPECTED_ART_IMAGE_REVISION=${GITHUB_SHA} python -c '${smoke_code}'"
 
           kubectl --context cks-wb3 get pods -n default \
             -l "skypilot-cluster=${cluster}" \

diff --git a/docker/art-gpu.Dockerfile b/docker/art-gpu.Dockerfile
@@ -99,7 +99,6 @@ ENV CUDA_HOME=/usr/local/cuda-12.8 \
 SHELL ["/bin/bash", "-c"]
 
 LABEL org.opencontainers.image.source="https://github.com/openpipe/art" \
-      org.opencontainers.image.revision="${ART_SHA}" \
       org.opencontainers.image.description="ART GPU image with warmed uv caches for SkyPilot launches." \
       org.opencontainers.image.title="art-gpu"
 
@@ -163,3 +162,6 @@ RUN mkdir -p "${HOME}/.local/bin" "${HOME}/.sky/sky_app" "${HOME}/sky_workdir" \
  && VIRTUAL_ENV="${HOME}/skypilot-runtime" UV_LINK_MODE=copy UV_SYSTEM_PYTHON=false env -u PYTHONPATH -C "${HOME}" uv pip uninstall skypilot \
  && printf '%s\n' "${HOME}/skypilot-runtime/bin/python" > "${HOME}/.sky/python_path" \
  && VIRTUAL_ENV="${HOME}/skypilot-runtime" UV_LINK_MODE=copy UV_SYSTEM_PYTHON=false env -u PYTHONPATH -C "${HOME}" uv run --no-project --no-config which ray > "${HOME}/.sky/ray_path"
+
+ENV ART_IMAGE_REVISION=${ART_SHA}
+LABEL org.opencontainers.image.revision="${ART_SHA}"
diff --git a/scripts/build-gpu-image.sh b/scripts/build-gpu-image.sh
@@ -36,6 +36,9 @@ prewarm_name="${PREWARM_NAME:-art-gpu-image-prewarm}"
 prewarm_image_pull_secret="${PREWARM_IMAGE_PULL_SECRET:-art-gpu-registry-auth}"
 prewarm_node_selector="${PREWARM_NODE_SELECTOR:-node.coreweave.cloud/class=gpu}"
 prewarm_timeout="${PREWARM_TIMEOUT:-30m}"
+prewarm_node_timeout="${PREWARM_NODE_TIMEOUT:-10m}"
+prewarm_node_retries="${PREWARM_NODE_RETRIES:-3}"
+prewarm_node_parallelism="${PREWARM_NODE_PARALLELISM:-3}"
 
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -390,11 +393,119 @@ PY
 )"
 prewarm_image="${pull_image_repo}:${image_tag}"
 if [[ -n "${image_digest}" ]]; then
-  prewarm_image="${pull_image_repo}@${image_digest}"
+  if [[ "${pull_image_repo}" == "${image_repo}" ]]; then
+    prewarm_image="${pull_image_repo}@${image_digest}"
+  else
+    echo "Prewarm pull repo differs from pushed image repo; using mutable tag for pull-through freshness:"
+    echo "  Pushed image digest: ${image_repo}@${image_digest}"
+    echo "  Prewarm image: ${prewarm_image}"
+  fi
 fi
 
+# Print a best-effort snapshot of the prewarm state inside a "::group::" log
+# section: the DaemonSet, pods selected by both prewarm labels, a description
+# of the first DaemonSet pod (if any), and the 80 most recent namespace events.
+# Every kubectl call ends in `|| true` so a failing diagnostic command cannot
+# abort the caller or mask the original prewarm failure.
+# Reads: kubectl_cmd, prewarm_namespace, prewarm_name.
+dump_prewarm_diagnostics() {
+  # Scratch variable; declared local so it does not leak into the script's
+  # global scope.
+  local first_prewarm_pod
+
+  echo "::group::Prewarm diagnostics"
+  "${kubectl_cmd[@]}" get daemonset -n "${prewarm_namespace}" "${prewarm_name}" -o wide || true
+  "${kubectl_cmd[@]}" get pods -n "${prewarm_namespace}" -l "app=${prewarm_name}" -o wide || true
+  "${kubectl_cmd[@]}" get pods -n "${prewarm_namespace}" -l "art.openpipe/prewarm-name=${prewarm_name}" -o wide || true
+  "${kubectl_cmd[@]}" describe daemonset -n "${prewarm_namespace}" "${prewarm_name}" || true
+  # The jsonpath lookup fails when no pod matches; errors are silenced and the
+  # variable is left empty in that case.
+  first_prewarm_pod="$(
+    "${kubectl_cmd[@]}" get pods -n "${prewarm_namespace}" -l "app=${prewarm_name}" \
+      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
+  )"
+  if [[ -n "${first_prewarm_pod}" ]]; then
+    "${kubectl_cmd[@]}" describe pod -n "${prewarm_namespace}" "${first_prewarm_pod}" || true
+  fi
+  "${kubectl_cmd[@]}" get events -n "${prewarm_namespace}" --sort-by=.lastTimestamp | tail -n 80 || true
+  echo "::endgroup::"
+}
+
+# Turn an arbitrary string (here: a node name) into a fragment that is safe to
+# embed at the end of a Kubernetes object name.
+#   $1      - input string
+#   stdout  - lower-cased text containing only [a-z0-9-], with runs of '-'
+#             collapsed, no leading or trailing '-', at most 35 characters.
+#             May be empty if the input has no alphanumeric characters.
+# Trailing hyphens are stripped AFTER truncation: cutting at 35 characters can
+# itself expose a '-' as the last character, and a name ending in '-' is
+# rejected by the Kubernetes API.
+# NOTE(review): inputs sharing the same first 35 sanitized characters map to
+# the same fragment — confirm node names are unique within that prefix.
+sanitize_k8s_name_part() {
+  printf '%s' "$1" \
+    | tr '[:upper:]' '[:lower:]' \
+    | tr -c 'a-z0-9-' '-' \
+    | sed -E 's/-+/-/g; s/^-+//' \
+    | cut -c1-35 \
+    | sed -E 's/-+$//'
+}
+
+# Pull ${prewarm_image} onto one node by scheduling a throwaway pod there.
+#   $1       - node name; the pod is pinned to it via spec.nodeName
+#   returns  - 0 once the pod reports Ready within ${prewarm_node_timeout},
+#              1 if the pod name cannot be derived or every attempt fails
+# The image runs as an init container (imagePullPolicy: Always, command
+# `true`), followed by a pause container as the main container. Up to
+# ${prewarm_node_retries} attempts are made with a linear back-off of
+# attempt * 10 seconds between them. The pod is deleted before each attempt
+# and after every outcome, so reruns start clean.
+# Reads: kubectl_cmd, prewarm_namespace, prewarm_name, prewarm_image,
+#        prewarm_image_pull_secret, prewarm_node_timeout,
+#        prewarm_node_retries, timestamp, art_short_sha.
+prewarm_single_node() {
+  local node="$1"
+  local node_slug
+  local pod
+  local attempt
+
+  node_slug="$(sanitize_k8s_name_part "${node}")"
+  if [[ -z "${node_slug}" ]]; then
+    echo "Could not derive Kubernetes pod name for node ${node}" >&2
+    return 1
+  fi
+  pod="${prewarm_name}-${node_slug}"
+
+  for attempt in $(seq 1 "${prewarm_node_retries}"); do
+    echo "Prewarming ${prewarm_image} on GPU node ${node} (attempt ${attempt}/${prewarm_node_retries})"
+    # Remove any leftover pod with the same name (previous attempt or run).
+    "${kubectl_cmd[@]}" delete pod -n "${prewarm_namespace}" "${pod}" \
+      --ignore-not-found --wait=true >/dev/null 2>&1 || true
+    "${kubectl_cmd[@]}" apply -n "${prewarm_namespace}" -f - <<EOF
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${pod}
+  labels:
+    app: ${prewarm_name}-oneshot
+    art.openpipe/prewarm-name: ${prewarm_name}
+    art.openpipe/prewarm-token: "${timestamp}-${art_short_sha}"
+spec:
+  restartPolicy: Never
+  nodeName: ${node}
+  imagePullSecrets:
+    - name: ${prewarm_image_pull_secret}
+  tolerations:
+    - operator: Exists
+  initContainers:
+    - name: prepull
+      image: ${prewarm_image}
+      imagePullPolicy: Always
+      command: ["bash", "-lc", "true"]
+      resources:
+        requests:
+          cpu: 10m
+          memory: 16Mi
+  containers:
+    - name: pause
+      image: registry.k8s.io/pause:3.10
+      resources:
+        requests:
+          cpu: 10m
+          memory: 16Mi
+EOF
+    if "${kubectl_cmd[@]}" wait -n "${prewarm_namespace}" \
+      --for=condition=Ready "pod/${pod}" \
+      --timeout="${prewarm_node_timeout}"; then
+      "${kubectl_cmd[@]}" delete pod -n "${prewarm_namespace}" "${pod}" \
+        --ignore-not-found --wait=true >/dev/null 2>&1 || true
+      return 0
+    fi
+
+    echo "Prewarm failed on node ${node}; pod diagnostics:"
+    "${kubectl_cmd[@]}" describe pod -n "${prewarm_namespace}" "${pod}" || true
+    "${kubectl_cmd[@]}" delete pod -n "${prewarm_namespace}" "${pod}" \
+      --ignore-not-found --wait=true >/dev/null 2>&1 || true
+    # Back off only when another attempt follows; sleeping after the final
+    # attempt would just delay the failure report.
+    if (( attempt < prewarm_node_retries )); then
+      sleep "$((attempt * 10))"
+    fi
+  done
+
+  echo "Failed to prewarm ${prewarm_image} on node ${node}" >&2
+  return 1
+}
+
 if [[ "${prewarm_nodes}" == "true" ]]; then
-  gpu_node_count="$("${kubectl_cmd[@]}" get nodes -l "${prewarm_node_selector}" --no-headers 2>/dev/null | wc -l | tr -d ' ')"
+  if ! [[ "${prewarm_node_parallelism}" =~ ^[1-9][0-9]*$ ]]; then
+    echo "PREWARM_NODE_PARALLELISM must be a positive integer, got: ${prewarm_node_parallelism}" >&2
+    exit 1
+  fi
+  mapfile -t gpu_nodes < <(
+    "${kubectl_cmd[@]}" get nodes -l "${prewarm_node_selector}" \
+      -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null
+  )
+  gpu_node_count="${#gpu_nodes[@]}"
   if [[ "${gpu_node_count}" == "0" ]]; then
     echo "Skipping GPU node prewarm: no nodes match ${prewarm_node_selector}"
   else
@@ -405,6 +516,35 @@ if [[ "${prewarm_nodes}" == "true" ]]; then
       --type=kubernetes.io/dockerconfigjson \
       --dry-run=client -o yaml \
       | "${kubectl_cmd[@]}" apply -n "${prewarm_namespace}" -f -
+
+    # Remove the existing DaemonSet (best effort, output suppressed) before
+    # the per-node one-shot prewarm pods are started.
+    echo "Stopping existing ${prewarm_name} DaemonSet before batched node prewarm"
+    "${kubectl_cmd[@]}" delete daemonset -n "${prewarm_namespace}" "${prewarm_name}" \
+      --ignore-not-found --wait=true >/dev/null 2>&1 || true
+
+    # Run prewarm_single_node for every node as a background job, keeping at
+    # most ${prewarm_node_parallelism} jobs in flight. prewarm_pids is a FIFO
+    # of running job PIDs; prewarm_failures becomes 1 if any job fails.
+    prewarm_failures=0
+    prewarm_pids=()
+    for gpu_node in "${gpu_nodes[@]}"; do
+      prewarm_single_node "${gpu_node}" &
+      prewarm_pids+=("$!")
+
+      # Window full: block on the oldest job, then drop it from the queue so
+      # the next iteration can start another node.
+      if (( ${#prewarm_pids[@]} >= prewarm_node_parallelism )); then
+        if ! wait "${prewarm_pids[0]}"; then
+          prewarm_failures=1
+        fi
+        prewarm_pids=("${prewarm_pids[@]:1}")
+      fi
+    done
+    # Drain the jobs still running after the last node was started.
+    for prewarm_pid in "${prewarm_pids[@]}"; do
+      if ! wait "${prewarm_pid}"; then
+        prewarm_failures=1
+      fi
+    done
+    # Fail only after every job has been waited on, so all nodes get their
+    # attempts before the script exits.
+    if [[ "${prewarm_failures}" != "0" ]]; then
+      dump_prewarm_diagnostics
+      exit 1
+    fi
+
+    echo "Installing steady-state ${prewarm_name} DaemonSet"
     "${kubectl_cmd[@]}" apply -n "${prewarm_namespace}" -f - <<EOF
 apiVersion: apps/v1
 kind: DaemonSet
@@ -419,7 +559,7 @@ spec:
   updateStrategy:
     type: RollingUpdate
     rollingUpdate:
-      maxUnavailable: 100%
+      maxUnavailable: 1
   template:
     metadata:
       labels:
@@ -450,7 +590,10 @@ spec:
               cpu: 10m
               memory: 16Mi
 EOF
-    "${kubectl_cmd[@]}" rollout status -n "${prewarm_namespace}" "daemonset/${prewarm_name}" --timeout="${prewarm_timeout}"
+    if ! "${kubectl_cmd[@]}" rollout status -n "${prewarm_namespace}" "daemonset/${prewarm_name}" --timeout="${prewarm_timeout}"; then
+      dump_prewarm_diagnostics
+      exit 1
+    fi
   fi
 else
   echo "Skipping GPU node prewarm"