Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .github/workflows/build-gpu-image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
pull_image_repo:
description: "Image repository for cluster pulls/prewarm"
required: true
default: "images.coreweave.com/cluster-images/bradhiltonnw/art-gpu"
default: "docker.io/bradhiltonnw/art-gpu"
type: string
tag:
description: "Image tag to push"
Expand Down Expand Up @@ -152,7 +152,7 @@ jobs:
env:
GH_TOKEN: ${{ github.token }}
IMAGE_REPO: ${{ inputs.image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
PULL_IMAGE_REPO: ${{ inputs.pull_image_repo || 'images.coreweave.com/cluster-images/bradhiltonnw/art-gpu' }}
PULL_IMAGE_REPO: ${{ inputs.pull_image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
IMAGE_TAG: ${{ inputs.tag }}
NO_CACHE: ${{ inputs.no_cache }}
PREWARM_NODES: ${{ inputs.prewarm_nodes }}
Expand Down Expand Up @@ -181,10 +181,10 @@ jobs:

bash scripts/build-gpu-image.sh "${args[@]}"

- name: Smoke launch pushed image
- name: Smoke launch prewarmed image
if: ${{ github.event_name != 'workflow_dispatch' || inputs.smoke_launch }}
env:
SMOKE_IMAGE_REPO: ${{ inputs.image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
SMOKE_IMAGE_REPO: ${{ inputs.pull_image_repo || 'docker.io/bradhiltonnw/art-gpu' }}
IMAGE_TAG: ${{ inputs.tag }}
SMOKE_GPUS: ${{ inputs.smoke_gpus }}
run: |
Expand Down Expand Up @@ -220,14 +220,16 @@ jobs:

"${sky_cmd[@]}" check kubernetes

smoke_code='import os, pathlib, subprocess, sys; expected = os.environ["EXPECTED_ART_IMAGE_REVISION"]; actual = os.environ.get("ART_IMAGE_REVISION"); print("ART_IMAGE_REVISION", actual); sys.exit(f"ART_IMAGE_REVISION mismatch: expected {expected}, got {actual}") if actual != expected else None; print("ART_IMAGE_SMOKE_OK"); print("SKY_PYTHON_PATH", pathlib.Path.home().joinpath(".sky/python_path").read_text().strip()); print("RAY_PATH", pathlib.Path.home().joinpath(".sky/ray_path").read_text().strip()); print("UV", subprocess.check_output(["uv", "--version"], text=True).strip())'

/usr/bin/time -p "${sky_cmd[@]}" launch -y \
-c "${cluster}" \
--infra "${SKY_INFRA}" \
--gpus "${SMOKE_GPUS}" \
--image-id "docker:${SMOKE_IMAGE_REPO}:${IMAGE_TAG}" \
--config kubernetes.pod_config.spec.schedulerName=binpack-scheduler \
--config kubernetes.pod_config.spec.activeDeadlineSeconds=604800 \
'python -c "import pathlib, subprocess; print(\"ART_IMAGE_SMOKE_OK\"); print(\"SKY_PYTHON_PATH\", pathlib.Path.home().joinpath(\".sky/python_path\").read_text().strip()); print(\"RAY_PATH\", pathlib.Path.home().joinpath(\".sky/ray_path\").read_text().strip()); print(\"UV\", subprocess.check_output([\"uv\", \"--version\"], text=True).strip())"'
"EXPECTED_ART_IMAGE_REVISION=${GITHUB_SHA} python -c '${smoke_code}'"

kubectl --context cks-wb3 get pods -n default \
-l "skypilot-cluster=${cluster}" \
Expand Down
4 changes: 3 additions & 1 deletion docker/art-gpu.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ ENV CUDA_HOME=/usr/local/cuda-12.8 \
SHELL ["/bin/bash", "-c"]

LABEL org.opencontainers.image.source="https://github.com/openpipe/art" \
org.opencontainers.image.revision="${ART_SHA}" \
org.opencontainers.image.description="ART GPU image with warmed uv caches for SkyPilot launches." \
org.opencontainers.image.title="art-gpu"

Expand Down Expand Up @@ -163,3 +162,6 @@ RUN mkdir -p "${HOME}/.local/bin" "${HOME}/.sky/sky_app" "${HOME}/sky_workdir" \
&& VIRTUAL_ENV="${HOME}/skypilot-runtime" UV_LINK_MODE=copy UV_SYSTEM_PYTHON=false env -u PYTHONPATH -C "${HOME}" uv pip uninstall skypilot \
&& printf '%s\n' "${HOME}/skypilot-runtime/bin/python" > "${HOME}/.sky/python_path" \
&& VIRTUAL_ENV="${HOME}/skypilot-runtime" UV_LINK_MODE=copy UV_SYSTEM_PYTHON=false env -u PYTHONPATH -C "${HOME}" uv run --no-project --no-config which ray > "${HOME}/.sky/ray_path"

ENV ART_IMAGE_REVISION=${ART_SHA}
LABEL org.opencontainers.image.revision="${ART_SHA}"
151 changes: 147 additions & 4 deletions scripts/build-gpu-image.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ prewarm_name="${PREWARM_NAME:-art-gpu-image-prewarm}"
prewarm_image_pull_secret="${PREWARM_IMAGE_PULL_SECRET:-art-gpu-registry-auth}"
prewarm_node_selector="${PREWARM_NODE_SELECTOR:-node.coreweave.cloud/class=gpu}"
prewarm_timeout="${PREWARM_TIMEOUT:-30m}"
prewarm_node_timeout="${PREWARM_NODE_TIMEOUT:-10m}"
prewarm_node_retries="${PREWARM_NODE_RETRIES:-3}"
prewarm_node_parallelism="${PREWARM_NODE_PARALLELISM:-3}"

while [[ $# -gt 0 ]]; do
case "$1" in
Expand Down Expand Up @@ -390,11 +393,119 @@ PY
)"
prewarm_image="${pull_image_repo}:${image_tag}"
if [[ -n "${image_digest}" ]]; then
prewarm_image="${pull_image_repo}@${image_digest}"
if [[ "${pull_image_repo}" == "${image_repo}" ]]; then
prewarm_image="${pull_image_repo}@${image_digest}"
else
echo "Prewarm pull repo differs from pushed image repo; using mutable tag for pull-through freshness:"
echo " Pushed image digest: ${image_repo}@${image_digest}"
echo " Prewarm image: ${prewarm_image}"
fi
fi

#######################################
# Print a best-effort snapshot of the prewarm resources to help debug a
# failed prewarm. The output is wrapped in "::group::" / "::endgroup::"
# markers (GitHub Actions log-folding syntax).
#
# Every kubectl call is followed by `|| true`, so a query that fails (for
# example because the DaemonSet does not exist) never aborts the dump.
#
# Globals:
#   kubectl_cmd        (read) kubectl invocation, as an array
#   prewarm_namespace  (read) namespace holding the prewarm resources
#   prewarm_name       (read) DaemonSet name / pod label value
# Arguments:
#   None
# Outputs:
#   kubectl get/describe/events output on stdout.
#######################################
dump_prewarm_diagnostics() {
  # Scratch variable; declared local so it does not leak into the caller's
  # global scope.
  local first_prewarm_pod

  echo "::group::Prewarm diagnostics"
  "${kubectl_cmd[@]}" get daemonset -n "${prewarm_namespace}" "${prewarm_name}" -o wide || true
  # Pods labelled app=<name>, then every pod carrying the shared
  # prewarm-name label.
  "${kubectl_cmd[@]}" get pods -n "${prewarm_namespace}" -l "app=${prewarm_name}" -o wide || true
  "${kubectl_cmd[@]}" get pods -n "${prewarm_namespace}" -l "art.openpipe/prewarm-name=${prewarm_name}" -o wide || true
  "${kubectl_cmd[@]}" describe daemonset -n "${prewarm_namespace}" "${prewarm_name}" || true

  # Describe only the first matching pod to keep the log readable. The
  # jsonpath lookup errors when there are no pods, hence 2>/dev/null || true.
  first_prewarm_pod="$(
    "${kubectl_cmd[@]}" get pods -n "${prewarm_namespace}" -l "app=${prewarm_name}" \
      -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
  )"
  if [[ -n "${first_prewarm_pod}" ]]; then
    "${kubectl_cmd[@]}" describe pod -n "${prewarm_namespace}" "${first_prewarm_pod}" || true
  fi

  # Last 80 events, ordered by lastTimestamp.
  "${kubectl_cmd[@]}" get events -n "${prewarm_namespace}" --sort-by=.lastTimestamp | tail -n 80 || true
  echo "::endgroup::"
}

#######################################
# Turn an arbitrary string into a fragment that is safe to embed in a
# Kubernetes object name: lowercase, only [a-z0-9-], no leading or trailing
# dash, no repeated dashes, at most 35 characters.
#
# Arguments:
#   $1 - raw string (e.g. a node name)
# Outputs:
#   The sanitized fragment on stdout; nothing if no usable characters remain.
# Returns:
#   0 on success, non-zero if the text pipeline fails.
#######################################
sanitize_k8s_name_part() {
  local slug

  slug="$(
    printf '%s' "$1" \
      | tr '[:upper:]' '[:lower:]' \
      | tr -c 'a-z0-9-' '-' \
      | sed -E 's/^-+//; s/-+$//; s/-+/-/g' \
      | cut -c1-35
  )" || return

  # Truncating to 35 characters can cut right after a separator and leave a
  # trailing '-', which is not allowed at the end of a Kubernetes name.
  # Dashes were already collapsed above, so at most one can remain.
  slug="${slug%-}"

  if [[ -n "${slug}" ]]; then
    printf '%s\n' "${slug}"
  fi
}

#######################################
# Pre-pull ${prewarm_image} onto a single node by scheduling a throwaway pod
# pinned to that node, retrying on failure.
#
# The pod pulls the image in an init container (imagePullPolicy: Always,
# command `true`) and then runs a pause container. `kubectl wait` for the
# Ready condition is used as the success signal. The pod is deleted after
# every attempt, successful or not.
#
# Globals (all read):
#   kubectl_cmd, prewarm_namespace, prewarm_name, prewarm_image,
#   prewarm_image_pull_secret, prewarm_node_timeout, prewarm_node_retries,
#   timestamp, art_short_sha
# Arguments:
#   $1 - Kubernetes node name to prewarm
# Outputs:
#   Progress and pod diagnostics on stdout; fatal errors on stderr.
# Returns:
#   0 once the pod became Ready on the node; 1 if the retry count is invalid,
#   no pod name could be derived, or every attempt failed.
#######################################
prewarm_single_node() {
  local node="$1"
  local node_slug
  local pod
  local attempt

  # Validate up front: a non-numeric or zero retry count would otherwise run
  # zero attempts and surface as a misleading "Failed to prewarm" error.
  if ! [[ "${prewarm_node_retries}" =~ ^[1-9][0-9]*$ ]]; then
    echo "PREWARM_NODE_RETRIES must be a positive integer, got: ${prewarm_node_retries}" >&2
    return 1
  fi

  node_slug="$(sanitize_k8s_name_part "${node}")"
  if [[ -z "${node_slug}" ]]; then
    echo "Could not derive Kubernetes pod name for node ${node}" >&2
    return 1
  fi
  pod="${prewarm_name}-${node_slug}"

  for ((attempt = 1; attempt <= prewarm_node_retries; attempt++)); do
    echo "Prewarming ${prewarm_image} on GPU node ${node} (attempt ${attempt}/${prewarm_node_retries})"
    # Best effort: clear any pod left over from a previous attempt or run so
    # the apply below creates a fresh pod.
    "${kubectl_cmd[@]}" delete pod -n "${prewarm_namespace}" "${pod}" \
      --ignore-not-found --wait=true >/dev/null 2>&1 || true
    # nodeName pins the pod to the target node; the blanket "Exists"
    # toleration lets it land there regardless of taints.
    "${kubectl_cmd[@]}" apply -n "${prewarm_namespace}" -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: ${pod}
  labels:
    app: ${prewarm_name}-oneshot
    art.openpipe/prewarm-name: ${prewarm_name}
    art.openpipe/prewarm-token: "${timestamp}-${art_short_sha}"
spec:
  restartPolicy: Never
  nodeName: ${node}
  imagePullSecrets:
    - name: ${prewarm_image_pull_secret}
  tolerations:
    - operator: Exists
  initContainers:
    - name: prepull
      image: ${prewarm_image}
      imagePullPolicy: Always
      command: ["bash", "-lc", "true"]
      resources:
        requests:
          cpu: 10m
          memory: 16Mi
  containers:
    - name: pause
      image: registry.k8s.io/pause:3.10
      resources:
        requests:
          cpu: 10m
          memory: 16Mi
EOF
    if "${kubectl_cmd[@]}" wait -n "${prewarm_namespace}" \
      --for=condition=Ready "pod/${pod}" \
      --timeout="${prewarm_node_timeout}"; then
      "${kubectl_cmd[@]}" delete pod -n "${prewarm_namespace}" "${pod}" \
        --ignore-not-found --wait=true >/dev/null 2>&1 || true
      return 0
    fi

    echo "Prewarm failed on node ${node}; pod diagnostics:"
    "${kubectl_cmd[@]}" describe pod -n "${prewarm_namespace}" "${pod}" || true
    "${kubectl_cmd[@]}" delete pod -n "${prewarm_namespace}" "${pod}" \
      --ignore-not-found --wait=true >/dev/null 2>&1 || true

    # Linear backoff (10s, 20s, ...) between attempts only; sleeping after
    # the final attempt would just delay reporting the failure.
    if ((attempt < prewarm_node_retries)); then
      sleep "$((attempt * 10))"
    fi
  done

  echo "Failed to prewarm ${prewarm_image} on node ${node}" >&2
  return 1
}

if [[ "${prewarm_nodes}" == "true" ]]; then
gpu_node_count="$("${kubectl_cmd[@]}" get nodes -l "${prewarm_node_selector}" --no-headers 2>/dev/null | wc -l | tr -d ' ')"
if ! [[ "${prewarm_node_parallelism}" =~ ^[1-9][0-9]*$ ]]; then
echo "PREWARM_NODE_PARALLELISM must be a positive integer, got: ${prewarm_node_parallelism}" >&2
exit 1
fi
mapfile -t gpu_nodes < <(
"${kubectl_cmd[@]}" get nodes -l "${prewarm_node_selector}" \
-o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null
)
gpu_node_count="${#gpu_nodes[@]}"
if [[ "${gpu_node_count}" == "0" ]]; then
echo "Skipping GPU node prewarm: no nodes match ${prewarm_node_selector}"
else
Expand All @@ -405,6 +516,35 @@ if [[ "${prewarm_nodes}" == "true" ]]; then
--type=kubernetes.io/dockerconfigjson \
--dry-run=client -o yaml \
| "${kubectl_cmd[@]}" apply -n "${prewarm_namespace}" -f -

echo "Stopping existing ${prewarm_name} DaemonSet before batched node prewarm"
"${kubectl_cmd[@]}" delete daemonset -n "${prewarm_namespace}" "${prewarm_name}" \
--ignore-not-found --wait=true >/dev/null 2>&1 || true

prewarm_failures=0
prewarm_pids=()
for gpu_node in "${gpu_nodes[@]}"; do
prewarm_single_node "${gpu_node}" &
prewarm_pids+=("$!")

if (( ${#prewarm_pids[@]} >= prewarm_node_parallelism )); then
if ! wait "${prewarm_pids[0]}"; then
prewarm_failures=1
fi
prewarm_pids=("${prewarm_pids[@]:1}")
fi
done
for prewarm_pid in "${prewarm_pids[@]}"; do
if ! wait "${prewarm_pid}"; then
prewarm_failures=1
fi
done
if [[ "${prewarm_failures}" != "0" ]]; then
dump_prewarm_diagnostics
exit 1
fi

echo "Installing steady-state ${prewarm_name} DaemonSet"
"${kubectl_cmd[@]}" apply -n "${prewarm_namespace}" -f - <<EOF
apiVersion: apps/v1
kind: DaemonSet
Expand All @@ -419,7 +559,7 @@ spec:
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 100%
maxUnavailable: 1
template:
metadata:
labels:
Expand Down Expand Up @@ -450,7 +590,10 @@ spec:
cpu: 10m
memory: 16Mi
EOF
"${kubectl_cmd[@]}" rollout status -n "${prewarm_namespace}" "daemonset/${prewarm_name}" --timeout="${prewarm_timeout}"
if ! "${kubectl_cmd[@]}" rollout status -n "${prewarm_namespace}" "daemonset/${prewarm_name}" --timeout="${prewarm_timeout}"; then
dump_prewarm_diagnostics
exit 1
fi
fi
else
echo "Skipping GPU node prewarm"
Expand Down
Loading