From a7a277d54c5e5690614540a4363b14e338355a1c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Fri, 15 May 2026 17:01:50 -0700 Subject: [PATCH 1/4] Add Packer template for macOS GHA runner AMIs Mirrors the layout of aws/ami/windows. The template uses an Ansible provisioner that runs a bakeable subset of pytorch-gha-infra's bootstrap-runner.yml: Homebrew packages (gh, jq, tmux, libomp, pstree, miniconda), conda init, the runner user, the SSM agent, the CloudWatch agent binary + plist, /opt/runner_scripts/, and boto3/botocore. Per-instance steps (IAM role attach, GH runner registration, starting the CloudWatch daemon) stay in the runtime playbooks. build_macos_ami.py wraps `packer init` + `packer build`, resolves the host's AZ via DescribeHosts so launches always match the dedicated host, and supports building multiple macOS versions back-to-back on one host (sequential, waiting out the ~1-2h scrub window between builds) so the 24h Mac host billing minimum is amortized. A single arm64 base AMI is portable across every Mac2 instance family (mac2 / mac2-m2 / mac2-m2pro / mac2-m4*), so the template hardcodes arm64; an x86_64 fork can be added later if needed. 
--- aws/ami/macos/README.md | 155 +++++++++ aws/ami/macos/ansible/bake.yml | 109 ++++++ aws/ami/macos/ansible/requirements.yml | 4 + aws/ami/macos/build_macos_ami.py | 342 +++++++++++++++++++ aws/ami/macos/configs/cloudwatch_config.json | 78 +++++ aws/ami/macos/macos.pkr.hcl | 123 +++++++ aws/ami/macos/plugins.pkr.hcl | 12 + aws/ami/macos/scripts/create-runner-user.sh | 32 ++ aws/ami/macos/scripts/install-ssm-agent.sh | 20 ++ aws/ami/macos/variables.pkr.hcl | 41 +++ 10 files changed, 916 insertions(+) create mode 100644 aws/ami/macos/README.md create mode 100644 aws/ami/macos/ansible/bake.yml create mode 100644 aws/ami/macos/ansible/requirements.yml create mode 100755 aws/ami/macos/build_macos_ami.py create mode 100644 aws/ami/macos/configs/cloudwatch_config.json create mode 100644 aws/ami/macos/macos.pkr.hcl create mode 100644 aws/ami/macos/plugins.pkr.hcl create mode 100755 aws/ami/macos/scripts/create-runner-user.sh create mode 100755 aws/ami/macos/scripts/install-ssm-agent.sh create mode 100644 aws/ami/macos/variables.pkr.hcl diff --git a/aws/ami/macos/README.md b/aws/ami/macos/README.md new file mode 100644 index 0000000000..b9ad355d1f --- /dev/null +++ b/aws/ami/macos/README.md @@ -0,0 +1,155 @@ +# Build macOS AMIs + +This folder uses Packer to bake reusable macOS AMIs for PyTorch GHA CI +runners, mirroring the layout under `../windows`. + +The baked AMI contains everything host-shape-independent — Homebrew +packages (`gh`, `jq`, `tmux`, `libomp`, `pstree`, `miniconda` cask), +`conda init`, the `runner` user, the SSM agent, the CloudWatch agent +binary + plist, `/opt/runner_scripts/`, and `boto3`/`botocore` for the +runtime Ansible plays. Per-instance steps (IAM role attach, GH runner +registration, starting the CloudWatch daemon with the live config) +remain in the runtime playbooks under +`pytorch-gha-infra/macos-runners/playbooks`. 
+ +## Why per-arch AMIs are enough + +AWS publishes one `arm64_mac` base AMI per macOS version, and one +`x86_64_mac` base AMI per macOS version. There is no per-chip variant +(no separate M1/M2/M2-Pro/M4 AMI). A custom AMI built from one of those +base images is portable across every Apple Silicon Mac instance family +(`mac2.metal`, `mac2-m2.metal`, `mac2-m2pro.metal`, `mac2-m4*.metal`). +Build matrix is therefore `(arch, macos_version)` — 2-4 AMIs total in +practice, not 10+. + +## Why a Python driver instead of plain `packer build` + +EC2 Mac instances require a Dedicated Host. Dedicated Mac hosts have: + +- A **24-hour minimum billing window**. Releasing earlier still costs a + full day. +- A **~1-2 hour scrubbing window** after every instance terminates, + during which the host cannot accept a new launch. + +Letting Packer allocate and release a host per build would cost one +host-day per AMI. The driver script (`build_macos_ami.py`) allocates a +single host, runs N packer builds sequentially against it (waiting out +the scrub window between builds), and leaves it allocated by default so +you don't pay for a fresh day on the next invocation. + +## Setup + +1. Configure AWS credentials (`AWS_PROFILE=fbossci` for the PyTorch CI + account). +2. Install Packer + ([instructions](https://developer.hashicorp.com/packer/tutorials/docker-get-started/get-started-install-cli)). +3. Install Ansible locally (Packer's Ansible provisioner runs it from + the build host, not the target): + ```bash + pip install ansible boto3 + ansible-galaxy install -r ansible/requirements.yml + ``` +4. `cd` here and run `packer init .` (the driver also does this). + +## Usage + +### Host discovery + +If `--host-id` is not passed, the driver looks for an existing Dedicated +Host tagged `Name=packer-macos-arm64-builder` in `--region`. An idle +match (state `available`, no running instances) is preferred, but any +unreleased match is reused; otherwise a fresh host is allocated with that tag. 
Pass +`--no-reuse` to force allocation, or `--host-id h-...` to pin to a +specific host. + +This means the common case is one command, no manual host bookkeeping: + +```bash +AWS_PROFILE=fbossci python build_macos_ami.py --region us-east-2 --macos-version 14 +``` + +### Build all supported macOS versions on one host (cost-optimal) + +```bash +AWS_PROFILE=fbossci python build_macos_ami.py \ + --region us-east-2 \ + --macos-version 14 \ + --macos-version 15 \ + --macos-version 26 +``` + +Mac dedicated hosts have a 24h billing minimum, so amortizing multiple +builds across one host avoids paying for multiple host-days. + +### Smoke-test the provisioners without creating an AMI + +```bash +AWS_PROFILE=fbossci python build_macos_ami.py \ + --region us-east-2 --macos-version 14 --skip-create-ami +``` + +### Release the host when fully done + +```bash +aws ec2 release-hosts --host-ids h-0123456789abcdef0 --region us-east-2 +``` + +Or pass `--release-after` to the driver (note: still billed for 24h). + +## Multi-region publication + +The template defaults to publishing the AMI to both `us-east-1` and +`us-east-2` (the regions PyTorch CI currently runs Mac runners in). +Packer registers in the build region first, then issues `CopyImage` to +the other regions in the list — each copy creates a fresh AMI ID and a +fresh EBS snapshot in that region. + +To narrow or widen the set, pass `ami_regions` through: + +```bash +python build_macos_ami.py \ + --host-id h-... --region us-east-2 --macos-version 14 \ + --packer-extra-arg='-var=ami_regions=["us-east-1","us-east-2","us-west-2"]' +``` + +CopyImage is roughly free at the API level but each destination region +incurs snapshot storage (~$0.05/GB-month) and a one-time inter-region +data-transfer charge for the snapshot. 
+ +## Consuming the AMI from Terraform + +Mirror the Windows pattern in +`pytorch-gha-infra/runners/regions/us-east-1/main.tf`: + +```hcl +ami_owners_macos_arm64 = [""] +ami_filter_macos_arm64 = { + name = ["pytorch-ci-macos-14-arm64-*"] + architecture = ["arm64_mac"] +} +``` + +Because the same AMI name lands in every region in `ami_regions` (with +different IDs), Terraform's per-region lookup naturally resolves to the +local copy without extra configuration. The AMI name embeds +`(macos_version, arch, timestamp)`, so filters can be as broad or +narrow as needed. + +## Layout + +``` +macos/ +├── README.md # this file +├── plugins.pkr.hcl # required packer plugins (amazon, ansible) +├── variables.pkr.hcl # input variables +├── macos.pkr.hcl # source + build blocks +├── build_macos_ami.py # host-lifecycle driver +├── ansible/ +│ ├── bake.yml # tasks baked into the AMI +│ └── requirements.yml # ansible-galaxy deps +├── scripts/ # shipped to /opt/runner_scripts/ in AMI +│ ├── create-runner-user.sh +│ └── install-ssm-agent.sh +└── configs/ + └── cloudwatch_config.json # staged for the runtime playbook +``` diff --git a/aws/ami/macos/ansible/bake.yml b/aws/ami/macos/ansible/bake.yml new file mode 100644 index 0000000000..1f7dc9fa78 --- /dev/null +++ b/aws/ami/macos/ansible/bake.yml @@ -0,0 +1,109 @@ +--- +# Tasks here are baked into the AMI. Anything that depends on per-instance +# state (IAM role attach, GH runner registration, instance-id-derived names) +# must stay in the runtime playbooks under pytorch-gha-infra/macos-runners. 
+ +- name: Bake macOS GHA runner AMI + hosts: all + module_defaults: + shell: + executable: /bin/zsh + become: true + become_user: ec2-user + tasks: + - name: Ensure boto libraries are installed + ansible.builtin.pip: + name: + - boto3 + - botocore + executable: pip3 + + - name: Transfer runner scripts + become: true + become_user: root + ansible.builtin.copy: + src: ../scripts/ + dest: /opt/runner_scripts/ + directory_mode: true + mode: '0755' + + - name: Create post-job log directory + become: true + become_user: root + ansible.builtin.file: + path: /var/log/post_job + state: directory + mode: '0777' + + - name: Create runner user + environment: + RUNNER_USER: runner + register: create_runner_user + changed_when: create_runner_user.stdout != 'RUNNER_USER EXISTS' + ansible.builtin.shell: | + sudo /opt/runner_scripts/create-runner-user.sh + echo -n "RUNNER_USER EXISTS" + + - name: Install SSM Agent + become: true + become_user: root + register: install_ssm + changed_when: install_ssm.stdout != 'SSM INSTALLED' + ansible.builtin.shell: | + bash /opt/runner_scripts/install-ssm-agent.sh + + - name: Install Homebrew cask dependencies + community.general.homebrew_cask: + name: + - miniconda + state: present + + - name: Install Homebrew dependencies + community.general.homebrew: + name: + - gh + - jq + - tmux + - libomp + - pstree + state: present + + - name: Initialize conda in shell rc files + register: conda_init + changed_when: '"modified" in conda_init.stdout' + ansible.builtin.shell: | + /opt/homebrew/bin/conda init --all || /usr/local/bin/conda init --all + + # CloudWatch agent: download from the public bucket, install into /opt/aws, + # and place the LaunchDaemon plist. The first-boot playbook is still + # responsible for `fetch-config -s` (which starts the daemon with the + # current config file). 
+ - name: Determine CloudWatch agent download URL + ansible.builtin.set_fact: + cw_agent_url: >- + {{ 'https://amazoncloudwatch-agent.s3.amazonaws.com/darwin/arm64/latest/amazon-cloudwatch-agent.pkg' + if ansible_architecture == 'arm64' + else 'https://amazoncloudwatch-agent.s3.amazonaws.com/darwin/amd64/latest/amazon-cloudwatch-agent.pkg' }} + + - name: Download CloudWatch agent installer + become: true + become_user: root + ansible.builtin.get_url: + url: "{{ cw_agent_url }}" + dest: /tmp/amazon-cloudwatch-agent.pkg + mode: '0644' + + - name: Install CloudWatch agent + become: true + become_user: root + ansible.builtin.command: + cmd: installer -pkg /tmp/amazon-cloudwatch-agent.pkg -target / + creates: /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl + + - name: Stage CloudWatch config (consumed at first boot) + become: true + become_user: root + ansible.builtin.copy: + src: ../configs/cloudwatch_config.json + dest: /opt/runner_scripts/cloudwatch_config.json + mode: '0644' diff --git a/aws/ami/macos/ansible/requirements.yml b/aws/ami/macos/ansible/requirements.yml new file mode 100644 index 0000000000..3a179c8653 --- /dev/null +++ b/aws/ami/macos/ansible/requirements.yml @@ -0,0 +1,4 @@ +--- +collections: + - name: community.general + - name: amazon.aws diff --git a/aws/ami/macos/build_macos_ami.py b/aws/ami/macos/build_macos_ami.py new file mode 100755 index 0000000000..ec44b5f6b6 --- /dev/null +++ b/aws/ami/macos/build_macos_ami.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +""" +Allocate (or reuse) an EC2 Dedicated Host for Mac, run `packer build` against +it, then optionally release the host. + +Mac dedicated hosts have a 24-hour minimum allocation period, so the cost- +efficient workflow is to allocate one host, run multiple builds against it +(e.g. one per macOS version), then release. 
+ +Between builds the host enters a "scrubbing" state for ~1-2 hours after each +instance terminates; this script polls `describe-hosts` until the host is +`available` again before kicking off the next packer build. + +Host discovery order +-------------------- + +If --host-id is not given, the driver searches for an existing Dedicated Host +tagged Name=packer-macos-arm64-builder in the chosen region. The first idle +match (state=available, no instances) is reused. If none is found, a fresh +host is allocated with that same tag. Use --no-reuse to force allocation, or +--host-id to override discovery entirely. + +Examples +-------- + +Build a single Sonoma arm64 image, reusing or allocating a host as needed: + + AWS_PROFILE=fbossci python build_macos_ami.py \\ + --region us-east-2 \\ + --macos-version 14 + +Build Sonoma + Sequoia + Tahoe back-to-back on the same host (one host-day +amortized across three AMIs): + + AWS_PROFILE=fbossci python build_macos_ami.py \\ + --region us-east-2 \\ + --macos-version 14 --macos-version 15 --macos-version 26 + +Force a fresh host even if a tagged one already exists: + + AWS_PROFILE=fbossci python build_macos_ami.py \\ + --region us-east-2 --macos-version 14 --no-reuse + +Pin to a specific host: + + AWS_PROFILE=fbossci python build_macos_ami.py \\ + --host-id h-0123456789abcdef0 \\ + --region us-east-2 --macos-version 14 +""" + +from __future__ import annotations + +import argparse +import os +import shutil +import subprocess +import sys +import time +from pathlib import Path +from typing import Optional + +try: + import boto3 +except ImportError: + sys.exit("boto3 is required: pip install boto3") + +HERE = Path(__file__).resolve().parent + +# Apple Silicon only. The resulting AMI is portable across every Mac2 family, +# so building on mac2.metal (cheapest M1 host) covers M1/M2/M2-Pro/M4 fleets. +INSTANCE_TYPE = "mac2.metal" + +# Tag value used to discover (and tag) hosts managed by this driver. 
Any host +# carrying Name= is a candidate for reuse. +HOST_NAME_TAG = "packer-macos-arm64-builder" + +# Poll cadence for waiting on host state. +POLL_INTERVAL_SECONDS = 60 +# Generous upper bound for scrubbing window between builds (~2h observed). +HOST_AVAILABLE_TIMEOUT_SECONDS = 4 * 60 * 60 + + +def log(msg: str) -> None: + print(f"[build_macos_ami] {msg}", flush=True) + + +def allocate_host( + ec2, + instance_type: str, + availability_zone: str, + tags: dict[str, str], +) -> str: + log(f"Allocating Dedicated Host: instance-type={instance_type} az={availability_zone}") + resp = ec2.allocate_hosts( + AvailabilityZone=availability_zone, + InstanceType=instance_type, + AutoPlacement="off", + Quantity=1, + TagSpecifications=[ + { + "ResourceType": "dedicated-host", + "Tags": [{"Key": k, "Value": v} for k, v in tags.items()], + } + ], + ) + host_id = resp["HostIds"][0] + log(f"Allocated host {host_id}") + return host_id + + +def release_host(ec2, host_id: str) -> None: + log(f"Releasing host {host_id}") + resp = ec2.release_hosts(HostIds=[host_id]) + successful = resp.get("Successful", []) + unsuccessful = resp.get("Unsuccessful", []) + if unsuccessful: + for entry in unsuccessful: + log(f" unsuccessful: {entry}") + if successful: + log(f" released: {successful}") + + +def describe_host(ec2, host_id: str) -> dict: + resp = ec2.describe_hosts(HostIds=[host_id]) + hosts = resp.get("Hosts", []) + if not hosts: + raise RuntimeError(f"Host {host_id} not found") + return hosts[0] + + +def find_reusable_host(ec2, name_tag: str = HOST_NAME_TAG) -> Optional[str]: + """ + Look for an existing Dedicated Host tagged Name= in the current + region. Released hosts are excluded by AWS automatically. Among live hosts + we prefer one in 'available' state with no running instances; otherwise + any non-released host (since the caller will wait for it to become idle). + Returns None if nothing usable is found. 
+ """ + resp = ec2.describe_hosts( + Filter=[{"Name": "tag:Name", "Values": [name_tag]}], + ) + hosts = [h for h in resp.get("Hosts", []) if h["State"] not in {"released", "released-permanent-failure", "permanent-failure"}] + if not hosts: + return None + + def rank(h: dict) -> tuple[int, int]: + # Lower is better. Prefer idle (state=available + no instances) hosts. + state_score = 0 if h["State"] == "available" else 1 + instance_score = 0 if not h.get("Instances") else 1 + return (state_score, instance_score) + + hosts.sort(key=rank) + chosen = hosts[0] + log( + f"Reusing existing host {chosen['HostId']} " + f"(state={chosen['State']}, instances={len(chosen.get('Instances', []))}, " + f"az={chosen['AvailabilityZone']}, tag Name={name_tag})" + ) + if len(hosts) > 1: + others = ", ".join(h["HostId"] for h in hosts[1:]) + log(f" (also found, not chosen: {others})") + return chosen["HostId"] + + +def wait_for_host_available(ec2, host_id: str, timeout_s: int = HOST_AVAILABLE_TIMEOUT_SECONDS) -> None: + """ + Wait until host State == 'available' and no instance is running on it. + After an instance terminates the host can spend up to ~2h in 'pending' / + 'under-assessment' (scrubbing) before it accepts a new launch. 
+ """ + deadline = time.monotonic() + timeout_s + last_state = None + while True: + host = describe_host(ec2, host_id) + state = host["State"] + running = len(host.get("Instances", [])) + if state != last_state: + log(f"Host {host_id} state={state} instances={running}") + last_state = state + if state == "available" and running == 0: + return + if state in {"permanent-failure", "released", "released-permanent-failure"}: + raise RuntimeError(f"Host {host_id} entered terminal state {state}") + if time.monotonic() > deadline: + raise TimeoutError( + f"Host {host_id} did not reach 'available' within {timeout_s}s (last state: {state})" + ) + time.sleep(POLL_INTERVAL_SECONDS) + + +def run_packer_init(packer_dir: Path) -> None: + log("Running packer init") + subprocess.run(["packer", "init", "."], cwd=packer_dir, check=True) + + +def run_packer_build( + packer_dir: Path, + *, + host_id: str, + availability_zone: str, + macos_version: str, + instance_type: str, + region: str, + skip_create_ami: bool, + extra_args: list[str], +) -> None: + cmd = [ + "packer", + "build", + f"-var=host_id={host_id}", + f"-var=availability_zone={availability_zone}", + f"-var=macos_version={macos_version}", + f"-var=instance_type={instance_type}", + f"-var=region={region}", + f"-var=skip_create_ami={'true' if skip_create_ami else 'false'}", + *extra_args, + ".", + ] + log("Running: " + " ".join(cmd)) + subprocess.run(cmd, cwd=packer_dir, check=True) + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + p.add_argument( + "--macos-version", + action="append", + required=True, + help="macOS major version to build (e.g. 14, 15, 26). 
Repeat for multiple builds on the same host.", + ) + p.add_argument( + "--region", + default=os.environ.get("AWS_REGION", "us-east-1"), + help="AWS region (default: us-east-1).", + ) + p.add_argument( + "--availability-zone", + default="us-east-1a", + help="Availability zone for host allocation (default: us-east-1a).", + ) + p.add_argument( + "--host-id", + help="Reuse a specific Dedicated Host (h-...) instead of discovering or allocating one.", + ) + p.add_argument( + "--no-reuse", + action="store_true", + help="Do not search for an existing host tagged Name=" + HOST_NAME_TAG + "; allocate a fresh one.", + ) + p.add_argument( + "--release-after", + action="store_true", + help="Release the host after all builds complete. WARNING: Mac hosts are billed for a minimum of 24h regardless. Default: keep host.", + ) + p.add_argument( + "--skip-create-ami", + action="store_true", + help="Run the packer provisioners but do not register an AMI (smoke test).", + ) + p.add_argument( + "--packer-dir", + type=Path, + default=HERE, + help="Directory containing the packer template (default: this script's directory).", + ) + p.add_argument( + "--packer-extra-arg", + action="append", + default=[], + help="Extra argument to pass through to `packer build` (repeatable).", + ) + return p.parse_args() + + +def main() -> int: + args = parse_args() + + if shutil.which("packer") is None: + sys.exit("`packer` not found on PATH. Install from https://www.packer.io/downloads") + + ec2 = boto3.client("ec2", region_name=args.region) + + allocated_by_us = False + host_id = args.host_id + if host_id is None and not args.no_reuse: + host_id = find_reusable_host(ec2) + if host_id is None: + host_id = allocate_host( + ec2, + instance_type=INSTANCE_TYPE, + availability_zone=args.availability_zone, + tags={ + "Name": HOST_NAME_TAG, + "ManagedBy": "build_macos_ami.py", + }, + ) + allocated_by_us = True + + # The launch AZ must match the host's AZ. 
Source it from the host itself + # rather than the --availability-zone flag, which only applies at allocation. + host_az = describe_host(ec2, host_id)["AvailabilityZone"] + log(f"Host {host_id} lives in {host_az}; pinning Packer launch to that AZ") + + try: + run_packer_init(args.packer_dir) + + for idx, version in enumerate(args.macos_version): + if idx > 0: + log(f"Waiting for host {host_id} to finish scrubbing before next build...") + wait_for_host_available(ec2, host_id) + log(f"=== Building macOS {version} (arm64) on host {host_id} ===") + run_packer_build( + args.packer_dir, + host_id=host_id, + availability_zone=host_az, + macos_version=version, + instance_type=INSTANCE_TYPE, + region=args.region, + skip_create_ami=args.skip_create_ami, + extra_args=args.packer_extra_arg, + ) + finally: + if args.release_after: + if not allocated_by_us: + log(f"--release-after set but host {host_id} was passed in; releasing anyway") + try: + wait_for_host_available(ec2, host_id) + except Exception as exc: + log(f"Could not wait for host to be idle before release: {exc}") + release_host(ec2, host_id) + else: + log(f"Host {host_id} left allocated. 
Re-use with --host-id {host_id}, or release with:") + log(f" aws ec2 release-hosts --host-ids {host_id} --region {args.region}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/aws/ami/macos/configs/cloudwatch_config.json b/aws/ami/macos/configs/cloudwatch_config.json new file mode 100644 index 0000000000..a0ce37514f --- /dev/null +++ b/aws/ami/macos/configs/cloudwatch_config.json @@ -0,0 +1,78 @@ +{ + "agent": { + "metrics_collection_interval": 10 + }, + "metrics": { + "namespace": "GHARunners/gi-ci-pet", + "append_dimensions": { + "ImageID":"${aws:ImageId}", + "InstanceId":"${aws:InstanceId}", + "InstanceType":"${aws:InstanceType}" + }, + "aggregation_dimensions": [ + ["ImageID"], + ["InstanceType"], + ["ImageID", "InstanceType"] + ], + "metrics_collected": { + "cpu": { + "measurement": [ + "cpu_usage_idle", + "cpu_usage_iowait", + "cpu_usage_user", + "cpu_usage_system" + ], + "metrics_collection_interval": 10 + }, + "disk": { + "measurement": [ + "free", + "total", + "used", + "used_percent", + "inodes_free", + "inodes_total" + ], + "metrics_collection_interval": 10, + "resources": [ + "/" + ] + }, + "diskio": { + "measurement": [ + "io_time" + ], + "metrics_collection_interval": 10, + "resources": [ + "/" + ] + }, + "mem": { + "measurement": [ + "total", + "used", + "free", + "used_percent" + ], + "metrics_collection_interval": 10 + }, + "swap": { + "measurement": [ + "swap_used_percent" + ], + "metrics_collection_interval": 10 + } + } + }, + "logs": { + "logs_collected": { + "files": { + "collect_list": [{ + "log_group_name": "macos_runner", + "file_path": "/Users/ec2-user/runner/_diag/Runner_**.log", + "log_stream_name": "{instance_id}" + }] + } + } + } +} diff --git a/aws/ami/macos/macos.pkr.hcl b/aws/ami/macos/macos.pkr.hcl new file mode 100644 index 0000000000..a305b0501f --- /dev/null +++ b/aws/ami/macos/macos.pkr.hcl @@ -0,0 +1,123 @@ +locals { + timestamp = regex_replace(timestamp(), "[- TZ:]", "") + ami_name = 
"pytorch-ci-macos-${var.macos_version}-arm64-${local.timestamp}" +} + +# Apple Silicon only. A single arm64_mac AMI is portable across every Mac2 +# instance family (mac2 / mac2-m2 / mac2-m2pro / mac2-m4*), so there is no +# need for a per-chip variant. If x86_64 Intel Macs are ever needed, fork +# this template rather than re-parameterizing. +data "amazon-ami" "macos_root_ami" { + filters = { + name = "amzn-ec2-macos-${var.macos_version}*-arm64" + architecture = "arm64_mac" + virtualization-type = "hvm" + root-device-type = "ebs" + } + most_recent = true + owners = ["amazon"] + region = var.region +} + +source "amazon-ebs" "macos_builder" { + ami_name = local.ami_name + # Note: Mac AMI root snapshots are encrypted by default, and AWS rejects + # ModifyImageAttribute(launchPermission=all) on encrypted snapshots. We keep + # the AMI private-to-account, which is what the pytorch-gha-infra Terraform + # expects (ami_owners_macos_arm64 = []). + associate_public_ip_address = true + source_ami = data.amazon-ami.macos_root_ami.id + instance_type = var.instance_type + region = var.region + ami_regions = var.ami_regions + ssh_username = "ec2-user" + communicator = "ssh" + ssh_timeout = "1h" + ebs_optimized = true + skip_create_ami = var.skip_create_ami + + availability_zone = var.availability_zone + + # Force subnet selection into the default VPC's subnet for the host's AZ. + # Without this Packer's auto-pick in the default VPC can land in a different + # AZ than the dedicated host; without `default-for-az` it could land in a + # custom VPC that lacks an internet gateway, breaking egress for brew/SSM/CW. 
+ subnet_filter { + filters = { + "availability-zone" : var.availability_zone + "default-for-az" : "true" + } + most_free = true + random = false + } + + tenancy = "host" + placement { + host_id = var.host_id + } + + launch_block_device_mappings { + delete_on_termination = true + device_name = "/dev/sda1" + volume_size = var.root_volume_size_gb + volume_type = "gp3" + } + + # Required by org-level SCP: only IMDSv2 instances may be launched. + metadata_options { + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 2 + } + + run_tags = { + Name = "packer-${local.ami_name}" + BuildTool = "packer" + OS = "macos" + OSVersion = var.macos_version + Arch = "arm64" + } + + tags = { + Name = local.ami_name + BuildTool = "packer" + OS = "macos" + OSVersion = var.macos_version + Arch = "arm64" + SourceAMI = data.amazon-ami.macos_root_ami.id + } + + # macOS AMIs take a long time to register. + aws_polling { + max_attempts = 600 + } +} + +build { + sources = ["source.amazon-ebs.macos_builder"] + + # Ensure brew is on PATH for non-interactive ssh sessions before Ansible runs. + # The base AMI already ships Homebrew at /opt/homebrew (arm64) or /usr/local (x86_64). + provisioner "shell" { + inline = [ + "echo 'eval \"$(/opt/homebrew/bin/brew shellenv 2>/dev/null || /usr/local/bin/brew shellenv)\"' >> ~/.zprofile", + "source ~/.zprofile || true", + "brew --version", + ] + } + + provisioner "ansible" { + playbook_file = "${path.root}/ansible/bake.yml" + user = "ec2-user" + use_proxy = false + extra_arguments = [ + # Force legacy SCP protocol (-O); modern macOS scp defaults to SFTP which + # the base AMI's SSH server sometimes refuses on the first connection. + "--scp-extra-args=-O", + # Retry the TCP connect to survive transient routing/firewall hiccups + # immediately after Packer's own SSH session closes. 
+ "--ssh-extra-args=-o ConnectionAttempts=10 -o ConnectTimeout=30 -o ServerAliveInterval=30", + "-e", "ansible_python_interpreter=/usr/bin/python3", + ] + } +} diff --git a/aws/ami/macos/plugins.pkr.hcl b/aws/ami/macos/plugins.pkr.hcl new file mode 100644 index 0000000000..73b29800b4 --- /dev/null +++ b/aws/ami/macos/plugins.pkr.hcl @@ -0,0 +1,12 @@ +packer { + required_plugins { + amazon = { + source = "github.com/hashicorp/amazon" + version = "~> 1" + } + ansible = { + source = "github.com/hashicorp/ansible" + version = "~> 1" + } + } +} diff --git a/aws/ami/macos/scripts/create-runner-user.sh b/aws/ami/macos/scripts/create-runner-user.sh new file mode 100755 index 0000000000..cf142f82e0 --- /dev/null +++ b/aws/ami/macos/scripts/create-runner-user.sh @@ -0,0 +1,32 @@ +#!/usr/bin/bash +# Mirrors pytorch-gha-infra/macos-runners/scripts/create-runner-user.sh so the +# runner user exists in the baked AMI. Kept in sync manually. + +set -eou pipefail + +RUNNER_USER="${RUNNER_USER:-runner}" +RUNNER_USER_DIR="/Users/${RUNNER_USER}" + +DSCL_CREATE="dscl . -create ${RUNNER_USER_DIR}" + +mkdir -p "/Local/Users/${RUNNER_USER}" + +if ! id -u "${RUNNER_USER}" >/dev/null 2>/dev/null; then + echo "+ Creating the runner user (${RUNNER_USER})" + ( + set -x + ${DSCL_CREATE} + ${DSCL_CREATE} UserShell /bin/zsh + ${DSCL_CREATE} RealName "Runner Person" + ${DSCL_CREATE} UniqueID 1001 + ${DSCL_CREATE} PrimaryGroupID 1000 + ${DSCL_CREATE} NFSHomeDirectory "/Local/Users/${RUNNER_USER}" + dscl . -passwd "${RUNNER_USER_DIR}" password + dscl . -append /Groups/admin GroupMembership "${RUNNER_USER}" + mkdir -p "${RUNNER_USER_DIR}" + if [[ ! 
-d "${RUNNER_USER_DIR}" ]]; then + echo "error: Something went wrong creating the user ${RUNNER_USER}" + exit 1 + fi + ) +fi diff --git a/aws/ami/macos/scripts/install-ssm-agent.sh b/aws/ami/macos/scripts/install-ssm-agent.sh new file mode 100755 index 0000000000..775ab98d10 --- /dev/null +++ b/aws/ami/macos/scripts/install-ssm-agent.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +# Mirrors pytorch-gha-infra/macos-runners/scripts/install-ssm-agent.sh. + +set -eou pipefail + +if grep "amazon-ssm-agent is stopped" /var/log/amazon/ssm/amazon-ssm-agent.log >/dev/null 2>/dev/null; then + TOKEN=$(curl -s -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 30" http://169.254.169.254/latest/api/token) + EC2_REGION=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq .region -r) + ARCH=$(uname -m) + if [[ "${ARCH}" == "arm64" ]]; then + SSM_URL="https://s3.${EC2_REGION}.amazonaws.com/amazon-ssm-${EC2_REGION}/latest/darwin_arm64/amazon-ssm-agent.pkg" + else + SSM_URL="https://s3.${EC2_REGION}.amazonaws.com/amazon-ssm-${EC2_REGION}/latest/darwin_amd64/amazon-ssm-agent.pkg" + fi + curl -fsSL -o /tmp/amazon-ssm-agent.pkg "${SSM_URL}" + installer -pkg /tmp/amazon-ssm-agent.pkg -target / + launchctl load -w /Library/LaunchDaemons/com.amazon.aws.ssm.plist && sudo launchctl start com.amazon.aws.ssm +else + echo -n "SSM INSTALLED" +fi diff --git a/aws/ami/macos/variables.pkr.hcl b/aws/ami/macos/variables.pkr.hcl new file mode 100644 index 0000000000..4ead090907 --- /dev/null +++ b/aws/ami/macos/variables.pkr.hcl @@ -0,0 +1,41 @@ +variable "skip_create_ami" { + type = bool + default = true +} + +variable "macos_version" { + type = string + description = "macOS version prefix used to filter the base AMI (e.g. \"14\", \"14.8\", \"15\")." +} + +variable "instance_type" { + type = string + description = "EC2 Mac instance type. mac2.metal (M1) is the cheapest arm64 host; the resulting AMI runs on every Apple Silicon Mac family." 
+ default = "mac2.metal" +} + +variable "host_id" { + type = string + description = "ID of a pre-allocated EC2 Dedicated Host (h-xxxxxxxx) to launch the builder instance on. Required for Mac instances." +} + +variable "availability_zone" { + type = string + description = "AZ to launch the builder instance in. Must match the AZ of the dedicated host." +} + +variable "region" { + type = string + default = "us-east-1" +} + +variable "ami_regions" { + type = list(string) + description = "Regions the AMI will be registered in. The build region (var.region) is always implicit; including it here is a no-op. Packer issues CopyImage to each non-build region after registration." + default = ["us-east-1", "us-east-2"] +} + +variable "root_volume_size_gb" { + type = number + default = 200 +} From c11b787cdb1b961221cff0a7aec106c6a9e0ab2c Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 21 May 2026 13:24:04 -0700 Subject: [PATCH 2/4] [macOS AMI] Use shell tasks for Homebrew install community.general.homebrew[_cask] treats brew's stderr progress and "already installed" warnings on Homebrew 5.x as hard failures. Switch to shell loops that mirror the workaround already documented in pytorch-gha-infra/macos-runners/playbooks/install-runner.yml. --- aws/ami/macos/ansible/bake.yml | 43 ++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/aws/ami/macos/ansible/bake.yml b/aws/ami/macos/ansible/bake.yml index 1f7dc9fa78..a35f7cb752 100644 --- a/aws/ami/macos/ansible/bake.yml +++ b/aws/ami/macos/ansible/bake.yml @@ -52,21 +52,40 @@ ansible.builtin.shell: | bash /opt/runner_scripts/install-ssm-agent.sh + # Avoid community.general.homebrew_cask / .homebrew: Homebrew 5.x emits + # "already installed" / progress lines on stderr, which the modules treat + # as hard failures. Same workaround as macos-runners/playbooks/install-runner.yml. 
- name: Install Homebrew cask dependencies - community.general.homebrew_cask: - name: - - miniconda - state: present + register: brew_cask_install + changed_when: "'Installing' in brew_cask_install.stdout" + ansible.builtin.shell: | + set -eu + export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:${PATH:-}" + for cask in miniconda; do + if /opt/homebrew/bin/brew list --cask "$cask" >/dev/null 2>&1; then + echo "Present: $cask" + else + echo "Installing: $cask" + HOMEBREW_NO_AUTO_UPDATE=1 NONINTERACTIVE=1 \ + /opt/homebrew/bin/brew install --cask "$cask" + fi + done - name: Install Homebrew dependencies - community.general.homebrew: - name: - - gh - - jq - - tmux - - libomp - - pstree - state: present + register: brew_install + changed_when: "'Installing' in brew_install.stdout" + ansible.builtin.shell: | + set -eu + export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:${PATH:-}" + for pkg in gh jq tmux libomp pstree; do + if /opt/homebrew/bin/brew list --formula "$pkg" >/dev/null 2>&1; then + echo "Present: $pkg" + else + echo "Installing: $pkg" + HOMEBREW_NO_AUTO_UPDATE=1 NONINTERACTIVE=1 \ + /opt/homebrew/bin/brew install "$pkg" + fi + done - name: Initialize conda in shell rc files register: conda_init From 51b1806b276d31ecaaa94e6d8981de5464a73758 Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 21 May 2026 15:28:44 -0700 Subject: [PATCH 3/4] [macOS AMI] Fix lint: PYFMT + E501 --- aws/ami/macos/build_macos_ami.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/aws/ami/macos/build_macos_ami.py b/aws/ami/macos/build_macos_ami.py index ec44b5f6b6..af8b7f8feb 100755 --- a/aws/ami/macos/build_macos_ami.py +++ b/aws/ami/macos/build_macos_ami.py @@ -139,7 +139,8 @@ def find_reusable_host(ec2, name_tag: str = HOST_NAME_TAG) -> Optional[str]: resp = ec2.describe_hosts( Filter=[{"Name": "tag:Name", "Values": [name_tag]}], ) - hosts = [h for h in resp.get("Hosts", []) if h["State"] not in {"released", 
"released-permanent-failure", "permanent-failure"}] + terminal_states = {"released", "released-permanent-failure", "permanent-failure"} + hosts = [h for h in resp.get("Hosts", []) if h["State"] not in terminal_states] if not hosts: return None @@ -253,7 +254,10 @@ def parse_args() -> argparse.Namespace: p.add_argument( "--release-after", action="store_true", - help="Release the host after all builds complete. WARNING: Mac hosts are billed for a minimum of 24h regardless. Default: keep host.", + help=( + "Release the host after all builds complete. WARNING: Mac hosts are " + "billed for a minimum of 24h regardless. Default: keep host." + ), ) p.add_argument( "--skip-create-ami", @@ -279,7 +283,9 @@ def main() -> int: args = parse_args() if shutil.which("packer") is None: - sys.exit("`packer` not found on PATH. Install from https://www.packer.io/downloads") + sys.exit( + "`packer` not found on PATH. Install from https://www.packer.io/downloads" + ) ec2 = boto3.client("ec2", region_name=args.region) @@ -309,7 +315,9 @@ def main() -> int: for idx, version in enumerate(args.macos_version): if idx > 0: - log(f"Waiting for host {host_id} to finish scrubbing before next build...") + log( + f"Waiting for host {host_id} to finish scrubbing before next build..." + ) wait_for_host_available(ec2, host_id) log(f"=== Building macOS {version} (arm64) on host {host_id} ===") run_packer_build( @@ -325,14 +333,18 @@ def main() -> int: finally: if args.release_after: if not allocated_by_us: - log(f"--release-after set but host {host_id} was passed in; releasing anyway") + log( + f"--release-after set but host {host_id} was passed in; releasing anyway" + ) try: wait_for_host_available(ec2, host_id) except Exception as exc: log(f"Could not wait for host to be idle before release: {exc}") release_host(ec2, host_id) else: - log(f"Host {host_id} left allocated. Re-use with --host-id {host_id}, or release with:") + log( + f"Host {host_id} left allocated. 
Re-use with --host-id {host_id}, or release with:" + ) log(f" aws ec2 release-hosts --host-ids {host_id} --region {args.region}") return 0 From f1990ab9742926603c75122d4e32bdb51579096e Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Thu, 21 May 2026 17:05:15 -0700 Subject: [PATCH 4/4] Fix PYFMT lint on build_macos_ami.py --- aws/ami/macos/build_macos_ami.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/aws/ami/macos/build_macos_ami.py b/aws/ami/macos/build_macos_ami.py index af8b7f8feb..b391d9cf93 100755 --- a/aws/ami/macos/build_macos_ami.py +++ b/aws/ami/macos/build_macos_ami.py @@ -59,6 +59,7 @@ from pathlib import Path from typing import Optional + try: import boto3 except ImportError: @@ -90,7 +91,9 @@ def allocate_host( availability_zone: str, tags: dict[str, str], ) -> str: - log(f"Allocating Dedicated Host: instance-type={instance_type} az={availability_zone}") + log( + f"Allocating Dedicated Host: instance-type={instance_type} az={availability_zone}" + ) resp = ec2.allocate_hosts( AvailabilityZone=availability_zone, InstanceType=instance_type, @@ -163,7 +166,9 @@ def rank(h: dict) -> tuple[int, int]: return chosen["HostId"] -def wait_for_host_available(ec2, host_id: str, timeout_s: int = HOST_AVAILABLE_TIMEOUT_SECONDS) -> None: +def wait_for_host_available( + ec2, host_id: str, timeout_s: int = HOST_AVAILABLE_TIMEOUT_SECONDS +) -> None: """ Wait until host State == 'available' and no instance is running on it. After an instance terminates the host can spend up to ~2h in 'pending' / @@ -249,7 +254,9 @@ def parse_args() -> argparse.Namespace: p.add_argument( "--no-reuse", action="store_true", - help="Do not search for an existing host tagged Name=" + HOST_NAME_TAG + "; allocate a fresh one.", + help="Do not search for an existing host tagged Name=" + + HOST_NAME_TAG + + "; allocate a fresh one.", ) p.add_argument( "--release-after",