79 changes: 79 additions & 0 deletions .semaphore/end-to-end/scripts/README.md
@@ -0,0 +1,79 @@
# End-to-end CI scripts

Orchestrator scripts for the Semaphore e2e jobs. Two top-level entry points
drive two different job shapes:

| Entry point | Job shape |
|---|---|
| `body_standard.sh` | Standard e2e: provision a cluster, install Calico, optionally migrate/upgrade, run tests |
| `body_flannel-migration.sh` | Flannel-to-Calico migration test with a pre- and post-migration test run |

Both dispatch to single-purpose **phase scripts** under `phases/`. Each phase
is self-contained, documents its required env vars at the top, and can be
sourced individually when reproducing part of a CI run locally.

## Phases

| Phase | Purpose |
|---|---|
| `phases/provision.sh` | `bz provision` + Semaphore cache store |
| `phases/install.sh` | `bz install` (install Calico on the provisioned cluster) |
| `phases/configure.sh` | Post-install env setup: PATH, external-node creds, IPAM pool, failsafe ports |
| `phases/migrate.sh` | Optional operator migration, AKS migration, `bz upgrade` |
| `phases/run_tests.sh` | Acquire and run the e2e binary (local build, hashrelease download, or `bz tests` fallback) |
| `phases/hcp.sh` | Hosted control plane flow (separate provision + test tooling) |

## Reproducing a CI run locally

Each phase script lists its required env vars in its header comment. In the
common case, reproducing a CI job looks like:

```bash
cd "${BZ_HOME}"
source phases/provision.sh
source phases/install.sh
source phases/configure.sh
source phases/run_tests.sh
```

Phases are **sourced**, not executed, so env vars exported by earlier phases
(e.g. `PATH`, `EXT_IP`) flow into later phases. Running a phase standalone
works the same way -- source it from a shell you've set up with the
required env vars.
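
The distinction matters because a sourced file runs in the current shell, so its
exports persist, while an executed script runs in a child shell whose exports
vanish on exit. A toy demonstration (not a real CI phase):

```bash
#!/usr/bin/env bash
# Demonstrate source-vs-execute env propagation.

phase_file="$(mktemp)"
cat > "${phase_file}" <<'EOF'
export DEMO_VAR="set-by-phase"
EOF

bash "${phase_file}"                       # executed: runs in a child shell
echo "after execute: ${DEMO_VAR:-unset}"   # -> unset

source "${phase_file}"                     # sourced: runs in this shell
echo "after source:  ${DEMO_VAR:-unset}"   # -> set-by-phase

rm -f "${phase_file}"
```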

## Adding a new phase

1. Create `phases/<name>.sh` with a header comment listing required env vars.
2. Omit `set -eo pipefail` from the phase -- the orchestrator sets it once
and phases inherit via sourcing.
3. Add the phase to the appropriate body script's dispatch logic.
4. Add a row to the phase table above.
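
A minimal skeleton for a new phase might look like the following. The phase
name and env var are illustrative; the fail-fast guard mirrors the pattern the
existing phases use:

```bash
#!/usr/bin/env bash
# example.sh - hypothetical phase: log the checkout location.
#
# Required env:
#   BZ_HOME    path to the bz checkout
#
# Sourced from body_*.sh. No `set -eo pipefail` here -- it is inherited
# from the orchestrator.

# Demo default so this sketch runs standalone; a real phase relies on the
# guard below instead.
BZ_HOME="${BZ_HOME:-/tmp/bz-demo}"

if [[ -z "${BZ_HOME}" ]]; then
  echo "[ERROR] BZ_HOME is required but not set"
  exit 1
fi

echo "[INFO] starting example phase in ${BZ_HOME}..."
```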

## The test runner

`phases/run_tests.sh` selects the test execution strategy automatically:

| Condition | Strategy |
|---|---|
| `RUN_LOCAL_TESTS` is set | Build the e2e binary from local source (per-PR CI) |
| `TEST_TYPE == k8s-e2e` | Download the pre-built binary from the hashrelease (scheduled CI) |
| Otherwise | Fall back to `bz tests` (benchmarks, certification, etc.) |
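
The selection in the table amounts to a first-match-wins chain. A sketch
(illustrative function name; the real script inlines this logic):

```bash
#!/usr/bin/env bash
# Sketch of the strategy selection in phases/run_tests.sh.

select_test_strategy() {
  if [[ -n "${RUN_LOCAL_TESTS}" ]]; then
    echo "local-build"   # build the e2e binary from local source
  elif [[ "${TEST_TYPE}" == "k8s-e2e" ]]; then
    echo "hashrelease"   # download the pre-built binary
  else
    echo "bz-tests"      # legacy fallback
  fi
}
```

Note that `RUN_LOCAL_TESTS` wins even when `TEST_TYPE` is `k8s-e2e`, which is
what lets per-PR CI exercise the locally built binary.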

The first two paths run the binary via `make e2e-run` inside
`calico/go-build`. Developers can use the same target directly:

```bash
KUBECONFIG=/path/to/kubeconfig \
E2E_TEST_CONFIG=e2e/config/gcp-bpf.yaml \
make e2e-run
```

See `e2e/config/*.yaml` for available test-selection configs and
`e2e/pkg/testconfig/` for the config format.

## Legacy notes

- `body_flannel-migration.sh` still uses `./bz.sh tests:run` for its pre-
  and post-migration test runs -- that's a different legacy runner from
  `bz tests`, covering tests the in-repo binary doesn't yet support. Migrate
  to `make e2e-run` once parity lands.
39 changes: 28 additions & 11 deletions .semaphore/end-to-end/scripts/body_flannel-migration.sh
@@ -1,5 +1,17 @@
#!/usr/bin/env bash
# body_flannel-migration.sh - flannel-to-Calico migration test flow.
#
# Provisions a cluster, installs flannel + a CNI plugin helper, runs a basic
# connectivity smoke test, applies Calico + the flannel-migration job, waits
# for the migration to complete, then runs the full e2e suite on Calico.
#
# Uses the legacy `./bz.sh tests:run` test runner (not the in-repo binary).
# When the in-repo binary reaches parity, this script can migrate to
# `make e2e-run` like body_standard.sh's run_tests_local.sh phase.
set -exo pipefail

PHASES="$(dirname "$0")/phases"

echo "[INFO] starting job..."

export CNI_VERSION=${CNI_VERSION:-"v1.1.1"}
@@ -9,7 +21,7 @@ export CALICO_MANIFEST=${CALICO_MANIFEST:-"manifests/flannel-migration/calico.ya
export MIGRATION_MANIFEST=${MIGRATION_MANIFEST:-"manifests/flannel-migration/migration-job.yaml"}

if [ "${USE_HASH_RELEASE}" == "true" ]; then
echo "[INFO] Using hash release for flannel migration"
echo "[INFO] Using hash release for flannel migration"
LATEST_HASHREL="https://latest-os.docs.eng.tigera.net/${RELEASE_STREAM}.txt"
echo "Checking ${LATEST_HASHREL} for latest hash release url..."
DOCS_URL=$(curl --retry 9 --retry-all-errors -sS ${LATEST_HASHREL})
@@ -28,16 +40,15 @@ export BZ_LOCAL=${BZ_HOME}/.local
export KUBECONFIG=$BZ_LOCAL/kubeconfig
export PATH=$PATH:$BZ_LOCAL/bin

# Seems like modern OSes no longer include br_netfilter by default which breaks flannel. Install it in case we need it.
# Modern OSes no longer include br_netfilter by default, which breaks flannel.
echo "[INFO] installing br_netfilter..."
sudo modprobe br_netfilter

mkdir -p "$BZ_LOGS_DIR"
cd "${BZ_HOME}"
bz provision |& tee >(gzip --stdout > "${BZ_LOGS_DIR}/provision.log.gz")
cache store "$SEMAPHORE_JOB_ID" ../bz
source "${PHASES}/provision.sh"

# Install bridge CNI plugin (needed by kube-flannel manifest)
# Install bridge CNI plugin (needed by kube-flannel manifest).
kubectl apply -f - <<EOF
apiVersion: apps/v1
kind: DaemonSet
@@ -96,19 +107,23 @@ spec:
hostPath:
path: /opt/cni/bin
EOF
# Update flannel.yaml to use the podCIDR that CRC sets up

# Update flannel.yaml to use the podCIDR that CRC sets up.
wget -O flannel.yaml "$DOWNLEVEL_MANIFEST"
sed -i "s?10.244.0.0/16?192.168.0.0/16?g" ./flannel.yaml
kubectl apply -f - < ./flannel.yaml
sleep 30 # wait for flannel to come up
kubectl get po -A -owide
# Run a basic services test to check that flannel networking is working
K8S_E2E_FLAGS='--ginkgo.focus=should.serve.a.basic.endpoint.from.pods' ./bz.sh tests:run |& tee >(gzip --stdout > "${BZ_LOGS_DIR}/e2e-tests-pre.log")

# Run a basic services test to check that flannel networking is working.
K8S_E2E_FLAGS='--ginkgo.focus=should.serve.a.basic.endpoint.from.pods' \
./bz.sh tests:run |& tee >(gzip --stdout > "${BZ_LOGS_DIR}/e2e-tests-pre.log")

kubectl delete -n kube-system ds cni-installer || true # remove the CNI installer daemonset
kubectl apply -f "$DOCS_URL/$CALICO_MANIFEST"
wget -O calico-migration.yaml "$DOCS_URL/$MIGRATION_MANIFEST"
kubectl apply -f - < ./calico-migration.yaml
sleep 5 # to make sure the job has started before we check its status
sleep 5 # make sure the job has started before we check its status
kubectl -n kube-system get jobs flannel-migration
kubectl -n kube-system describe jobs flannel-migration
kubectl get po -A -owide
@@ -117,9 +132,11 @@ kubectl -n kube-system get jobs flannel-migration
kubectl -n kube-system describe jobs flannel-migration
kubectl -n kube-system logs -l k8s-app=flannel-migration-controller
kubectl get po -A -owide
# delete the migration job because the presence of a non-Running pod in kube-system upsets the e2es.

# Delete the migration job because the presence of a non-Running pod in
# kube-system upsets the e2es.
kubectl -n kube-system delete job/flannel-migration || true
kubectl -n kube-system delete po -l k8s-app=flannel-migration-controller || true

# Run e2e on uplevel calico
# Run e2e on uplevel calico.
./bz.sh tests:run |& tee >(gzip --stdout > "${BZ_LOGS_DIR}/e2e-tests.log")
157 changes: 34 additions & 123 deletions .semaphore/end-to-end/scripts/body_standard.sh
@@ -1,139 +1,50 @@
#!/usr/bin/env bash
# body_standard.sh - orchestrator for the standard e2e flow.
#
# Dispatches to phase scripts in scripts/phases/. Each phase is self-contained
# and documents its required env vars. See scripts/README.md for the phase
# model and guidance on running phases standalone.
set -eo pipefail

echo "[INFO] starting job..."
PHASES="$(dirname "$0")/phases"

if [[ "${BZ_VERBOSE}" == "true" ]]; then
VERBOSE="--verbose"
else
VERBOSE=""
fi
export VERBOSE

if [[ "${HCP_ENABLED}" == "true" ]]; then
echo "[INFO] starting hcp job..."

echo "[INFO] starting hcp provision..."
hcp-provision.sh |& tee ${BZ_LOGS_DIR}/provision.log

cache delete ${SEMAPHORE_JOB_ID}
cache store ${SEMAPHORE_JOB_ID} ${BZ_HOME}

echo "[INFO] Test logs will be available here after the run: ${SEMAPHORE_ORGANIZATION_URL}/artifacts/jobs/${SEMAPHORE_JOB_ID}?path=semaphore%2Flogs"
echo "[INFO] Alternatively, you can view logs while job is running using 'sem attach ${SEMAPHORE_JOB_ID}' and then 'tail -f ${BZ_LOGS_DIR}/${TEST_TYPE}-tests.log'"

echo "[INFO] starting hcp testing..."
hcp-test.sh |& tee ${BZ_LOGS_DIR}/${TEST_TYPE}-tests.log

else
echo "[INFO] starting job..."
echo "[INFO] BZ_HOME=${BZ_HOME}"

cd "${BZ_HOME}"
if [[ "${HCP_STAGE}" == "hosting" || "${HCP_STAGE}" == "destroy-hosting" ]]; then
: # Skip provisioning for hosting stages as cluster already exists
else
echo "[INFO] starting bz provision..."
bz provision $VERBOSE |& tee >(gzip --stdout > ${BZ_LOGS_DIR}/provision.log.gz)

cache delete $SEMAPHORE_JOB_ID
cache store ${SEMAPHORE_JOB_ID} ${BZ_HOME}

echo "[INFO] starting bz install..."
bz install $VERBOSE |& tee >(gzip --stdout > ${BZ_LOGS_DIR}/install.log.gz)

if [[ "${HCP_STAGE}" == "setup-hosting" ]]; then
echo "[INFO] HCP_STAGE=${HCP_STAGE}, storing hosting cluster profile in cache"
cache store ${SEMAPHORE_WORKFLOW_ID}-hosting-${HOSTING_CLUSTER} ${BZ_HOME}
fi
fi

# Put the bin dir into the PATH
export PATH=$PATH:${BZ_LOCAL_DIR}/bin

if [[ "${ENABLE_EXTERNAL_NODE}" == "true" ]]; then
export EXT_USER=ubuntu
EXT_IP=$(cat "${BZ_LOCAL_DIR}"/external_ip)
export EXT_IP
export EXT_KEY=${BZ_LOCAL_DIR}/external_key
export K8S_E2E_DOCKER_EXTRA_FLAGS="-v $EXT_KEY:/key --env EXT_USER --env EXT_KEY=/key --env EXT_IP $K8S_E2E_DOCKER_EXTRA_FLAGS"
echo "EXT_USER=ubuntu EXT_IP=$EXT_IP, EXT_KEY=$EXT_KEY"
echo "K8S_E2E_DOCKER_EXTRA_FLAGS=$K8S_E2E_DOCKER_EXTRA_FLAGS"
fi

if [ -n "${IPAM_TEST_POOL_SUBNET}" ]; then
export K8S_E2E_DOCKER_EXTRA_FLAGS="$K8S_E2E_DOCKER_EXTRA_FLAGS --env IPAM_TEST_POOL_SUBNET"
echo "IPAM_TEST_POOL_SUBNET=$IPAM_TEST_POOL_SUBNET"
fi
echo "[INFO] starting job..."
echo "[INFO] BZ_HOME=${BZ_HOME}"

if [ "${FAILSAFE_443}" == "true" ]; then
KUBECONFIG=${BZ_LOCAL_DIR}/kubeconfig kubectl patch felixconfiguration default --type=merge -p '{"spec":{"failsafeOutboundHostPorts": [{"protocol": "udp", "port":53},{"protocol": "udp", "port":67},{"protocol": "tcp", "port":179},{"protocol": "tcp", "port":2379},{"protocol": "tcp", "port":2380},{"protocol": "tcp", "port":5473},{"protocol": "tcp", "port":443},{"protocol": "tcp", "port":6666},{"protocol": "tcp", "port":6667}]}}'
fi
# HCP jobs take a separate path with their own provision/test tooling.
if [[ "${HCP_ENABLED}" == "true" ]]; then
source "${PHASES}/hcp.sh"
exit 0
fi

# Perform the operator migration following the instructions here:
# https://projectcalico.docs.tigera.io/maintenance/operator-migration
if [[ -n "$OPERATOR_MIGRATE" ]]; then
${HOME}/${SEMAPHORE_GIT_DIR}/.semaphore/end-to-end/scripts/test_scripts/operator_migrate.sh |& tee >(gzip --stdout > ${BZ_LOGS_DIR}/operator_migrate.log.gz)
fi
# Perform the AKS migration following the instructions here:
# https://docs.tigera.io/calico/latest/getting-started/kubernetes/managed-public-cloud/aks-migrate
if [[ -n "$DESIRED_POLICY" ]]; then
echo "[INFO] starting AKS migration..."
bz addons run aks-migrate:setup
fi
cd "${BZ_HOME}"

if [[ -n "$UPLEVEL_RELEASE_STREAM" ]]; then
echo "[INFO] starting bz upgrade..."
bz upgrade $VERBOSE | tee >(gzip --stdout > ${BZ_LOGS_DIR}/upgrade.log.gz)
fi
# HCP hosting/destroy-hosting stages join an existing cluster provisioned by a
# prior workflow step, so they skip provisioning and install entirely.
if [[ "${HCP_STAGE}" != "hosting" && "${HCP_STAGE}" != "destroy-hosting" ]]; then
source "${PHASES}/provision.sh"
source "${PHASES}/install.sh"
fi

if [[ ${MCM_STAGE:-} != *-mgmt* ]] && [[ ${HCP_STAGE:-} != *-hosting* ]]; then
echo "[INFO] Test logs will be available here after the run: ${SEMAPHORE_ORGANIZATION_URL}/artifacts/jobs/${SEMAPHORE_JOB_ID}?path=semaphore%2Flogs"
echo "[INFO] Alternatively, you can view logs while job is running using 'sem attach ${SEMAPHORE_JOB_ID}' and then 'tail -f ${BZ_LOGS_DIR}/${TEST_TYPE}-tests.log'"
source "${PHASES}/configure.sh"
source "${PHASES}/migrate.sh"

if [[ -n "$RUN_LOCAL_TESTS" ]]; then
echo "[INFO] starting e2e testing from local binary..."
pushd "${HOME}/calico"
make -C e2e build |& tee >(gzip --stdout > "${BZ_LOGS_DIR}/${TEST_TYPE}-build.log.gz")
GO_BUILD_VER=$(grep '^GO_BUILD_VER=' ./metadata.mk | cut -d= -f2)
# Disable shellcheck double quote validation for ${K8S_E2E_FLAGS} as this var can contain multiple args and should be word split
# Capture the exit code so that the JUnit copy below runs even when
# tests fail (set -e would otherwise bail out before the cp).
#shellcheck disable=SC2086
e2e_rc=0
docker run --rm --init --net=host \
-e LOCAL_USER_ID="$(id -u)" \
-e GOCACHE=/go-cache \
-e GOPATH=/go \
-e KUBECONFIG=/kubeconfig \
-e PRODUCT=calico \
-e CREATE_WINDOWS_NODES \
-e FUNCTIONAL_AREA \
-e INSTALLER \
-e PROVISIONER \
-e K8S_VERSION \
-e DATAPLANE \
-e ENCAPSULATION_TYPE \
-e WINDOWS_OS \
-e USE_VENDORED_CNI \
-v "$(pwd)":/go/src/github.com/projectcalico/calico:rw \
-v "$(pwd)"/.go-pkg-cache:/go-cache:rw \
-v "${BZ_LOCAL_DIR}/kubeconfig:/kubeconfig:ro" \
-w /go/src/github.com/projectcalico/calico \
"calico/go-build:${GO_BUILD_VER}" \
go run github.com/onsi/ginkgo/v2/ginkgo -procs="${E2E_PROCS:-4}" \
--junit-report=junit.xml --output-dir=report \
./e2e/bin/k8s/e2e.test -- ${K8S_E2E_FLAGS} \
|& tee "${BZ_LOGS_DIR}/${TEST_TYPE}-tests.log" || e2e_rc=$?
# MCM (Multi-Cluster Management) management stages and HCP hosting stages
# only provision infrastructure for other jobs to test against - they don't
# run tests themselves. These are enterprise-only flows; MCM_STAGE and
# HCP_STAGE are unset for OSS jobs.
if [[ ${MCM_STAGE:-} == *-mgmt* || ${HCP_STAGE:-} == *-hosting* ]]; then
exit 0
fi

# Copy JUnit XML to REPORT_DIR so the epilogue publishes it.
mkdir -p "${REPORT_DIR}"
cp report/junit.xml "${REPORT_DIR}/junit.xml" 2>/dev/null || true
popd
echo "[INFO] Test logs will be available here after the run: ${SEMAPHORE_ORGANIZATION_URL}/artifacts/jobs/${SEMAPHORE_JOB_ID}?path=semaphore%2Flogs"
echo "[INFO] Alternatively, you can view logs while job is running using 'sem attach ${SEMAPHORE_JOB_ID}' and then 'tail -f ${BZ_LOGS_DIR}/${TEST_TYPE}-tests.log'"

# Propagate the original test exit code.
exit $e2e_rc
else
echo "[INFO] starting bz testing..."
bz tests $VERBOSE |& tee >(gzip --stdout > ${BZ_LOGS_DIR}/${TEST_TYPE}-tests.log.gz)
fi
fi
fi
source "${PHASES}/run_tests.sh"
45 changes: 45 additions & 0 deletions .semaphore/end-to-end/scripts/phases/configure.sh
@@ -0,0 +1,45 @@
#!/usr/bin/env bash
# configure.sh - configure test environment after cluster install.
#
# Sets PATH to include the bz-provisioned bin dir, exports external-node
# credentials when ENABLE_EXTERNAL_NODE=true, propagates IPAM test config,
# and applies the optional failsafe patch.
#
# Required env:
# BZ_LOCAL_DIR
# Optional env:
# ENABLE_EXTERNAL_NODE, IPAM_TEST_POOL_SUBNET, FAILSAFE_443,
# K8S_E2E_DOCKER_EXTRA_FLAGS
#
# Exports consumed by later phases:
# PATH, EXT_USER, EXT_IP, EXT_KEY, K8S_E2E_DOCKER_EXTRA_FLAGS
#
# Sourced from body_*.sh.

if [[ -z "${BZ_LOCAL_DIR}" ]]; then echo "[ERROR] BZ_LOCAL_DIR is required but not set"; exit 1; fi

export PATH=$PATH:${BZ_LOCAL_DIR}/bin

if [[ "${ENABLE_EXTERNAL_NODE}" == "true" ]]; then
export EXT_USER=ubuntu
EXT_IP=$(cat "${BZ_LOCAL_DIR}/external_ip")
export EXT_IP
export EXT_KEY=${BZ_LOCAL_DIR}/external_key
export K8S_E2E_DOCKER_EXTRA_FLAGS="-v $EXT_KEY:/key --env EXT_USER --env EXT_KEY=/key --env EXT_IP $K8S_E2E_DOCKER_EXTRA_FLAGS"
echo "EXT_USER=ubuntu EXT_IP=$EXT_IP, EXT_KEY=$EXT_KEY"
echo "K8S_E2E_DOCKER_EXTRA_FLAGS=$K8S_E2E_DOCKER_EXTRA_FLAGS"
fi

if [ -n "${IPAM_TEST_POOL_SUBNET}" ]; then
export K8S_E2E_DOCKER_EXTRA_FLAGS="$K8S_E2E_DOCKER_EXTRA_FLAGS --env IPAM_TEST_POOL_SUBNET"
echo "IPAM_TEST_POOL_SUBNET=$IPAM_TEST_POOL_SUBNET"
fi

# Some pipelines (e.g., VPP) need port 443 added to the failsafe outbound rules
# so nodes can reach the kube-apiserver. This replaces the default failsafe list
# with one that includes 443. TODO: consider making 443 a default failsafe port
# so this patch isn't needed.
if [ "${FAILSAFE_443}" == "true" ]; then
if [ "${FAILSAFE_443}" == "true" ]; then
KUBECONFIG=${BZ_LOCAL_DIR}/kubeconfig kubectl patch felixconfiguration default --type=merge \
-p '{"spec":{"failsafeOutboundHostPorts": [{"protocol": "udp", "port":53},{"protocol": "udp", "port":67},{"protocol": "tcp", "port":179},{"protocol": "tcp", "port":2379},{"protocol": "tcp", "port":2380},{"protocol": "tcp", "port":5473},{"protocol": "tcp", "port":443},{"protocol": "tcp", "port":6666},{"protocol": "tcp", "port":6667}]}}'
fi