diff --git a/.github/workflows/deploy-network.yml b/.github/workflows/deploy-network.yml index d25e6a76d04c..5004d532929f 100644 --- a/.github/workflows/deploy-network.yml +++ b/.github/workflows/deploy-network.yml @@ -38,6 +38,11 @@ on: description: "Source tag that triggered this deploy" required: false type: string + notify_on_failure: + description: "Whether this workflow should send its own failure notification" + required: false + type: boolean + default: true workflow_dispatch: inputs: network: @@ -74,6 +79,11 @@ on: description: "Source tag that triggered this deploy" required: false type: string + notify_on_failure: + description: "Whether this workflow should send its own failure notification" + required: false + type: boolean + default: true concurrency: group: deploy-network-${{ inputs.network }}-${{ inputs.namespace || inputs.network }}-${{ inputs.aztec_docker_image || inputs.semver }}-${{ github.ref || github.ref_name }} @@ -242,7 +252,7 @@ jobs: } >> "$GITHUB_STEP_SUMMARY" - name: Notify Slack and dispatch ClaudeBox on failure - if: failure() + if: failure() && inputs.notify_on_failure env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GH_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} diff --git a/.github/workflows/weekly-proving-bench.yml b/.github/workflows/weekly-proving-bench.yml index b8b383a23022..654a38c610a5 100644 --- a/.github/workflows/weekly-proving-bench.yml +++ b/.github/workflows/weekly-proving-bench.yml @@ -2,7 +2,7 @@ name: Weekly Real Proving Benchmark on: schedule: - - cron: "0 6 * * 1" # Every Monday at 6 AM UTC + - cron: "0 6 * * 1" # Every Monday at 6 AM UTC workflow_dispatch: inputs: nightly_tag: @@ -15,8 +15,11 @@ concurrency: cancel-in-progress: true jobs: - real-proving-benchmark: + select-image: runs-on: ubuntu-latest + outputs: + nightly_tag: ${{ steps.nightly-tag.outputs.nightly_tag }} + docker_image: ${{ steps.nightly-tag.outputs.docker_image }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd @@ -32,12 +35,15 @@ jobs: current_version=$(jq -r '."."' .release-please-manifest.json) nightly_tag="${current_version}-nightly.$(date -u +%Y%m%d)" fi - echo "nightly_tag=$nightly_tag" >> $GITHUB_OUTPUT + + docker_image="aztecprotocol/aztec:${nightly_tag}" + echo "nightly_tag=$nightly_tag" >> "$GITHUB_OUTPUT" + echo "docker_image=$docker_image" >> "$GITHUB_OUTPUT" echo "Using nightly tag: $nightly_tag" - name: Check if Docker image exists run: | - DOCKER_IMAGE="aztecprotocol/aztec:${{ steps.nightly-tag.outputs.nightly_tag }}" + DOCKER_IMAGE="${{ steps.nightly-tag.outputs.docker_image }}" echo "Checking if Docker image exists: $DOCKER_IMAGE" if docker manifest inspect "$DOCKER_IMAGE" > /dev/null 2>&1; then echo "Docker image exists: $DOCKER_IMAGE" @@ -46,6 +52,53 @@ jobs: exit 1 fi + deploy-real-proving-network: + needs: select-image + uses: ./.github/workflows/deploy-network.yml + with: + network: prove-n-tps-real + namespace: prove-n-tps-real + aztec_docker_image: ${{ needs.select-image.outputs.docker_image }} + ref: next + notify_on_failure: false + secrets: inherit + + wait-for-first-l2-block: + needs: deploy-real-proving-network + runs-on: ubuntu-latest + timeout-minutes: 120 + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + ref: next + + - name: Authenticate to Google Cloud + uses: google-github-actions/auth@6fc4af4b145ae7821d527454aa9bd537d1f2dc5f + with: + credentials_json: ${{ secrets.GCP_SA_KEY }} + + - name: Set up Cloud SDK + uses: google-github-actions/setup-gcloud@6189d56e4096ee891640bb02ac264be376592d6a + with: + install_components: gke-gcloud-auth-plugin + + - name: Wait for first L2 block + env: + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + run: | + cd spartan + ./bootstrap.sh wait_for_l2_block prove-n-tps-real + + benchmark: + needs: wait-for-first-l2-block + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + ref: next + - name: Run real proving benchmarks timeout-minutes: 180 env: @@ -59,20 +112,9 @@ jobs: RUN_ID: ${{ github.run_id }} AWS_SHUTDOWN_TIME: 180 NO_SPOT: 1 + SKIP_NETWORK_DEPLOY: "1" run: | - ./.github/ci3.sh network-proving-bench prove-n-tps-real prove-n-tps-real "aztecprotocol/aztec:${{ steps.nightly-tag.outputs.nightly_tag }}" - - - name: Cleanup network resources - if: always() - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - GITHUB_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} - BUILD_INSTANCE_SSH_KEY: ${{ secrets.BUILD_INSTANCE_SSH_KEY }} - GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }} - GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} - NO_SPOT: 1 - run: ./.github/ci3.sh network-teardown prove-n-tps-real prove-n-tps-real + ./.github/ci3.sh network-proving-bench prove-n-tps-real prove-n-tps-real - name: Download benchmarks if: always() @@ -81,7 +123,7 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} run: | if ./ci.sh gh-spartan-proving-bench; then - echo "ENABLE_DEPLOY_BENCH=1" >> $GITHUB_ENV + echo "ENABLE_DEPLOY_BENCH=1" >> "$GITHUB_ENV" fi - name: Upload benchmarks @@ -100,13 +142,52 @@ jobs: fail-on-alert: false max-items-in-chart: 100 + cleanup: + if: always() + needs: + - select-image + - deploy-real-proving-network + - wait-for-first-l2-block + - benchmark + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + ref: next + + - name: Cleanup network resources + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + GITHUB_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} + BUILD_INSTANCE_SSH_KEY: ${{ secrets.BUILD_INSTANCE_SSH_KEY }} + GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }} + GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }} + NO_SPOT: 1 + run: ./.github/ci3.sh network-teardown prove-n-tps-real prove-n-tps-real + + notify-failure: + if: ${{ always() && failure() && github.event_name != 'workflow_dispatch' }} + needs: + - select-image + - deploy-real-proving-network + - wait-for-first-l2-block + - benchmark + - cleanup + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + with: + ref: next + - name: Notify Slack and dispatch ClaudeBox on failure - if: failure() && github.event_name != 'workflow_dispatch' env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} GITHUB_TOKEN: ${{ secrets.AZTEC_BOT_GITHUB_TOKEN }} run: | - TAG="${{ steps.nightly-tag.outputs.nightly_tag }}" + TAG="${{ needs.select-image.outputs.nightly_tag || 'unknown' }}" RUN_URL="https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" ./ci3/slack_notify_with_claudebox_kickoff "#alerts-next-scenario" \ "Weekly Real Proving Benchmark FAILED (nightly tag ${TAG}): <${RUN_URL}|View Run> (🤖)" \ diff --git a/bootstrap.sh b/bootstrap.sh index 6b9c3f74b058..fbf32d2a2c2f 100755 --- a/bootstrap.sh +++ b/bootstrap.sh @@ -740,22 +740,24 @@ case "$cmd" in ;; "ci-network-proving-bench") # Args: [docker_image] - # Deploys network and runs proving benchmarks. Cleanup should be done separately. + # Deploys network and runs proving benchmarks. Set SKIP_NETWORK_DEPLOY=1 to run against an existing network. export CI=1 env_file="${1:?env_file is required}" namespace="${2:?namespace is required}" docker_image="${3:-}" build - # If no docker image provided, build and push to aztecdev - if [ -z "$docker_image" ]; then - release-image/bootstrap.sh push_pr - docker_image="aztecprotocol/aztecdev:$(git rev-parse HEAD)" - fi - # Set up environment and deploy using spartan export NAMESPACE="$namespace" - export AZTEC_DOCKER_IMAGE="$docker_image" - spartan/bootstrap.sh network_deploy "${env_file}" - # Run proving benchmarks + if [ "${SKIP_NETWORK_DEPLOY:-0}" != "1" ]; then + # If no docker image provided, build and push to aztecdev + if [ -z "$docker_image" ]; then + release-image/bootstrap.sh push_pr + docker_image="aztecprotocol/aztecdev:$(git rev-parse HEAD)" + fi + export AZTEC_DOCKER_IMAGE="$docker_image" + spartan/bootstrap.sh network_deploy "${env_file}" + else + echo "SKIP_NETWORK_DEPLOY=1, running proving benchmarks against existing network '$namespace'." + fi spartan/bootstrap.sh proving_bench "${env_file}" rm -rf bench-out mkdir -p bench-out diff --git a/ci.sh b/ci.sh index 0a48e1b99ee1..4c6ed6765b94 100755 --- a/ci.sh +++ b/ci.sh @@ -31,6 +31,7 @@ function print_usage { echo_cmd "network-scenarios" "Spin up EC2 instances to run network scenario tests in parallel." echo_cmd "network-tests" "Spin up an EC2 instance to run tests on a network." echo_cmd "network-bench" "Spin up an EC2 instance to run benchmarks on a network." + echo_cmd "network-proving-bench" "Spin up an EC2 instance to deploy a network and run proving benchmarks. Set SKIP_NETWORK_DEPLOY=1 to skip deploy." echo_cmd "network-bench-10tps" "Spin up an EC2 instance to run the 10 TPS benchmark on bench-10tps." echo_cmd "network-teardown" "Spin up an EC2 instance to teardown a network deployment." echo_cmd "network-tests-kind" "Spin up an EC2 instance to run a KIND-based spartan test." @@ -253,11 +254,13 @@ case "$cmd" in ;; network-proving-bench) # Args: [docker_image] - # Deploys network and runs proving benchmarks. + # Deploys network and runs proving benchmarks. Set SKIP_NETWORK_DEPLOY=1 to run against an existing network. export CI_DASHBOARD="network" export JOB_ID="x-${2:?namespace is required}-network-proving-bench" CPUS=16 export INSTANCE_POSTFIX="n-proving-bench" - bootstrap_ec2 "./bootstrap.sh ci-network-proving-bench $*" + skip_network_deploy=0 + [ "${SKIP_NETWORK_DEPLOY:-0}" = "1" ] && skip_network_deploy=1 + bootstrap_ec2 "SKIP_NETWORK_DEPLOY=$skip_network_deploy ./bootstrap.sh ci-network-proving-bench $*" ;; network-block-capacity-bench) # Args: [docker_image] diff --git a/spartan/bootstrap.sh b/spartan/bootstrap.sh index 15d908e4409b..c5932697e1a6 100755 --- a/spartan/bootstrap.sh +++ b/spartan/bootstrap.sh @@ -399,6 +399,13 @@ case "$cmd" in fi fi ;; + "wait_for_l2_block") + env_file="$1" + source_env_basic "$env_file" + gcp_auth + source_network_env "$env_file" + ./scripts/wait_for_l2_block.sh "$NAMESPACE" + ;; "single_test") run_network_tests "$1" "$2" ;; diff --git a/spartan/scripts/wait_for_l2_block.sh b/spartan/scripts/wait_for_l2_block.sh index f3b9278c4177..19f238234111 100755 --- a/spartan/scripts/wait_for_l2_block.sh +++ b/spartan/scripts/wait_for_l2_block.sh @@ -6,6 +6,7 @@ # AZTEC_SLOT_DURATION - seconds per L2 slot # AZTEC_EPOCH_DURATION - slots per epoch # AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET - epochs to wait for validator set +# AZTEC_LAG_IN_EPOCHS_FOR_RANDAO - epochs to wait for RANDAO seed set -euo pipefail @@ -13,25 +14,36 @@ namespace="${1:?namespace is required}" slot_duration="${AZTEC_SLOT_DURATION:?AZTEC_SLOT_DURATION must be set}" epoch_duration="${AZTEC_EPOCH_DURATION:?AZTEC_EPOCH_DURATION must be set}" -lag_epochs="${AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET:?AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET must be set}" +validator_lag_epochs="${AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET:?AZTEC_LAG_IN_EPOCHS_FOR_VALIDATOR_SET must be set}" +randao_lag_epochs="${AZTEC_LAG_IN_EPOCHS_FOR_RANDAO:-$validator_lag_epochs}" -# Time to first block = lag_epochs * epoch_duration * slot_duration + buffer -# Add 2x buffer for deployment overhead, validator registration, etc. -expected_wait=$((lag_epochs * epoch_duration * slot_duration)) -max_wait=$((expected_wait * 2 + 120)) # 2x expected + 2min buffer +if [ "$validator_lag_epochs" -gt "$randao_lag_epochs" ]; then + lag_epochs="$validator_lag_epochs" +else + lag_epochs="$randao_lag_epochs" +fi + +# A fresh rollup needs lag + 1 complete epochs before the first committee-backed +# block can be proposed. Add half an epoch plus 5m for deployment and RPC jitter. +warmup_epochs=$((lag_epochs + 1)) +expected_wait=$((warmup_epochs * epoch_duration * slot_duration)) +buffer=$((epoch_duration * slot_duration / 2 + 300)) +max_wait="${L2_BLOCK_WAIT_TIMEOUT_SECONDS:-$((expected_wait + buffer))}" poll_interval=10 -echo "Waiting for L2 blocks (slot=${slot_duration}s, epoch=${epoch_duration} slots, lag=${lag_epochs} epochs)" -echo "Expected first block in ~${expected_wait}s, max wait ${max_wait}s" +echo "Waiting for L2 blocks (slot=${slot_duration}s, epoch=${epoch_duration} slots, validator_lag=${validator_lag_epochs}, randao_lag=${randao_lag_epochs})" +echo "Expected first block after ~${expected_wait}s from genesis, max wait ${max_wait}s from now" rpc_pod="${namespace}-rpc-aztec-node-0" +block_number_request="{\"jsonrpc\":\"2.0\",\"method\":\"node_getBlockNumber\",\"params\":[],\"id\":1}" elapsed=0 while [ $elapsed -lt $max_wait ]; do - block_number=$(kubectl exec -n "$namespace" "$rpc_pod" -- \ - curl -s -X POST http://localhost:8080 \ - -H "Content-Type: application/json" \ - -d '{"jsonrpc":"2.0","method":"node_getBlockNumber","params":[],"id":1}' 2>/dev/null \ - | grep -o '"result":[0-9]*' | grep -o '[0-9]*' || echo "0") + block_number=$(kubectl --request-timeout=10s exec -n "$namespace" "$rpc_pod" -- \ + sh -c "curl --max-time 5 -s -X POST http://localhost:8080 \ + -H \"Content-Type: application/json\" \ + -d \"\$1\" \ + | jq -r \".result // 0\"" \ + sh "$block_number_request" 2>/dev/null || echo "0") if [ "$block_number" -ge 1 ] 2>/dev/null; then echo "L2 block $block_number mined after ${elapsed}s" @@ -39,8 +51,16 @@ while [ $elapsed -lt $max_wait ]; do fi echo "Waiting for L2 blocks... (${elapsed}s/${max_wait}s, block: ${block_number:-0})" - sleep $poll_interval - elapsed=$((elapsed + poll_interval)) + sleep_for=$poll_interval + remaining=$((max_wait - elapsed)) + if [ "$remaining" -lt "$sleep_for" ]; then + sleep_for=$remaining + fi + if [ "$sleep_for" -le 0 ]; then + break + fi + sleep "$sleep_for" + elapsed=$((elapsed + sleep_for)) done echo "Warning: No L2 blocks mined after ${max_wait}s"