diff --git a/.github/workflows/deploy-aws.yml b/.github/workflows/deploy-aws.yml index 81baf9e21b1..fe011664a8b 100644 --- a/.github/workflows/deploy-aws.yml +++ b/.github/workflows/deploy-aws.yml @@ -1,34 +1,54 @@ -name: Deploy to AWS +name: Deploy Release to AWS on: - push: - branches: ["main"] release: types: [published] + workflow_dispatch: + inputs: + image_tag: + description: 'Image tag to deploy (defaults to release tag or commit SHA)' + required: false + type: string + apply_infra: + description: 'Apply terraform before deploying workloads' + required: false + type: boolean + default: false concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true + group: deploy-aws-production + cancel-in-progress: false env: AWS_REGION: us-east-1 - ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.us-east-1.amazonaws.com EKS_CLUSTER: summit-prod-eks + K8S_NAMESPACE: default + AWS_ROLE_NAME: github-actions-deploy-role + ECR_REGISTRY: ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.us-east-1.amazonaws.com permissions: id-token: write contents: read jobs: - pre-deploy-gate: - uses: ./.github/workflows/gate.yml - with: - region: us-east-1 + preflight: + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + + - name: Validate release deployment prerequisites + run: | + set -euo pipefail + test -n "${{ secrets.AWS_ACCOUNT_ID }}" + test -f terraform/environments/prod/main.tf + test -f charts/universal-app/Chart.yaml + test -f scripts/verify-deployment.sh build-and-push: - needs: pre-deploy-gate + needs: preflight runs-on: ubuntu-22.04 strategy: + fail-fast: false matrix: service: [maestro, prov-ledger, policy-lac] include: @@ -48,125 +68,144 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Configure AWS Credentials + - name: Setup pnpm + uses: pnpm/action-setup@v4 + with: + version: 9.15.4 + + - name: Setup Node + uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'pnpm' + + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/github-actions-deploy-role + role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ env.AWS_ROLE_NAME }} aws-region: ${{ env.AWS_REGION }} - - name: Dependency Audit + - name: Dependency audit run: | - if [ "${{ matrix.service }}" == "maestro" ]; then - pip install safety && safety check + set -euo pipefail + if [ "${{ matrix.service }}" = "maestro" ]; then + python -m pip install --upgrade pip safety + safety check -r maestro/requirements.txt --full-report else + pnpm install --frozen-lockfile pnpm audit --audit-level=high fi - name: Login to Amazon ECR - id: login-ecr uses: aws-actions/amazon-ecr-login@v2 - - name: Security Scan (Trivy) - uses: aquasecurity/trivy-action@57a97c7e7821a5776cebc9bb87c984fa69cba8f1 # 0.35.0 - with: - scan-type: "fs" - scan-ref: "." - trivy-config: trivy.yaml - exit-code: "0" # Don't fail build yet, just report - ignore-unfixed: true - severity: "CRITICAL,HIGH" - - - name: Build and Push Docker Image + - name: Build and push image + if: ${{ github.event_name == 'release' || inputs.image_tag == '' }} env: ECR_REPOSITORY: summit/${{ matrix.service }} - IMAGE_TAG: ${{ github.sha }} + RELEASE_TAG: ${{ inputs.image_tag || github.event.release.tag_name || github.sha }} run: | + set -euo pipefail docker build \ - -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG \ - -f ${{ matrix.dockerfile }} \ - --build-arg SERVICE_PATH=${{ matrix.path }} \ - ${{ matrix.context }} - docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG + -t "$ECR_REGISTRY/$ECR_REPOSITORY:$RELEASE_TAG" \ + -f "${{ matrix.dockerfile }}" \ + --build-arg SERVICE_PATH="${{ matrix.path }}" \ + "${{ matrix.context }}" + docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$RELEASE_TAG" deploy-infra: - needs: [build-and-push, pre-deploy-gate] + needs: build-and-push runs-on: ubuntu-22.04 + if: inputs.apply_infra == true steps: - uses: actions/checkout@v4 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ env.AWS_ROLE_NAME }} + aws-region: ${{ env.AWS_REGION }} + - name: Setup Terraform uses: hashicorp/setup-terraform@v3 - - name: Terraform Apply + - name: Terraform apply + env: + TF_VAR_aws_region: ${{ env.AWS_REGION }} working-directory: terraform/environments/prod run: | - terraform init - terraform apply -auto-approve + set -euo pipefail + terraform init -input=false + terraform apply -auto-approve -input=false deploy-k8s: - needs: deploy-infra + needs: [build-and-push, deploy-infra] + if: ${{ always() && needs.build-and-push.result == 'success' && (needs.deploy-infra.result == 'success' || needs.deploy-infra.result == 'skipped') }} runs-on: ubuntu-22.04 + environment: production steps: - uses: actions/checkout@v4 - - name: Configure AWS Credentials + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v4 with: - role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/github-actions-deploy-role + role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_ID }}:role/${{ env.AWS_ROLE_NAME }} aws-region: ${{ env.AWS_REGION }} - - name: Capture Governance Evidence + - name: Setup Helm + uses: azure/setup-helm@v4 + + - name: Setup kubectl + uses: azure/setup-kubectl@v4 + + - name: Update kubeconfig + run: aws eks update-kubeconfig --name "$EKS_CLUSTER" --region "$AWS_REGION" + + - name: Capture governance evidence run: | + set -euo pipefail mkdir -p evidence-artifacts aws sts get-caller-identity > evidence-artifacts/caller-identity.json - # Attempt to capture role policy, ignore failure if permission denied - aws iam get-role --role-name github-actions-deploy-role --query 'Role.AssumeRolePolicyDocument' --output json > evidence-artifacts/trust-policy.json || echo "Could not fetch trust policy" > evidence-artifacts/trust-policy-error.txt + aws eks describe-cluster --name "$EKS_CLUSTER" --region "$AWS_REGION" > evidence-artifacts/cluster.json - - name: Upload Governance Evidence - uses: actions/upload-artifact@v4 - with: - name: governance-evidence-deploy-k8s - path: evidence-artifacts/ - - - name: Update Kubeconfig - run: aws eks update-kubeconfig --name ${{ env.EKS_CLUSTER }} - - - name: Deploy Manifests + - name: Deploy workloads + env: + RELEASE_TAG: ${{ inputs.image_tag || github.event.release.tag_name || github.sha }} run: | - # Deploy Maestro + set -euo pipefail helm upgrade --install maestro charts/universal-app \ - --set image.repository=$ECR_REGISTRY/summit/maestro \ - --set image.tag=${{ github.sha }} \ + --namespace "$K8S_NAMESPACE" \ + --set image.repository="$ECR_REGISTRY/summit/maestro" \ + --set image.tag="$RELEASE_TAG" \ --set service.targetPort=8001 \ - --set ingress.enabled=true \ - --set ingress.hosts[0].host=api.summit.internal \ - --set ingress.hosts[0].paths[0].path=/maestro \ - --set ingress.hosts[0].paths[0].pathType=Prefix \ - --namespace default + --set fullnameOverride=maestro - # Deploy Prov Ledger helm upgrade --install prov-ledger charts/universal-app \ - --set image.repository=$ECR_REGISTRY/summit/prov-ledger \ - --set image.tag=${{ github.sha }} \ + --namespace "$K8S_NAMESPACE" \ + --set image.repository="$ECR_REGISTRY/summit/prov-ledger" \ + --set image.tag="$RELEASE_TAG" \ --set service.targetPort=4010 \ - --namespace default + --set fullnameOverride=prov-ledger - # Deploy Policy LAC helm upgrade --install policy-lac charts/universal-app \ - --set image.repository=$ECR_REGISTRY/summit/policy-lac \ - --set image.tag=${{ github.sha }} \ + --namespace "$K8S_NAMESPACE" \ + --set image.repository="$ECR_REGISTRY/summit/policy-lac" \ + --set image.tag="$RELEASE_TAG" \ --set service.targetPort=4000 \ - --namespace default + --set fullnameOverride=policy-lac - - name: Post-Deployment Smoke Test + - name: Verify rollouts and smoke check run: | - # Wait for rollout - kubectl rollout status deployment/maestro --timeout=118s - - # Run the project's internal smoke test script - # We use kubectl exec to run it from inside a pod or curl the ingress - echo "Running Health Check..." + set -euo pipefail + kubectl rollout status deployment/maestro --namespace "$K8S_NAMESPACE" --timeout=180s + kubectl rollout status deployment/prov-ledger --namespace "$K8S_NAMESPACE" --timeout=180s + kubectl rollout status deployment/policy-lac --namespace "$K8S_NAMESPACE" --timeout=180s ./scripts/verify-deployment.sh - # Optional: Run app-level functional smoke tests - # pnpm run test:smoke + - name: Upload governance evidence + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: aws-deploy-evidence-${{ github.run_id }}-attempt-${{ github.run_attempt }} + path: evidence-artifacts/ + retention-days: 30 diff --git a/docker-compose.dev.yaml b/docker-compose.dev.yaml index 939ce9e91b0..2ea1a167763 100644 --- a/docker-compose.dev.yaml +++ b/docker-compose.dev.yaml @@ -1,5 +1,3 @@ -version: '3.9' - services: postgres: image: postgres:16-alpine @@ -53,7 +51,7 @@ services: - neo4j_data:/data - neo4j_logs:/logs healthcheck: - test: [ 'CMD-SHELL', 'cypher-shell -u ${NEO4J_USERNAME:-neo4j} -p ${NEO4J_PASSWORD} "RETURN 1"' ] + test: [ 'CMD-SHELL', 'cypher-shell -u ${NEO4J_USERNAME:-neo4j} -p ${NEO4J_PASSWORD:-summit_dev_pw} "RETURN 1"' ] interval: 15s timeout: 10s retries: 10 @@ -128,7 +126,7 @@ services: OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4318 OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: http://otel-collector:4318/v1/metrics ports: - - '4000:4000' + - '4001:4000' depends_on: postgres: condition: service_healthy @@ -147,6 +145,32 @@ services: - summit labels: - 'prometheus.job=summit_api' + - 'prometheus.port=4001' + + api-gateway: + build: + context: . + dockerfile: services/api-gateway/Dockerfile + container_name: summit-api-gateway + restart: unless-stopped + environment: + - PORT=4000 + - NODE_ENV=development + - GRAPH_SERVICE_URL=http://server:4000 + ports: + - '4000:4000' + depends_on: + server: + condition: service_healthy + networks: + - summit + healthcheck: + test: ['CMD', 'curl', '-f', 'http://localhost:4000/health/live'] + interval: 15s + timeout: 5s + retries: 10 + labels: + - 'prometheus.job=summit_api_gateway' - 'prometheus.port=4000' nginx: @@ -155,31 +179,36 @@ services: restart: unless-stopped ports: - '4100:4100' - - '9464:9464' # Prometheus port for Nginx metrics if exposed + - '9464:9464' volumes: - ./services/nginx/nginx.conf:/etc/nginx/nginx.conf:ro depends_on: server: condition: service_healthy - # Update dependencies if they were tied to the old gateway service - # Assuming 'api' might have been a placeholder dependency or also proxied - # policy-compiler: - # condition: service_healthy - # typesense: - # condition: service_healthy networks: - summit labels: - 'prometheus.job=summit_nginx_gateway' - 'prometheus.port=4100' - - 'prometheus.path=/health' # Assuming Nginx exposes /health for healthcheck + - 'prometheus.path=/health' prov-ledger: - build: ./services/prov-ledger + build: + context: . + dockerfile: services/prov-ledger/Dockerfile ports: [ "4010:4010" ] + healthcheck: + test: [ 'CMD', 'curl', '-f', 'http://localhost:4010/health' ] + interval: 15s + timeout: 5s + retries: 10 + networks: + - summit policy-compiler: - build: ./services/policy-compiler + build: + context: . + dockerfile: services/policy-compiler/Dockerfile ports: [ "8102:8080" ] healthcheck: test: [ "CMD", "curl", "-f", "http://localhost:8080/health" ] @@ -190,43 +219,81 @@ services: - summit ai-nlq: - build: ./services/ai-nlq + build: + context: . + dockerfile: services/ai-nlq/Dockerfile ports: [ "8103:8080" ] + healthcheck: + test: [ 'CMD', 'curl', '-f', 'http://localhost:8080/health' ] + interval: 15s + timeout: 5s + retries: 10 + networks: + - summit er-service: - build: ./services/er-service + build: + context: . + dockerfile: services/er-service/Dockerfile ports: [ "8104:8080" ] + healthcheck: + test: [ 'CMD', 'curl', '-f', 'http://localhost:8080/health' ] + interval: 15s + timeout: 5s + retries: 10 + networks: + - summit ingest: - build: ./services/ingest + build: + context: . + dockerfile: services/ingest/Dockerfile ports: [ "8105:8080" ] + healthcheck: + test: [ 'CMD', 'curl', '-f', 'http://localhost:8080/health' ] + interval: 15s + timeout: 5s + retries: 10 + networks: + - summit zk-tx: - build: ./services/zk-tx + build: + context: . + dockerfile: services/zk-tx/Dockerfile ports: [ "8106:8080" ] + healthcheck: + test: [ 'CMD', 'curl', '-f', 'http://localhost:8080/health' ] + interval: 15s + timeout: 5s + retries: 10 + networks: + - summit predictd: build: context: . dockerfile: services/predictd/Dockerfile + networks: + - summit slo-exporter: build: - context: ./apps/slo-exporter - dockerfile: Dockerfile + context: . + dockerfile: apps/slo-exporter/Dockerfile container_name: summit-slo-exporter restart: unless-stopped environment: PORT: 9092 PROMETHEUS_URL: http://prometheus:9090 - API_URL: http://api:4000 + API_URL: http://server:4000 PREDICTD_PORT: 4001 ports: - '9092:9092' depends_on: prometheus: condition: service_started - api: + server: condition: service_healthy networks: - summit @@ -263,6 +330,8 @@ services: - "/policies" volumes: - ./services/opa/policies:/policies + networks: + - summit web: build: @@ -273,14 +342,14 @@ services: env_file: - ${DEV_ENV_FILE:-.env} environment: - VITE_API_URL: http://server:4000/graphql - VITE_WS_URL: ws://server:4000/graphql + VITE_API_URL: http://api-gateway:4000/graphql + VITE_WS_URL: ws://api-gateway:4000/graphql VITE_WEBSOCKET_URL: ws://websocket-server:9001 VITE_PORT: 3000 ports: - '3000:3000' depends_on: - server: + api-gateway: condition: service_healthy healthcheck: test: [ 'CMD', 'curl', '-f', 'http://localhost:3000' ] @@ -292,8 +361,8 @@ services: websocket-server: build: - context: ./services/websocket-server - dockerfile: Dockerfile + context: . + dockerfile: services/websocket-server/Dockerfile command: node dist/index.js container_name: summit-websocket-server restart: unless-stopped @@ -452,8 +521,8 @@ services: ai-sandbox: build: - context: ./services/ai-sandbox - dockerfile: Dockerfile + context: . + dockerfile: services/ai-sandbox/Dockerfile command: node dist/index.js container_name: summit-ai-sandbox restart: unless-stopped @@ -483,8 +552,8 @@ services: agentic-mesh-evaluation: build: - context: ./services/agentic-mesh-evaluation - dockerfile: Dockerfile + context: . + dockerfile: services/agentic-mesh-evaluation/Dockerfile container_name: summit-agentic-mesh-evaluation restart: unless-stopped environment: @@ -522,6 +591,11 @@ services: OTEL_SERVICE_NAME: summit-ai OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: http://otel-collector:4318/v1/traces OTEL_EXPORTER_OTLP_METRICS_ENDPOINT: http://otel-collector:4318/v1/metrics + healthcheck: + test: [ 'CMD', 'curl', '-f', 'http://localhost:8000/health' ] + interval: 15s + timeout: 5s + retries: 10 networks: - summit labels: