diff --git a/.eslintignore b/.eslintignore index 8b2fdf31d0f..d1f8b7cabfa 100644 --- a/.eslintignore +++ b/.eslintignore @@ -75,3 +75,4 @@ tsconfig.json **/tsconfig.json tsconfig.*.json **/tsconfig.*.json +node_modules_corrupted diff --git a/.github/workflows/branch-protection-drift.yml b/.github/workflows/branch-protection-drift.yml index 0cb7d6dd2fd..072f9dcb042 100644 --- a/.github/workflows/branch-protection-drift.yml +++ b/.github/workflows/branch-protection-drift.yml @@ -49,7 +49,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 - name: Determine Branch id: branch @@ -60,7 +60,7 @@ jobs: - name: Generate GitHub App token id: app-token - uses: actions/create-github-app-token@v2 + uses: actions/create-github-app-token@c1a285145b9d317df6acd8a5e62e5e4e9688a8f0 continue-on-error: true with: app-id: ${{ secrets.BRANCH_PROTECTION_APP_ID }} @@ -133,7 +133,7 @@ jobs: echo "extra_count=${EXTRA}" >> "$GITHUB_OUTPUT" - name: Upload Drift Report - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 if: always() with: name: branch-protection-drift-report @@ -144,7 +144,7 @@ jobs: - name: Upsert governance issue (non-PR only) if: github.event_name != 'pull_request' && steps.check.outputs.drift_detected == 'true' && github.event.inputs.dry_run != 'true' - uses: actions/github-script@v6 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea with: script: | const fs = require('fs'); diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5ffbf8a3306..fb00730da33 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,11 +33,11 @@ jobs: permissions: contents: read steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-tags: true fetch-depth: 0 - - uses: actions/setup-node@v4 + - uses: 
actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: node-version-file: .nvmrc - name: Verify no unauthorized merge conflict markers @@ -47,7 +47,7 @@ jobs: - name: Upload Conflict Marker Report if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 with: name: conflict-markers-report path: conflict-markers-report.json @@ -61,16 +61,16 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-depth: 0 fetch-tags: true - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Validate Jest & pnpm Configuration @@ -95,16 +95,16 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-tags: true fetch-depth: 0 # Need full history for changed files detection - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: cache: "pnpm" node-version-file: .nvmrc @@ -139,7 +139,7 @@ jobs: # Cache ESLint results - name: Cache ESLint - uses: actions/cache@v4 + uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 with: path: .eslintcache key: eslint-cache-${{ runner.os }}-${{ hashFiles('**/*.ts', '**/*.tsx', '**/*.js', '**/*.jsx') }} @@ -164,16 +164,16 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: 
fetch-depth: 0 fetch-tags: true - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: node-version-file: .nvmrc cache: "pnpm" @@ -194,7 +194,7 @@ jobs: # Cache TypeScript build info - name: Cache TypeScript - uses: actions/cache@v4 + uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 with: path: | **/tsconfig.tsbuildinfo @@ -221,16 +221,16 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-tags: true fetch-depth: 0 # Need for --onlyChanged - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: node-version-file: .nvmrc cache: "pnpm" @@ -262,7 +262,7 @@ jobs: - name: Generate Coverage Report if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 with: name: coverage-report path: server/coverage/ @@ -270,7 +270,7 @@ jobs: # Cache Jest - name: Cache Jest - uses: actions/cache@v4 + uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 with: path: | .jest-cache @@ -297,16 +297,16 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-depth: 0 fetch-tags: true - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: cache: "pnpm" 
node-version-file: .nvmrc @@ -334,12 +334,12 @@ jobs: contents: read security-events: write steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-tags: true fetch-depth: 0 - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Secret Scan (gitleaks - changed files only) @@ -414,16 +414,16 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-depth: 0 fetch-tags: true - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - name: Setup Node - uses: actions/setup-node@v4 + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: node-version-file: .nvmrc cache: "pnpm" @@ -435,7 +435,7 @@ jobs: run: bash scripts/test-soc-controls.sh soc-compliance-reports - name: Upload SOC compliance reports if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 with: name: soc-compliance-report if-no-files-found: ignore @@ -455,7 +455,7 @@ jobs: steps: - name: Clean orphaned worktrees run: rm -rf .worktrees - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-depth: 0 fetch-tags: true @@ -473,7 +473,7 @@ jobs: shell: bash - name: Upload artifacts if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 with: name: verification-artifacts path: artifacts/** @@ -554,7 +554,7 @@ jobs: - name: Store validation artifact if: github.event_name == 'pull_request' - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 with: name: ci-validation-${{ github.event.pull_request.head.sha || 
github.sha }} path: validation-success.txt diff --git a/.github/workflows/lint-gate.yml b/.github/workflows/lint-gate.yml index f013748f727..d7c00837333 100644 --- a/.github/workflows/lint-gate.yml +++ b/.github/workflows/lint-gate.yml @@ -76,6 +76,7 @@ jobs: --exclude="*.test.*" \ --exclude="*.spec.*" \ --exclude-dir="node_modules" \ + --exclude-dir="node_modules_corrupted" \ --exclude-dir="dist" \ --exclude-dir="build" \ --exclude-dir=".next" \ @@ -94,6 +95,7 @@ jobs: --exclude="*.test.*" \ --exclude="*.spec.*" \ --exclude-dir="node_modules" \ + --exclude-dir="node_modules_corrupted" \ -n . | head -20 exit 1 fi diff --git a/.github/workflows/merge-queue.yml b/.github/workflows/merge-queue.yml index e1b9c2656ea..3ab03ebadf0 100644 --- a/.github/workflows/merge-queue.yml +++ b/.github/workflows/merge-queue.yml @@ -13,25 +13,21 @@ jobs: name: Heavy Checks runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 -<<<<<<< HEAD + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 with: fetch-depth: 0 fetch-tags: true - - uses: pnpm/action-setup@v4 -======= - - uses: pnpm/action-setup@v3 ->>>>>>> pr-21884 + - uses: pnpm/action-setup@fe02b34f77f8bc703788d5817da081398fad5dd2 with: version: 9.15.4 - - uses: actions/setup-node@v4 + - uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 with: node-version: 24 cache: 'pnpm' - name: Check for PR Validation Artifact id: validate - uses: actions/download-artifact@v4 + uses: actions/download-artifact@c850b930e6ba138125429b7e5c93fc707cb86dac with: name: ci-validation-${{ github.event.merge_group.head_sha }} path: . 
diff --git a/.github/workflows/policy-drift.yml b/.github/workflows/policy-drift.yml index b44bc301bca..de75315a977 100644 --- a/.github/workflows/policy-drift.yml +++ b/.github/workflows/policy-drift.yml @@ -25,7 +25,7 @@ jobs: cache: 'npm' - name: Install dependencies - run: npm ci || npm install + run: pnpm install --frozen-lockfile - name: Build Policy Cards run: | diff --git a/.github/workflows/schema-change-check.yml b/.github/workflows/schema-change-check.yml index 39cd387636b..ea6b07c1b11 100644 --- a/.github/workflows/schema-change-check.yml +++ b/.github/workflows/schema-change-check.yml @@ -105,7 +105,7 @@ jobs: - name: Install deps run: | sudo apt-get update && sudo apt-get install -y jq postgresql-client - npm ci --prefix head/tools + pnpm install --frozen-lockfile --dir head/tools - name: Apply base migrations working-directory: base diff --git a/.github/workflows/workflow-lint.yml b/.github/workflows/workflow-lint.yml index 7673cc1579b..1cc2b23b513 100644 --- a/.github/workflows/workflow-lint.yml +++ b/.github/workflows/workflow-lint.yml @@ -7,6 +7,6 @@ jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 - name: Lint workflows run: echo "Linting workflows..." diff --git a/docs/research/agent-ecosystem-report.md b/docs/research/agent-ecosystem-report.md index 1bc326e4b58..6a7c1d8d6f2 100644 --- a/docs/research/agent-ecosystem-report.md +++ b/docs/research/agent-ecosystem-report.md @@ -2,11 +2,23 @@ ## Executive Summary -The AI agent ecosystem has matured significantly, moving past experimental phases into production-grade orchestration systems. Three prominent frameworks continue to dominate the multi-agent landscape in 2026: **LangGraph**, **CrewAI**, and **AutoGen**. Each addresses different operational paradigms, ranging from strict graph-based state machines to dynamic conversational workflows. 
+The AI agent ecosystem has matured significantly, moving past experimental phases into production-grade orchestration systems. Prominent frameworks continue to dominate the multi-agent landscape in 2026: **LangGraph**, **CrewAI**, **AutoGen**, and the emerging **OpenAI Agents SDK**. Each addresses different operational paradigms, ranging from strict graph-based state machines to dynamic conversational workflows. There is a notable industry trend towards the "Agent as a Tool" and handoff patterns, offering more modular, transparent, and auditable multi-agent collaboration. ## Framework Analysis & Capabilities -### 1. LangGraph (LangChain) +### 1. OpenAI Agents SDK (Swarm Evolution) + +The OpenAI Agents SDK represents a streamlined, native approach to multi-agent orchestration, heavily relying on the concepts of routines and handoffs without needing complex external framework dependencies. + +- **Core Paradigm:** Agent-as-a-Tool and Handoffs. +- **Key Capabilities:** + - **Native Integration:** Direct integration with OpenAI's APIs, leveraging the newest model capabilities seamlessly. + - **Tool Support:** Comprehensive support for custom Python functions, managed tools (e.g., Code Interpreter, WebSearch), and external MCP servers. + - **Handoff Mechanism:** Agents can seamlessly transfer control to specialized peer agents based on task requirements, treating other agents essentially as executable tools. + - **Strict LLM Orchestration:** Avoids heavy state-machine abstractions in favor of letting the LLM's tool-calling logic drive the orchestration flow directly. +- **Best Use Cases:** Systems requiring transparent, auditable collaboration with minimal orchestration boilerplate, leveraging specialized sub-agents. + +### 2. LangGraph (LangChain) LangGraph has solidified its position as the premier framework for complex, stateful, and deterministic orchestration. 
With the stable release of LangChain 1.0 and LangGraph 1.0, it excels in environments with strict auditability and high-reliability requirements. @@ -19,7 +31,7 @@ LangGraph has solidified its position as the premier framework for complex, stat - **Stability and Modernization:** Python 3.10+ requirement and simplified package structure for production-grade deployments. - **Best Use Cases:** Complex, conditional pipelines; production systems requiring compliance and strict audit trails. -### 2. CrewAI +### 3. CrewAI CrewAI focuses on simplifying the creation of multi-agent systems by leveraging intuitive human-like team metaphors. It offers the fastest path from prototype to functional multi-agent collaboration. @@ -31,7 +43,7 @@ CrewAI focuses on simplifying the creation of multi-agent systems by leveraging - **MCP Integration:** Native support for the Model Context Protocol (MCP), enabling deeper integration with external tools and resources. - **Best Use Cases:** Business workflows, research syndication, and task delegation where roles map neatly to human organizational structures. -### 3. AutoGen (Microsoft Agent Framework) +### 4. AutoGen (Microsoft Agent Framework) Backed by enterprise resources and now in version 0.4.0+, AutoGen excels in dynamic, conversational interactions and complex problem-solving where iterative refinement is required. @@ -45,9 +57,10 @@ Backed by enterprise resources and now in version 0.4.0+, AutoGen excels in dyna ## Industry Trends & Next Steps -- **Hybrid Architectures:** We are seeing an increase in production deployments combining frameworks (e.g., LangGraph for overall state orchestration, wrapping a CrewAI team for a specific research sub-task). +- **Hybrid Architectures:** We are seeing an increase in production deployments combining frameworks (e.g., LangGraph for overall state orchestration, wrapping a CrewAI team or an OpenAI Agents SDK routine for a specific research sub-task). 
+- **Agent as a Tool:** A massive shift towards the "Agent as a Tool" handoff pattern (popularized by the OpenAI Agents SDK) where central orchestrators treat specialized sub-agents simply as functional tool calls. - **Production Safety:** Error handling and robust fallback mechanisms ("safe nodes") are becoming standard requirements over sheer capability. **Recommendation:** Summit's internal orchestration and benchmarking must expand to cover these advanced topologies, specifically evaluating the overhead of coordination and the resilience of durable execution under load. -_Update:_ We have explicitly expanded our benchmarks to track State Recovery Success Rate (SRSR), Coordination Token Overhead (CTO), and Orchestration Latency Penalty (OLP). We have also created adapter layers for LangGraph, CrewAI, and AutoGen to support these metrics. +_Update:_ We have explicitly expanded our benchmarks to track State Recovery Success Rate (SRSR), Coordination Token Overhead (CTO), and Orchestration Latency Penalty (OLP). We have also created adapter layers for LangGraph, CrewAI, AutoGen, and the OpenAI Agents SDK to support these metrics. diff --git a/docs/research/agent-eval-insights.md b/docs/research/agent-eval-insights.md index 91f196d3ffb..3df6b034aee 100644 --- a/docs/research/agent-eval-insights.md +++ b/docs/research/agent-eval-insights.md @@ -2,7 +2,7 @@ ## Overview -Based on the latest developments in the agent ecosystem (LangGraph, CrewAI, AutoGen), Summit Bench must expand its evaluation dimensions to accurately measure production-grade multi-agent capabilities. The current benchmarks largely focus on single-agent reasoning and deterministic tool use. We must shift toward evaluating coordination, state resilience, and execution overhead in complex multi-agent topologies. 
+Based on the latest developments in the agent ecosystem (LangGraph, CrewAI, AutoGen, and OpenAI Agents SDK), Summit Bench must expand its evaluation dimensions to accurately measure production-grade multi-agent capabilities. The current benchmarks largely focus on single-agent reasoning and deterministic tool use. We must shift toward evaluating coordination, state resilience, and execution overhead in complex multi-agent topologies. ## Proposed Benchmark Expansions @@ -12,6 +12,7 @@ Based on the latest developments in the agent ecosystem (LangGraph, CrewAI, Auto - **High-Concurrency Orchestration:** Measuring the latency and throughput of the orchestration layer itself when managing hundreds or thousands of simultaneous agent interactions. - **Role-Based Delegation Efficiency:** Evaluating how accurately an orchestrator (like a CrewAI Manager) can divide a complex task, assign the correct sub-tasks to specialized agents based on their defined roles, and synthesize the results without hallucination. - **Dynamic Code Execution & Sandboxing:** Testing an agent's ability to iteratively write, safely execute, debug, and refine code in an isolated environment to solve a problem that cannot be addressed purely via static tool calls. +- **Agent-as-a-Tool Handoff Efficiency:** Evaluating the smoothness and token cost of an orchestrator dynamically passing complete contextual state to a specialized sub-agent and returning the synthesized result, a pattern native to the OpenAI Agents SDK. ### 2. Proposed Cases & Fixtures (Backlog) @@ -28,6 +29,10 @@ Based on the latest developments in the agent ecosystem (LangGraph, CrewAI, Auto - **Case: `iterative_script_debugging`** - **Description:** Provide a task requiring parsing a deliberately malformed proprietary binary format. - **Goal:** The agent must write a script, observe the execution failure (stack trace), and iterate on the code until the script successfully parses the file and extracts the expected string. 
+- **Case: `agent_as_tool_handoff`** + - **Description:** An overarching "Analyst" agent must delegate three distinct domain queries (Legal, Financial, Technical) to three different specialized agents via tool calls (handoffs). + - **Target Framework:** OpenAI Agents SDK and frameworks supporting "Agent as a Tool". + - **Goal:** Evaluate the context retention during the handoff boundaries and the CTO (Coordination Token Overhead) of the handoff mechanism compared to monolithic resolution. ### 3. Proposed Evaluation Metrics @@ -37,6 +42,6 @@ Based on the latest developments in the agent ecosystem (LangGraph, CrewAI, Auto ## Next Steps for the Summit Team -1. [x] **Framework Integration:** Implement adapter layers for the latest stable versions of LangGraph, CrewAI, and AutoGen within the `evaluation/adapters/` directory. +1. [x] **Framework Integration:** Implement adapter layers for the latest stable versions of LangGraph, CrewAI, AutoGen, and OpenAI Agents SDK within the `evaluation/adapters/` directory. 2. [x] **Dataset Generation:** Construct the golden fixtures for `concurrent_stress_test` and `mid_task_failure_recovery` in `GOLDEN/datasets/agent_orchestration/`. 3. [x] **Metric Implementation:** Add the `SRSR` and `CTO` scoring logic to `evaluation/scoring/agent_metrics.py`. 
diff --git a/policy/actions-allowlist.json b/policy/actions-allowlist.json index 6ef10defb42..55e648b7078 100644 --- a/policy/actions-allowlist.json +++ b/policy/actions-allowlist.json @@ -13,9 +13,14 @@ ".github/workflows/workflow-lint.yml" ], "actions": [ + "actions/cache", "actions/checkout", + "actions/create-github-app-token", + "actions/download-artifact", + "actions/github-script", "actions/setup-node", "actions/upload-artifact", + "gitleaks/gitleaks-action", "pnpm/action-setup" ] } diff --git a/scripts/compliance/generate_sbom_from_lockfile.ts b/scripts/compliance/generate_sbom_from_lockfile.ts index a5b6e9c1b0b..37177d0883f 100644 --- a/scripts/compliance/generate_sbom_from_lockfile.ts +++ b/scripts/compliance/generate_sbom_from_lockfile.ts @@ -25,7 +25,9 @@ interface SbomComponent { interface Sbom { id: string; - version: string; + bomFormat: string; + specVersion: string; + version: number; createdAt: string; generator: { name: string; version: string }; components: SbomComponent[]; @@ -111,7 +113,9 @@ function generateSbom(): Sbom { const sbom: Sbom = { id: `sbom_summit-${commitSha.slice(0, 8)}`, - version: '1.0.0', + bomFormat: 'CycloneDX', + specVersion: '1.4', + version: 1, createdAt: new Date().toISOString(), generator: { name: 'summit-sbom-generator',