diff --git a/tests/scripts/codex-desc-avg-budget.bats b/tests/scripts/codex-desc-avg-budget.bats
new file mode 100644
index 000000000..0b0c712df
--- /dev/null
+++ b/tests/scripts/codex-desc-avg-budget.bats
@@ -0,0 +1,51 @@
+#!/usr/bin/env bats
+# ag-vzbt: the codex-description catalog budget must be a per-skill AVERAGE (scales
+# with skill count) instead of a hard aggregate that walls off the Nth+ skill.
+# Fixture-driven via BUDGET_REPO_ROOT. We assert on the codex-catalog output line
+# (robust to unrelated checks in the gate).
+
+# Build a throwaway repo root ($FIX) holding an empty skills/ tree and a
+# skills-codex/ tree that each test populates through mk.
+setup() {
+  GATE="$BATS_TEST_DIRNAME/../../tests/skills/test-token-budgets.sh"
+  FIX="$(mktemp -d)"
+  mkdir -p "$FIX/skills" "$FIX/skills-codex"
+  # mk <name> <description>: write skills-codex/<name>/SKILL.md with that
+  # name/description front matter.
+  mk() {
+    mkdir -p "$FIX/skills-codex/$1"
+    printf -- '---\nname: %s\ndescription: %s\n---\n# %s\n' "$1" "$2" "$1" > "$FIX/skills-codex/$1/SKILL.md"
+  }
+  export -f mk
+}
+
+# Remove the per-test fixture tree.
+teardown() { rm -rf -- "$FIX"; }
+
+@test "codex catalog PASSES when average description length is under budget" {
+  mk a "short terse codex description here"     # 34 chars
+  mk b "another short terse codex description"  # 37 chars
+  run env BUDGET_REPO_ROOT="$FIX" bash "$GATE"
+  echo "$output"
+  [[ "$output" == *"skills-codex description catalog"* ]]
+  echo "$output" | grep "skills-codex description catalog" | grep -q "PASS"
+}
+
+@test "codex catalog FAILS when average description length is over budget" {
+  # Each ~120 chars → avg ~120 >> 45 per-skill-avg cap, but still < 180 per-skill hard cap.
+  long="this is a deliberately verbose codex description that pushes the per skill average well over the configured budget ceiling"
+  mk a "$long"
+  mk b "$long"
+  run env BUDGET_REPO_ROOT="$FIX" bash "$GATE"
+  echo "$output"
+  echo "$output" | grep "skills-codex description catalog" | grep -q "FAIL"
+}
+
+@test "budget scales: 100 short-desc skills pass even though total > old 2800 wall" {
+  # 100 x ~38 chars = ~3800 total (> the old 2800 hard aggregate) but avg ~38 < 45.
+  # Passes ONLY under the per-skill-average rule — proves the wall is gone.
+  for ((i = 1; i <= 100; i++)); do mk "skill$i" "terse codex description number $i here"; done
+  run env BUDGET_REPO_ROOT="$FIX" bash "$GATE"
+  echo "$output"
+  echo "$output" | grep "skills-codex description catalog" | grep -q "PASS"
+}
diff --git a/tests/skills/test-token-budgets.sh b/tests/skills/test-token-budgets.sh
index a4a5593e4..cc4daf634 100755
--- a/tests/skills/test-token-budgets.sh
+++ b/tests/skills/test-token-budgets.sh
@@ -12,7 +12,9 @@
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+# BUDGET_REPO_ROOT overrides for fixture tests
+# (tests/scripts/codex-desc-avg-budget.bats); production derives from script location.
+REPO_ROOT="${BUDGET_REPO_ROOT:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
 SKILL_ROOTS=("$REPO_ROOT/skills" "$REPO_ROOT/skills-codex")
 
 # Colors
@@ -27,11 +29,13 @@ SKILL_FAIL_LIMIT=10000
 SKILL_WARN_LIMIT=8000
 SESSION_FAIL_LIMIT=8000
 DESC_FAIL_CHARS=180
-# Always-loaded codex skill catalog. Sized to fit the catalog with modest
-# headroom: ~81 skills x ~34 avg description chars. Raised from 2600 (calibrated
-# at exactly 77 skills with zero slack) to 2700 when expert-council landed, then
-# to 2800 when using-gc landed (the catalog grew to 81 skills, ag-p4p).
-CODEX_DESC_TOTAL_FAIL_CHARS=2800
+# Always-loaded codex skill catalog. The budget is a PER-SKILL AVERAGE, not a hard
+# aggregate (ag-vzbt): a hard total (raised 2600→2700→2800 as skills landed) walls
+# off the Nth+ skill and forced /burndown into a 17-char stub. An average scales
+# with the catalog — each terse description keeps the avg low; the gate fails only
+# if descriptions are bloated on average. Current avg ~35; cap 45 = comfortable
+# headroom while preserving the terse-description discipline.
+CODEX_DESC_AVG_FAIL_CHARS=45
 
 # Token estimation: bytes / 4
 estimate_tokens() {
@@ -195,12 +199,15 @@ fi
 
 if [[ "$codex_desc_count" -gt 0 ]]; then
     codex_desc_avg=$((codex_desc_total / codex_desc_count))
-    if [[ "$codex_desc_total" -gt "$CODEX_DESC_TOTAL_FAIL_CHARS" ]]; then
-        echo -e " ${RED}[FAIL]${NC} skills-codex description catalog: ${codex_desc_total} chars > ${CODEX_DESC_TOTAL_FAIL_CHARS} aggregate limit"
+    # Compare totals (total > cap * count) rather than the floored integer average,
+    # so a true average such as 45.9 cannot slip under a cap of 45.
+    codex_desc_max_total=$((CODEX_DESC_AVG_FAIL_CHARS * codex_desc_count))
+    if [[ "$codex_desc_total" -gt "$codex_desc_max_total" ]]; then
+        echo -e " ${RED}[FAIL]${NC} skills-codex description catalog: ${codex_desc_total} chars over ${codex_desc_count} skills (avg ${codex_desc_avg}) > ${codex_desc_max_total} allowed at ${CODEX_DESC_AVG_FAIL_CHARS} per-skill-avg limit"
         ((failed++)) || true
     else
-        pct=$((codex_desc_total * 100 / CODEX_DESC_TOTAL_FAIL_CHARS))
-        echo -e " ${GREEN}[PASS]${NC} skills-codex description catalog: ${codex_desc_total} chars (${pct}% of ${CODEX_DESC_TOTAL_FAIL_CHARS}, avg ${codex_desc_avg})"
+        pct=$((codex_desc_total * 100 / codex_desc_max_total))
+        echo -e " ${GREEN}[PASS]${NC} skills-codex description catalog: avg ${codex_desc_avg} chars/skill (${pct}% of ${CODEX_DESC_AVG_FAIL_CHARS} per-skill-avg; ${codex_desc_total} chars over ${codex_desc_count} skills)"
         ((passed++)) || true
     fi
 else