Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions tests/scripts/codex-desc-avg-budget.bats
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bats
# ag-vzbt: the codex-description catalog budget must be a per-skill AVERAGE (scales
# with skill count) instead of a hard aggregate that walls off the Nth+ skill.
# Fixture-driven via BUDGET_REPO_ROOT. We assert on the codex-catalog output line
# (robust to unrelated checks in the gate).

# Per-test fixture: builds a throwaway repo root and defines the `mk` helper.
# Globals written:
#   GATE - path to the token-budget gate script under test
#   FIX  - temp directory passed to the gate as BUDGET_REPO_ROOT
setup() {
  # Resolve the gate relative to this .bats file.
  GATE="$BATS_TEST_DIRNAME/../../tests/skills/test-token-budgets.sh"

  # Stop immediately if no temp dir could be created, so the mkdir below can
  # never run against an empty prefix.
  FIX="$(mktemp -d)" || return 1
  mkdir -p "$FIX/skills" "$FIX/skills-codex"

  # mk <name> <description>
  # Writes $FIX/skills-codex/<name>/SKILL.md with name/description front matter.
  # Deliberately NOT exported: the test bodies run in the same shell as setup
  # (they read the unexported $FIX and $GATE directly), and exporting would only
  # leak a helper that depends on the unexported $FIX into the gate subprocess.
  mk() {
    local name="$1" desc="$2"
    mkdir -p "$FIX/skills-codex/$name"
    printf -- '---\nname: %s\ndescription: %s\n---\n# %s\n' "$name" "$desc" "$name" \
      > "$FIX/skills-codex/$name/SKILL.md"
  }
}

# Per-test cleanup: discard the fixture tree that setup created under $FIX.
teardown() {
  rm -rf "$FIX"
}

@test "codex catalog PASSES when average description length is under budget" {
  # Two terse fixtures (34 and 37 chars) keep the per-skill average under the cap.
  mk a "short terse codex description here"
  mk b "another short terse codex description"

  run env BUDGET_REPO_ROOT="$FIX" bash "$GATE"
  printf '%s\n' "$output"

  # Isolate the codex-catalog line first (the assignment fails the test if the
  # gate never printed it), then require a PASS verdict on that line only.
  catalog_line="$(grep "skills-codex description catalog" <<<"$output")"
  grep -q "PASS" <<<"$catalog_line"
}

@test "codex catalog FAILS when average description length is over budget" {
  # Two identical ~120-char descriptions: the average lands far above the 45
  # per-skill-avg cap while each one stays below the 180 per-skill hard cap.
  verbose="this is a deliberately verbose codex description that pushes the per skill average well over the configured budget ceiling"
  for skill in a b; do
    mk "$skill" "$verbose"
  done

  run env BUDGET_REPO_ROOT="$FIX" bash "$GATE"
  printf '%s\n' "$output"

  # Only the codex-catalog line is inspected; other gate checks are ignored.
  catalog_line="$(grep "skills-codex description catalog" <<<"$output")"
  grep -q "FAIL" <<<"$catalog_line"
}

@test "budget scales: 100 short-desc skills pass even though total > old 2800 wall" {
  # 100 descriptions of 37-39 chars each = ~3800 total (> the old 2800 hard
  # aggregate) but avg ~38 < 45. Passes ONLY under the per-skill-average rule,
  # which proves the wall is gone.
  # Builtin arithmetic loop: no external `seq` process, no word-splitting of
  # command-substitution output.
  for ((i = 1; i <= 100; i++)); do
    mk "skill$i" "terse codex description number $i here"
  done

  run env BUDGET_REPO_ROOT="$FIX" bash "$GATE"
  echo "$output"
  echo "$output" | grep "skills-codex description catalog" | grep -q "PASS"
}
24 changes: 14 additions & 10 deletions tests/skills/test-token-budgets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# BUDGET_REPO_ROOT overrides for fixture tests
# (tests/scripts/codex-desc-avg-budget.bats); production derives from script location.
REPO_ROOT="${BUDGET_REPO_ROOT:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
SKILL_ROOTS=("$REPO_ROOT/skills" "$REPO_ROOT/skills-codex")

# Colors
Expand All @@ -27,11 +29,13 @@ SKILL_FAIL_LIMIT=10000
SKILL_WARN_LIMIT=8000
SESSION_FAIL_LIMIT=8000
DESC_FAIL_CHARS=180
# Always-loaded codex skill catalog. Sized to fit the catalog with modest
# headroom: ~81 skills x ~34 avg description chars. Raised from 2600 (calibrated
# at exactly 77 skills with zero slack) to 2700 when expert-council landed, then
# to 2800 when using-gc landed (the catalog grew to 81 skills, ag-p4p).
CODEX_DESC_TOTAL_FAIL_CHARS=2800
# Always-loaded codex skill catalog. The budget is a PER-SKILL AVERAGE, not a hard
# aggregate (ag-vzbt): the old hard total (raised 2600→2700→2800 as skills landed)
# walled off the Nth+ skill and forced /burndown into a 17-char stub. An average
# scales with the catalog — terse descriptions keep the avg low, so the gate fails
# only if descriptions are bloated on average. Current avg ~35; cap 45 = comfortable
# headroom while preserving the terse-description discipline.
CODEX_DESC_AVG_FAIL_CHARS=45

# Token estimation: bytes / 4
estimate_tokens() {
Expand Down Expand Up @@ -195,12 +199,12 @@ fi

if [[ "$codex_desc_count" -gt 0 ]]; then
codex_desc_avg=$((codex_desc_total / codex_desc_count))
if [[ "$codex_desc_total" -gt "$CODEX_DESC_TOTAL_FAIL_CHARS" ]]; then
echo -e " ${RED}[FAIL]${NC} skills-codex description catalog: ${codex_desc_total} chars > ${CODEX_DESC_TOTAL_FAIL_CHARS} aggregate limit"
if [[ "$codex_desc_avg" -gt "$CODEX_DESC_AVG_FAIL_CHARS" ]]; then
echo -e " ${RED}[FAIL]${NC} skills-codex description catalog: avg ${codex_desc_avg} chars/skill > ${CODEX_DESC_AVG_FAIL_CHARS} per-skill-avg limit (${codex_desc_total} chars over ${codex_desc_count} skills)"
((failed++)) || true
else
pct=$((codex_desc_total * 100 / CODEX_DESC_TOTAL_FAIL_CHARS))
echo -e " ${GREEN}[PASS]${NC} skills-codex description catalog: ${codex_desc_total} chars (${pct}% of ${CODEX_DESC_TOTAL_FAIL_CHARS}, avg ${codex_desc_avg})"
pct=$((codex_desc_avg * 100 / CODEX_DESC_AVG_FAIL_CHARS))
echo -e " ${GREEN}[PASS]${NC} skills-codex description catalog: avg ${codex_desc_avg} chars/skill (${pct}% of ${CODEX_DESC_AVG_FAIL_CHARS} per-skill-avg; ${codex_desc_total} chars over ${codex_desc_count} skills)"
((passed++)) || true
fi
else
Expand Down
Loading