From 0ec7fffb7bbffa38fdd8a63c4fdddd7634cc66fc Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 12:55:48 -0700 Subject: [PATCH 1/7] Add LLP 0022: remote config and join flow spec Specifies the client half of centrally-managed gateway configuration: join sequence, config pull loop, seed-config mode, staged-restart apply semantics, hash-pinned install-on-config, and last-known-good rollback with post-apply probation. Supporting amendments: - LLP 0003: config apply engine added to the core-owns list - LLP 0011: join added as a non-interactive entry point - LLP 0017: staged restart for config replacement + installer relaunch requirement - central proto.md: policy tokens, running-config If-None-Match convergence semantics, hash-pinned plugins, 404 demoted to legacy - LLP 0000: subsystem map updated - notes-archive: round-1 review record for LLP 0022 Co-Authored-By: Claude Fable 5 --- .../plugins-workspace/central/proto.md | 23 +- llp/0000-hypaware.explainer.md | 1 + llp/0003-core-vs-plugin-surface.spec.md | 11 + llp/0011-setup-and-onboarding.decision.md | 6 + llp/0017-daemon-runtime.decision.md | 31 +- llp/0022-remote-config-join-flow.spec.md | 297 ++++++++++++++++++ ...te-config-join-flow.spec.round-1.claude.md | 159 ++++++++++ 7 files changed, 521 insertions(+), 7 deletions(-) create mode 100644 llp/0022-remote-config-join-flow.spec.md create mode 100644 notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md diff --git a/hypaware-core/plugins-workspace/central/proto.md b/hypaware-core/plugins-workspace/central/proto.md index 1ab9395..8a31e16 100644 --- a/hypaware-core/plugins-workspace/central/proto.md +++ b/hypaware-core/plugins-workspace/central/proto.md @@ -30,8 +30,9 @@ atomic tmp+rename). ### POST `/v1/identity/bootstrap` Exchange an operator-issued bootstrap token for a long-lived JWT. -Bootstrap tokens are single-use; a successful bootstrap response also -invalidates the bootstrap token server-side. 
+Bootstrap tokens are **policy tokens** (server LLP 0008): multi-use, so +one token can be deployed fleet-wide via MDM, and every token references +a config at mint (see "Config pull" below). Request: @@ -78,6 +79,12 @@ Headers (request): - `Authorization: Bearer ` - `If-None-Match: ` (optional) +`If-None-Match` reflects the **running** config, never a +downloaded-but-not-yet-applied one. The server reads this header to +track fleet convergence, so a gateway mid-install/mid-apply keeps +presenting its old etag until the new config has taken effect +(LLP 0022). + Response 200: ```json @@ -89,15 +96,21 @@ Response 200: } ``` +The body is a full HypAware v2 config and replaces the gateway's +operative config wholesale. Plugin entries are pinned by **version + +artifact content hash**; the gateway verifies the artifact hash on +install and treats a mismatch as an apply failure (LLP 0022). + `ETag: ` accompanies every 200 response. Clients persist the etag in a sidecar (`/config-etag.json`) so a restart short-circuits to 304 instead of re-pulling and re-validating. Response 304: no body. The gateway keeps its current config. -Response 404: the operator has not registered a config for this -gateway. Gateways back off to 5 minutes and log once until the state -clears. +Response 404: legacy-only branch — every token now references a config +at mint (server LLP 0009), so gateways enrolled under that flow always +resolve. Kept for conformance against older servers: back off to +5 minutes and log once until the state clears. Response 401: see "Refresh window" above. diff --git a/llp/0000-hypaware.explainer.md b/llp/0000-hypaware.explainer.md index 88db9fd..a506b43 100644 --- a/llp/0000-hypaware.explainer.md +++ b/llp/0000-hypaware.explainer.md @@ -70,6 +70,7 @@ plugin that registers a dataset gets query and formatting for free. 
| AI gateway as a plugin | [0016](./0016-ai-gateway.decision.md) | Decision | | Daemon runtime & installers | [0017](./0017-daemon-runtime.decision.md) | Decision | | Observability & self-instrumentation | [0021](./0021-observability.spec.md) | Spec | +| Remote config & join flow | [0022](./0022-remote-config-join-flow.spec.md) | Spec | ## Where to start diff --git a/llp/0003-core-vs-plugin-surface.spec.md b/llp/0003-core-vs-plugin-surface.spec.md index fbbcc10..f6ca4ed 100644 --- a/llp/0003-core-vs-plugin-surface.spec.md +++ b/llp/0003-core-vs-plugin-surface.spec.md @@ -28,6 +28,11 @@ copy-pasted into every plugin, it belongs in core. - the Iceberg-backed cache/storage implementation and freshness checks - result formatting (table / json / jsonl / markdown) - managed state directories, lock files, permission prompts +- the **config apply engine** — staging a replacement config: validate, + install pinned plugins, persist last-known-good, swap, staged restart, + rollback bookkeeping. Exposed to plugins as a narrow context facade; the + document's *transport* (e.g. `@hypaware/central`'s pull loop) is plugin + domain. See [LLP 0022](./0022-remote-config-join-flow.spec.md#apply-engine-is-kernel-surface). ## Intrinsic, not plugin-provided @@ -50,6 +55,12 @@ They are therefore promoted to a neutral core home re-exported from `src/core/index.js`, not buried in the cache — the cache is one consumer, not the owner. +"Query is intrinsic" means the **SQL/dataset surface** specifically: the +dataset registry, SQL execution, cursors, freshness, and formatting. Other +query modalities (e.g. vector similarity search) are **plugin capabilities** +that build on the intrinsic surface, not kernel surface — decided 2026-06-12 +when scoping `@hypaware/vector-search`. 
+ ## Plugins own Domain behavior only, expressed through what they `require`, `provide`, and diff --git a/llp/0011-setup-and-onboarding.decision.md b/llp/0011-setup-and-onboarding.decision.md index bea7263..9782733 100644 --- a/llp/0011-setup-and-onboarding.decision.md +++ b/llp/0011-setup-and-onboarding.decision.md @@ -55,3 +55,9 @@ For scripted installs (CI, fleet provisioning), `hypaware init <preset>` accepts named presets contributed by plugins, and `hypaware init --from-file ./team.json` provisions a fleet of identical installs. Presets are named after what they are *for*, never after an architectural role. + +For centrally-managed gateways, `hypaware join <server-url> <token>` writes a seed +config (central plugin only) and performs the non-interactive daemon install; +the full config arrives from the server at join. It is sugar over "write the +config file + install the daemon", not a separate path. See +[LLP 0022](./0022-remote-config-join-flow.spec.md#seed-config-mode). diff --git a/llp/0017-daemon-runtime.decision.md b/llp/0017-daemon-runtime.decision.md index ebb431e..00e9dd6 100644 --- a/llp/0017-daemon-runtime.decision.md +++ b/llp/0017-daemon-runtime.decision.md @@ -5,7 +5,7 @@ **Systems:** Daemon **Author:** Phil / Claude **Date:** 2026-06-01 -**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014 +**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014, LLP 0022 > The primary daemon and how it is installed. Decomposed from the V1 finishing > plan (`finish-v1` Phases 3–4, now tombstoned) and `hypaware-design.md`. 
@@ -20,12 +20,39 @@ V1 introduces a primary daemon that boots the kernel and runs the steady state: - run the **sink export loop** — tick each configured sink on its cron schedule ([LLP 0014](./0014-sinks.spec.md)) - watch config and reload sources in place on change (same-shape reload, see - [LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) + [LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) — this + path covers **same-shape** changes only; config *replacement* takes the + [staged restart](#staged-restart-for-config-replacement) below - report health for `hypaware status` ([LLP 0009](./0009-cli-registry.spec.md#core-rendered-status)) The source registry and sink driver exist independently; the daemon is the long-lived host that drives them together. +## Staged restart for config replacement + +When the operative config is **replaced wholesale** — remote config apply +([LLP 0022](./0022-remote-config-join-flow.spec.md#apply-semantics-staged-restart)), +or any change to the plugin set or installed plugin code — the daemon does +**not** reload in place. It persists the new config and **exits; the service +manager relaunches it** onto the new config. + +Process restart is the only correct model here, not a simplification target: +install-on-config can upgrade a plugin that is already loaded, and Node's ESM +module cache cannot be invalidated — an in-process re-activate would run stale +code against the new config, defeating the artifact hash verification that +just passed. Restarting the process guarantees executed code = pinned artifact. + +Consequences: + +- The launchd / systemd user units **must be configured to relaunch on exit** + (`KeepAlive` / `Restart=always`). This is now a requirement of the + installers, not a nicety. +- A foreground (non-service) daemon cannot relaunch itself: it exits with a + distinct restart exit code and the invoker (smoke harness, dev shell) loops. 
+- Same-shape reload ([LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) + remains the path for in-place source config changes; there are exactly two + paths, distinguished by whether the plugin set / plugin code changed. + ## Install: global package, then service manager When daemon install is requested from `npx hypaware`, **install a persistent diff --git a/llp/0022-remote-config-join-flow.spec.md b/llp/0022-remote-config-join-flow.spec.md new file mode 100644 index 0000000..ec6e5bc --- /dev/null +++ b/llp/0022-remote-config-join-flow.spec.md @@ -0,0 +1,297 @@ +# LLP 0022: Remote Config and Join Flow + +**Type:** Spec +**Status:** Draft +**Systems:** Config, Sinks, Plugins +**Author:** Phil / Claude +**Date:** 2026-06-12 +**Related:** LLP 0007, LLP 0008, LLP 0010, LLP 0014, LLP 0017; hypaware-server LLP 0009 (out of tree, design authority) + +> Client-side spec for centrally-managed gateway configuration. Derived from +> the hypaware-server LLP 0009 handoff +> (`~/workspace/hypaware-server/llp/0009-remote-config.spec.md` is the design +> authority for the feature as a whole; this document owns the client half). + +## Summary + +A gateway can be configured entirely from the central server. MDM deploys a +**seed** — server URL + policy token, nothing else — and the gateway joins the +fleet, pulls its full config, installs any plugins that config names, and +becomes operational without the user ever touching a config file. Later edits +to the central config reconfigure the fleet on the poll cadence. This document +specifies the join sequence, the config pull loop, seed-config mode, apply +semantics, install-on-config, and last-known-good rollback. + +This is **post-V1 work**: `@hypaware/central` is explicitly out of V1 scope +([LLP 0002](./0002-v1-scope.decision.md#out-of-v1-scope)). + +## Motivation + +The client user never touches a config file or knows one exists. 
Everything +the gateway does (plugins, sinks, query) is authored centrally and delivered +at join. The existing `@hypaware/central` plugin already has identity +bootstrap/refresh and the ingest path; what is missing is the config pull loop +and the apply machinery around it. + +## The join sequence + +1. Seed boots the kernel with the central plugin only. +2. `POST /v1/identity/bootstrap` exchanges the policy token for a JWT. +3. `GET /v1/config` pulls the operator-authored config. +4. Apply (persist + staged restart) → fully operational. + +## Config pull loop + +The central plugin is configured as a **sink instance** +([LLP 0014](./0014-sinks.spec.md#config-two-shapes)); the pull cadence lives +in its sink config block as `poll_interval_seconds` (already validated by +`central/src/config.js`, 5–3600s), separate from the cron `schedule` that +drives ingest exports. The pull and identity-refresh timers are +plugin-internal: started at activation, stopped at `close()` — no change to +the LLP 0014 sink contract. + +`@hypaware/central`'s `src/sink.js` notes that refresh and config pull "live +on their own timers when wired in" — this spec wires the config pull: + +- Pull **immediately on bootstrap success**, then on a steady timer (minutes; + 304s are cheap — the server ETag is a content hash of the served revision). +- The `proto.md` ETag/304/404/429 semantics are unchanged. The etag sidecar + (`config-etag.json`) behavior stands. +- **`If-None-Match` must reflect the *running* config, never a + downloaded-but-not-yet-applied one.** The server reads this header to track + fleet convergence (it lands in the queryable `gateways` dataset), so a + gateway mid-install/mid-apply keeps presenting its old etag until the new + config has actually taken effect. + +## Seed-config mode + +The seed is an **ordinary v2 config file** — `~/.hyp/hypaware-config.json` +containing exactly the central plugin (server URL + policy token), nothing +else. 
There is no seed-specific file format and the kernel has no "seed" +state: seed-config mode is just this particular config booted, consistent +with [LLP 0010](./0010-config-model.spec.md#no-mode-field) (no mode flag; a +host is what its config says). + +Such a config must boot cleanly: no sources, no other sinks, collecting +nothing, polling for config. This is a legitimate steady state for the +seconds between enrollment and first 200 — not an error. + +The policy token lives in the seed config itself (the config file is mode +0600). Policy tokens are multi-use (server LLP 0008), so the token is not consumed +on bootstrap; the first successful apply replaces the seed config wholesale, +which naturally retires the token from disk. From then on `identity.json` +carries the JWT. + +`hypaware join <server-url> <token>` is convenience sugar for MDM install scripts: +it writes the seed config and performs the non-interactive daemon install, +and is specified as **exactly equivalent** to doing those two steps by hand — +a wrapper, not a second code path. It joins `init <preset>` and +`init --from-file` as a non-interactive entry point +([LLP 0011](./0011-setup-and-onboarding.decision.md#non-interactive-entry)). +Because a policy token is a multi-use, fleet-wide credential, `join` also +accepts `--token-file <path>` and stdin, and MDM scripts should prefer those +forms — a bare argv token lands in shell history and process listings. + +## Apply semantics: staged restart + +A pulled 200 body is a **full HypAware v2 config and replaces the operative +config wholesale** — no merging, no client-owned sections. Persist the +document, then restart. Never live-mutate. 
+ +Staged restart is a **process-level restart**: the daemon persists the new +config and exits; the service manager relaunches it +([LLP 0017](./0017-daemon-runtime.decision.md#staged-restart-for-config-replacement) +records the decision and why in-process re-activation is unsound — Node's ESM +module cache would run stale plugin code past the artifact hash check). The +in-place [same-shape reload](./0004-activation-and-paths.spec.md#same-shape-reload) +path is never used for remote apply. + +Recommended persistence idiom: **A/B slots** — write each config to its own +path and flip an atomic pointer (symlink or one-line file) as the last step +before exit. Same semantics as "file swap," but a crash between persist and +restart can never leave an ambiguous operative config, and last-known-good +is crash-safe by construction. + +### Apply engine is kernel surface + +The central plugin is **transport only**: pull, ETag bookkeeping, auth. It +hands a downloaded document to a narrow kernel facade (shape TBD at +implementation, e.g. `ctx.configControl.stage(document)`); the **kernel** +owns validate → install pinned plugins → persist last-known-good → swap → +restart, and the rollback bookkeeping. Recorded in +[LLP 0003](./0003-core-vs-plugin-surface.spec.md#core-owns). + +Why kernel-side: rollback state must survive the restart and pairs with the +kernel-owned config file; the apply engine is testable without HTTP (rollback +is exactly the code that must not be discovered broken in production); and a +future second management channel reuses it. Consequently +**last-known-good config and the remembered bad etag live in kernel-managed +state** ([LLP 0004](./0004-activation-and-paths.spec.md#state-directories)), +not the central plugin's state dir. 
+ +The `config-etag.json` sidecar must transition **atomically with the +operative config, in both directions**: it carries the etag of the *running* +config, so apply moves it forward and rollback reverts it (otherwise a +rolled-back gateway would present a converged etag while running +last-known-good). Since every sidecar change coincides with an apply or +rollback, the facade takes the etag alongside the document and the **apply +engine stages the sidecar with the swap**; the central plugin only reads it +(at boot, to populate `If-None-Match`). + +Identity state (`identity.json`, JWT, gateway id) is **not config** and is +never touched by config application. + +## Install-on-config (hash-pinned) + +A pulled config may name plugins not installed on the machine. The client +installs them through the **existing +[LLP 0007](./0007-plugin-install-and-locking.decision.md) install path** +(prebuilt git artifact, never `npm install` — +[LLP 0008](./0008-plugin-runtime-dependencies.decision.md) — recorded in the +plugin lock file). Served configs always pin **version + artifact content +hash** (the server's save pipeline guarantees this); the client must verify +the artifact hash and treat a mismatch as an apply failure (→ rollback, +below). The config names exactly one artifact; nothing may substitute code +after authoring. + +### Bundled first-party plugins + +First-party plugins ship bundled in the kernel package +([LLP 0002](./0002-v1-scope.decision.md#plugin-packaging-divergence)) and are +never fetched at apply time. For a pinned plugin that is bundled with the +running kernel: + +- The bundled copy satisfies the pin; the **artifact hash is not checked**. + Bundled code is inside the existing trust boundary — it ships in the same + npm package as the kernel performing the verification, and the server's + hash refers to a git release artifact that legitimately differs from the + npm-bundled tree. 
+- The pinned **version is checked strictly**: a mismatch between the pinned + version and the bundled version is an apply failure (→ rollback, below). + +Version-strictness means a fleet with mixed kernel versions (e.g. mid +rolling upgrade) can only converge on a config whose first-party pins match +every gateway's bundled versions — see open questions. + +## Last-known-good rollback + +If an applied config fails validation, a pinned install fails its hash check, +a bundled plugin fails its strict version check, +or the post-apply probation window (below) expires unsatisfied, revert to the +previous operative config (file swap + staged restart — cheap by +construction). Remember the failed revision's etag and **back off re-apply +attempts for that etag until the served etag changes** — re-polling is fine, an +apply-crash loop is not. One remembered bad-etag value, no persistent +denylist. The client records a **structured rollback reason** (validation +failure / hash mismatch / bundled-version mismatch / probation expiry, plus the offending etag) from day +one — the server only sees non-convergence via `If-None-Match` and cannot +distinguish "rolled back" from "never applied," so if a rollback column is +ever added to the `gateways` dataset, the data must already exist +client-side. For V1 it surfaces in client logs and in `hypaware status` +([LLP 0009](./0009-cli-registry.spec.md#core-rendered-status)): probation +state, last rollback + reason, and the remembered bad etag — an operator at +the machine must not need log spelunking to learn the gateway rejected a +config. + +Rollback restores the config, **not the install root**: plugin trees and +lock-file entries installed for the failed config stay on disk. The lock +file records what is installed, not what is active — the operative config +defines the active set — and keeping the artifacts makes re-apply after a +fixed revision cheaper. 
+ +### Post-apply probation + +Because apply is a process restart, the apply engine writes a **probation +marker to kernel-managed state before restarting** ("revision X applied at T, +probation until T+W"); the relaunched daemon reads it at boot. Probation is +cleared by the **first successful authenticated config poll** (200 or 304 on +`GET /v1/config`) after the restart — that one request proves identity +survived, the server is reachable, and the new config's central sink runs, +and its `If-None-Match` is simultaneously the server-side convergence signal, +so client probation and fleet convergence clear on the same packet. An ingest +POST is deliberately *not* the signal: an idle gateway with nothing to export +must still be able to clear probation. If the window expires unsatisfied, the +kernel rolls back: staged restart onto last-known-good, bad etag remembered. + +The **kernel owns the probation timer and the rollback decision, +independently of the central plugin functioning** — a wedged or +wrongly-pointed central sink is precisely a case probation must catch. The +plugin reports a successful poll through the apply facade (a confirmation +call); **it never touches probation state directly**. Probation expiry is +also evaluated **at boot, before plugin activation**: a +kernel-killing-but-valid config that crashloops under the service manager's +relaunch policy may never live long enough for a running timer to fire, so +each relaunch checks the marker first and rolls back from boot if the window +has passed. + +A probation-clearing poll may itself return 200 with a newer revision; that +triggers an immediate next apply, with its own probation. This chaining is +correct — do not serialize or suppress it. + +W must comfortably exceed one poll interval plus retry backoff (e.g. +`max(3 × poll_interval_seconds, floor)` rather than a fixed constant), so a +slow operator-chosen poll cadence cannot make every apply roll back. 
+ +Rollback from the **first** applied config lands back on the seed config — +fine by construction: seed-config mode is a legitimate polling steady state, +and the bad-etag backoff prevents a re-apply loop. + +## Wire contract amendments (`proto.md`) + +`hypaware-core/plugins-workspace/central/proto.md` is the authoritative wire +reference and is amended by this spec: + +- Served configs pin plugins by version + artifact content hash. +- `If-None-Match` reflects the running config (convergence semantics). +- 404 ("operator has not registered a config") is demoted to a legacy-only + branch: every token now references a config at mint, so gateways enrolled + under server LLP 0009 always resolve. Keep the polite backoff for + conformance. +- The "bootstrap tokens are single-use" sentence is replaced by the + policy-token amendment (server LLP 0008); both changes fold in together. + +## Server-side guarantees the client relies on + +- Every gateway enrolled through a policy token resolves to a config — + join-time 404 is structurally impossible for new enrollments. +- The served document passed the server's save pipeline: schema-valid, + plugins hash-pinned, and **always contains a central sink targeting the + server's own external URL** (so a config that would disconnect the fleet + can't be authored). The rollback backstop covers the residue + (wrong-but-present URL, kernel-killing-but-valid configs). +- ETag changes exactly when the served bytes change (revision content hash). + No push channel in V1: propagation latency = the poll cadence. + +## Sequencing + +Server lands first (registry, revisions, admin authoring endpoints, +mint-requires-config, serving, convergence columns) and ships dark. +`GET /v1/config` has existed since V1, so no capability handshake is needed. +Nothing server-side is blocked on the client; nothing client-side is blocked +on the server except end-to-end testing. 
+ +## Open questions + +- Exact poll cadence default (the spec says "minutes"; pick a number when + wiring the timer). +- Maximum accepted config document size. Wholesale-replace means an + authenticated 200 of arbitrary size goes straight into memory and onto + disk; a stated cap is one line of defense-in-depth. Pick a generous bound + when wiring the pull. +- Exact probation window formula (the *signal* and the + `max(3 × poll_interval_seconds, floor)` shape are decided; pick the floor + when wiring). +- **Strict version pins for bundled plugins vs rolling kernel upgrades.** + The strict check (above) means a kernel upgrade that bumps bundled plugin + versions de-converges the fleet until the central config's pins are + updated, and a mixed-version fleet cannot fully converge on one config. + Considered alternative: treat the pin as enforced only for fetched + artifacts and let config *validation* gate apply for bundled plugins, + reporting the bundled version upward. Deliberately deferred — strict now, + relax if upgrade thrash shows up in practice. 
+ +## References + +- hypaware-server LLP 0009 (`0009-remote-config.spec.md`) — design authority +- hypaware-server LLP 0008 — policy tokens +- [`proto.md`](../hypaware-core/plugins-workspace/central/proto.md) — wire reference +- [LLP 0007](./0007-plugin-install-and-locking.decision.md), [LLP 0008](./0008-plugin-runtime-dependencies.decision.md), [LLP 0010](./0010-config-model.spec.md), [LLP 0014](./0014-sinks.spec.md) diff --git a/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md b/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md new file mode 100644 index 0000000..298163a --- /dev/null +++ b/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md @@ -0,0 +1,159 @@ +# Review of LLP 0022: Remote Config and Join Flow + +**Reviewer:** Claude (Fable 5) +**Date:** 2026-06-12 +**Round:** 1 +**LLP Status at review time:** Draft + +## Overall assessment + +This is a good design and a notably complete one for a Draft: the join flow +is coherent end-to-end, the hard decisions (process-level restart, kernel +apply engine, bundled-plugin pin semantics, probation signal) are made +explicitly with rationale rather than left to the implementer, and the spec +is honest about what it defers. The strongest property is that several +mechanisms collapse into single primitives — convergence reporting, probation +clearing, and rollback visibility are all the same `GET /v1/config` request; +rollback is the same staged restart as apply. That economy is what makes the +"one remembered bad etag, no denylist" simplicity credible. + +The main weakness is at the seams the restart creates: the spec decides *who +owns* probation state (kernel) but not *who watches the clock* or how the +plugin's successful poll reaches the kernel, and it doesn't say what happens +when the applied config crashes the daemon faster than probation can be +evaluated. 
These are not flaws in the design — they are consequences of the +(correct) restart decision that the spec hasn't finished chasing down. + +## Strengths + +- **The probation signal choice is genuinely elegant.** Clearing on the first + authenticated poll makes client-side health and server-side convergence the + same observable event, and the explicit rejection of ingest-POST as the + signal (idle gateways must clear probation) shows the edge case was + actually considered. +- **The ESM-module-cache argument for process restart** is the right kind of + rationale: it converts a style preference ("restarts are cleaner") into a + correctness requirement (in-process re-activation would execute code other + than the hash-verified artifact). A future agent cannot "optimize" this + away without confronting a stated invariant. +- **Seed-as-ordinary-config** keeps faith with LLP 0010#no-mode-field — no + seed file format, no kernel seed state — and gets crash-safety for free: + rollback from the first apply lands on the seed, which the spec correctly + identifies as a legitimate steady state rather than a special case. +- **The bundled-plugin trust-boundary argument** (hash-checking code that + ships in the same npm package as the verifier buys nothing) is correct and + prevents the always-mismatching-hash failure mode that a naive uniform rule + would have shipped. +- **The kernel/plugin split for apply** is justified on the right grounds: + rollback state must survive the restart, and rollback is exactly the code + path that must be testable without HTTP. +- The open questions section records *why* the bundled-pin strictness was + deferred and what the considered alternative was — that's the difference + between a deferral and a hole. + +## Concerns + +1. **[Definite, trivial] The join sequence still says "kernel reload."** + Step 4 reads "Apply (persist + kernel reload)" — stale wording from before + the staged-restart decision; the apply section below contradicts it. 
Fix: + "Apply (persist + staged restart)". + +2. **[Definite] The probation watchdog's owner and evaluation point are + unspecified.** The marker is kernel state and "the kernel rolls back," but + the clearing event is observed by the *central plugin* (its poll), and the + window can expire while the plugin is wedged — wrong-but-present URL is + exactly the residue case the server guarantees don't cover. Two things + must be stated: (a) the kernel owns the probation timer and rollback + decision *independently of the central plugin functioning*; (b) probation + expiry is also evaluated **at boot, before plugin activation** — otherwise + a kernel-killing-but-valid config that crashloops under the service + manager's relaunch policy may never stay alive long enough for a running + timer to fire, and the gateway never rolls back. Boot-time evaluation + closes the crashloop case. Resolve by adding both sentences to + #post-apply-probation. + +3. **[Possible] The plugin→kernel "poll succeeded" signal path is + unspecified.** The facade shape is deliberately TBD, but this particular + signal is load-bearing for rollback correctness, and its absence invites + an implementer to have the plugin clear the marker file directly — + violating the state-ownership rule the spec itself establishes. One line + ("the facade includes a confirmation call; the plugin never touches + probation state") would pin it without designing the API. + +4. **[Possible] The etag sidecar's update timing now crosses the ownership + boundary.** The sidecar is plugin-owned wire bookkeeping, but apply is + kernel-owned, and the sidecar must come to reflect the new etag exactly + when the new config becomes the running one (during probation the new + config *is* running, so presenting the new etag mid-probation is correct — + and rollback must revert the sidecar too, or the gateway will present a + converged etag while running last-known-good). 
Who writes it, and when, in + both the apply and rollback directions? This is the one place where + "kernel owns apply" and "plugin owns the sidecar" genuinely collide; + resolve by specifying the handoff (simplest: the facade passes the etag + with the document, the kernel stages both, and the plugin re-reads the + sidecar at boot). + +5. **[Minor] Rollback leaves orphaned installs.** A failed config may have + hash-verified and installed plugins before validation or probation failed; + rollback restores the config but the spec says nothing about the installed + trees or lock-file entries. Probably correct to leave them (the lock file + records installs, not the active set, and re-apply after a fixed config + becomes cheaper) — but say so, or the lock file's meaning quietly shifts. + +6. **[Minor] `hypaware join ` puts a credential in argv** — + shell history and process listings on the very MDM-scripted machines this + targets. Policy tokens are multi-use and fleet-wide, which raises the + blast radius. Suggest the command also accept `--token-file` / stdin and + the spec recommend that form for MDM scripts. + +## Suggestions + +Prioritized: + +1. Fix concern 1 (one line) and add the two probation sentences from + concern 2 — these are the only changes I'd block on. +2. Pin the confirmation-signal ownership (concern 3) and the sidecar handoff + (concern 4) in a sentence each. +3. **Add an operator-visibility line:** probation state, last rollback, and + the remembered bad etag should surface in `hypaware status` (LLP 0009 + core-rendered status). "Rollback diagnosis stays in client logs for V1" is + fine for the server side, but the operator standing at the machine + shouldn't need log spelunking to learn the gateway rejected a config — + and this spec's own log-driven-development culture argues for it. +4. 
**Consider A/B config slots as the implementation idiom** for + "file swap": write configs to content-addressed or alternating paths and + flip an atomic pointer (symlink or one-line file). Same semantics the spec + already requires, but it makes "persist last-known-good" crash-safe by + construction — there is no moment where a crash between persist and + restart leaves an ambiguous operative config. Non-standard for config + files, standard for OTA updates, and this *is* an OTA update scheme. +5. The "chained apply" case is worth one sentence: a probation-clearing poll + may itself return 200 with a newer revision, triggering an immediate + second apply. This is correct behavior (each apply gets its own + probation), but stating it prevents an implementer from "helpfully" + serializing or suppressing it. + +## Open questions + +Beyond the three the spec already records (all appropriately deferred): + +- Where does probation rollback report *to*? The server sees non-convergence + via etag, but cannot distinguish "rolled back" from "never applied." If the + `gateways` dataset later wants a rollback column, the client needs to have + been recording the reason from day one — cheap now, annoying to retrofit. +- Does the lock file distinguish "installed and in the active config" from + "installed, orphaned by rollback"? (Falls out of concern 5.) +- Is there a maximum config document size the client will accept? A + wholesale-replace model means a malformed-but-authenticated 200 of + arbitrary size goes straight into memory and onto disk; a stated cap is + one line of defense-in-depth. + +## Recommended next step + +Stay `Draft` for one more pass: address concerns 1–2 (small, mechanical) and +decide on 3–4 (a sentence each). After that this is ready to move to +`Review` — the design itself is sound, the decisions are well-argued, and +nothing here is wrongheaded. 
Note that a single AI review is not sufficient +for acceptance; this round came from a reviewer who participated in the +grilling session that shaped the document, so an independent model's review +(and human judgment) should follow once the Draft revisions land. From 94c776b1b1590c52362829783dd36245e053853b Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 12:58:09 -0700 Subject: [PATCH 2/7] Renumber remote-config spec to LLP 0023 Origin master took 0022 for iceberg-export-partitioning (#91) while the join-flow spec was drafted on a local branch; renumber to the next free slot and update all cross-references. Co-Authored-By: Claude Fable 5 --- hypaware-core/plugins-workspace/central/proto.md | 4 ++-- llp/0000-hypaware.explainer.md | 2 +- llp/0003-core-vs-plugin-surface.spec.md | 2 +- llp/0011-setup-and-onboarding.decision.md | 2 +- llp/0017-daemon-runtime.decision.md | 4 ++-- ...join-flow.spec.md => 0023-remote-config-join-flow.spec.md} | 2 +- .../remote-config-join-flow.spec.round-1.claude.md | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) rename llp/{0022-remote-config-join-flow.spec.md => 0023-remote-config-join-flow.spec.md} (99%) diff --git a/hypaware-core/plugins-workspace/central/proto.md b/hypaware-core/plugins-workspace/central/proto.md index 8a31e16..e97eb20 100644 --- a/hypaware-core/plugins-workspace/central/proto.md +++ b/hypaware-core/plugins-workspace/central/proto.md @@ -83,7 +83,7 @@ Headers (request): downloaded-but-not-yet-applied one. The server reads this header to track fleet convergence, so a gateway mid-install/mid-apply keeps presenting its old etag until the new config has taken effect -(LLP 0022). +(LLP 0023). Response 200: @@ -99,7 +99,7 @@ Response 200: The body is a full HypAware v2 config and replaces the gateway's operative config wholesale. 
Plugin entries are pinned by **version + artifact content hash**; the gateway verifies the artifact hash on -install and treats a mismatch as an apply failure (LLP 0022). +install and treats a mismatch as an apply failure (LLP 0023). `ETag: ` accompanies every 200 response. Clients persist the etag in a sidecar (`/config-etag.json`) so a restart diff --git a/llp/0000-hypaware.explainer.md b/llp/0000-hypaware.explainer.md index a506b43..3981a91 100644 --- a/llp/0000-hypaware.explainer.md +++ b/llp/0000-hypaware.explainer.md @@ -70,7 +70,7 @@ plugin that registers a dataset gets query and formatting for free. | AI gateway as a plugin | [0016](./0016-ai-gateway.decision.md) | Decision | | Daemon runtime & installers | [0017](./0017-daemon-runtime.decision.md) | Decision | | Observability & self-instrumentation | [0021](./0021-observability.spec.md) | Spec | -| Remote config & join flow | [0022](./0022-remote-config-join-flow.spec.md) | Spec | +| Remote config & join flow | [0023](./0023-remote-config-join-flow.spec.md) | Spec | ## Where to start diff --git a/llp/0003-core-vs-plugin-surface.spec.md b/llp/0003-core-vs-plugin-surface.spec.md index f6ca4ed..e2aca71 100644 --- a/llp/0003-core-vs-plugin-surface.spec.md +++ b/llp/0003-core-vs-plugin-surface.spec.md @@ -32,7 +32,7 @@ copy-pasted into every plugin, it belongs in core. install pinned plugins, persist last-known-good, swap, staged restart, rollback bookkeeping. Exposed to plugins as a narrow context facade; the document's *transport* (e.g. `@hypaware/central`'s pull loop) is plugin - domain. See [LLP 0022](./0022-remote-config-join-flow.spec.md#apply-engine-is-kernel-surface). + domain. See [LLP 0023](./0023-remote-config-join-flow.spec.md#apply-engine-is-kernel-surface). 
## Intrinsic, not plugin-provided diff --git a/llp/0011-setup-and-onboarding.decision.md b/llp/0011-setup-and-onboarding.decision.md index 9782733..54466ce 100644 --- a/llp/0011-setup-and-onboarding.decision.md +++ b/llp/0011-setup-and-onboarding.decision.md @@ -60,4 +60,4 @@ For centrally-managed gateways, `hypaware join ` writes a seed config (central plugin only) and performs the non-interactive daemon install; the full config arrives from the server at join. It is sugar over "write the config file + install the daemon", not a separate path. See -[LLP 0022](./0022-remote-config-join-flow.spec.md#seed-config-mode). +[LLP 0023](./0023-remote-config-join-flow.spec.md#seed-config-mode). diff --git a/llp/0017-daemon-runtime.decision.md b/llp/0017-daemon-runtime.decision.md index 00e9dd6..ed6631d 100644 --- a/llp/0017-daemon-runtime.decision.md +++ b/llp/0017-daemon-runtime.decision.md @@ -5,7 +5,7 @@ **Systems:** Daemon **Author:** Phil / Claude **Date:** 2026-06-01 -**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014, LLP 0022 +**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014, LLP 0023 > The primary daemon and how it is installed. Decomposed from the V1 finishing > plan (`finish-v1` Phases 3–4, now tombstoned) and `hypaware-design.md`. @@ -31,7 +31,7 @@ long-lived host that drives them together. ## Staged restart for config replacement When the operative config is **replaced wholesale** — remote config apply -([LLP 0022](./0022-remote-config-join-flow.spec.md#apply-semantics-staged-restart)), +([LLP 0023](./0023-remote-config-join-flow.spec.md#apply-semantics-staged-restart)), or any change to the plugin set or installed plugin code — the daemon does **not** reload in place. It persists the new config and **exits; the service manager relaunches it** onto the new config. 
diff --git a/llp/0022-remote-config-join-flow.spec.md b/llp/0023-remote-config-join-flow.spec.md similarity index 99% rename from llp/0022-remote-config-join-flow.spec.md rename to llp/0023-remote-config-join-flow.spec.md index ec6e5bc..9109b71 100644 --- a/llp/0022-remote-config-join-flow.spec.md +++ b/llp/0023-remote-config-join-flow.spec.md @@ -1,4 +1,4 @@ -# LLP 0022: Remote Config and Join Flow +# LLP 0023: Remote Config and Join Flow **Type:** Spec **Status:** Draft diff --git a/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md b/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md index 298163a..37083af 100644 --- a/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md +++ b/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md @@ -1,4 +1,4 @@ -# Review of LLP 0022: Remote Config and Join Flow +# Review of LLP 0023: Remote Config and Join Flow **Reviewer:** Claude (Fable 5) **Date:** 2026-06-12 From 246f6c9be552a40a08ba615bbb7c91cf2702f0ec Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 13:35:26 -0700 Subject: [PATCH 3/7] Implement remote config and join flow (LLP 0023) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel side: - Config apply engine (src/core/config/apply.js): validate -> install pinned plugins -> persist to A/B slots -> atomic pointer flip -> staged restart, with last-known-good rollback, bad-etag re-apply backoff, and structured rollback reasons. The served etag lives in a per-slot sidecar written before the flip, so the document and its etag commit on the same rename in both directions. - Narrow ctx.configControl facade (stage / confirmPoll / runningEtag) exposed only in daemon mode; CLI boots leave it undefined so transport plugins keep their pull loops off. 
- Kernel-owned probation watchdog: window max(3 x poll_interval, 120s) sized from the staged document, cleared only by a confirmed poll, expiry also evaluated at boot before plugin activation (crashloop case), orphaned markers from a crash before the flip are discarded. - Staged restart: DAEMON_RESTART_EXIT_CODE (75) for foreground invokers; service managers already relaunch via KeepAlive / Restart=always (now pinned by tests). Sinks are closed on shutdown so plugin timers stop. - Hash-pinned install-on-config through the LLP 0007 path; pin verified against the staged artifact before the install commits. Bundled first-party plugins: version strict, hash skipped. - Config shape: plugin entries accept version / artifact_hash / source pins. - hypaware status: remote-config section (probation, last rollback + reason, remembered bad etag, running etag) in text and JSON. - hypaware join <url> [token] [--token-file|stdin] [--no-daemon]: writes the mode-0600 seed config and runs the daemon install — a wrapper over the two existing steps, not a second code path. Central plugin (transport only): - Config pull loop (central/src/config_client.js): immediate pull on bootstrap success, steady timer, If-None-Match always the running config's etag, 401 refresh-retry, 404 legacy backoff, 429/503 Retry-After + linear backoff, 1 MiB body cap. - Dropped the never-wired config_etag_path option (the etag is kernel-managed and read through the facade). Settled LLP 0023 open questions: poll default 300s, probation floor 120s, max document size 1 MiB. proto.md sidecar wording updated; the restart exit code recorded in LLP 0017. Tests: apply-engine state machine (18), pull loop (10), join command (5), installer relaunch defaults; join_flow_remote_config hermetic smoke drives seed -> bootstrap -> pull -> apply -> restart -> probation clear against a stub server with convergence assertions. Note: central_forward_outbox was already failing on origin/master (empty ingest rows) before this change.
Co-Authored-By: Claude Fable 5 --- collectivus-plugin-kernel-types.d.ts | 70 +++ .../plugins-workspace/central/index.js | 29 +- .../plugins-workspace/central/proto.md | 6 +- .../plugins-workspace/central/src/config.js | 4 - .../central/src/config_client.js | 284 +++++++++ .../plugins-workspace/central/src/sink.js | 5 +- .../plugins-workspace/central/src/types.d.ts | 15 +- .../smoke/flows/join_flow_remote_config.js | 405 +++++++++++++ llp/0017-daemon-runtime.decision.md | 4 +- llp/0023-remote-config-join-flow.spec.md | 79 ++- src/core/cli/core_commands.js | 226 +++++++ src/core/config/apply.js | 549 ++++++++++++++++++ src/core/config/apply_deps.js | 144 +++++ src/core/config/schema.js | 10 + src/core/config/types.d.ts | 117 ++++ src/core/daemon/runtime.js | 83 ++- src/core/daemon/status.js | 19 +- src/core/daemon/types.d.ts | 10 +- src/core/runtime/activation.d.ts | 8 + src/core/runtime/activation.js | 6 +- src/core/runtime/boot.js | 11 +- src/core/runtime/types.d.ts | 3 + test/core/config-apply.test.js | 421 ++++++++++++++ test/core/daemon.test.js | 26 + test/core/join-command.test.js | 131 +++++ test/plugins/central-config-pull.test.js | 233 ++++++++ 26 files changed, 2840 insertions(+), 58 deletions(-) create mode 100644 hypaware-core/plugins-workspace/central/src/config_client.js create mode 100644 hypaware-core/smoke/flows/join_flow_remote_config.js create mode 100644 src/core/config/apply.js create mode 100644 src/core/config/apply_deps.js create mode 100644 test/core/config-apply.test.js create mode 100644 test/core/join-command.test.js create mode 100644 test/plugins/central-config-pull.test.js diff --git a/collectivus-plugin-kernel-types.d.ts b/collectivus-plugin-kernel-types.d.ts index fbba741..02a2c77 100644 --- a/collectivus-plugin-kernel-types.d.ts +++ b/collectivus-plugin-kernel-types.d.ts @@ -323,10 +323,60 @@ export interface PluginActivationContext { * before appending to the cache. 
*/ backfillMaterializers: BackfillMaterializerRegistry + /** + * Narrow facade over the kernel config apply engine (LLP 0023). Only + * present when the host process runs an apply engine (the daemon); + * absent in plain CLI boots, so transport plugins must treat it as + * optional and skip their pull loops when it is missing. The facade + * is the only channel a plugin has into config application — the + * kernel owns validation, install, persistence, restart, probation, + * and rollback. + */ + configControl?: ConfigControlFacade requireCapability(name: CapabilityName, range?: SemverRange): T provideCapability(name: CapabilityName, version: SemverVersion, value: T): void } +/** + * Plugin-facing surface of the kernel config apply engine. Handed to + * transport plugins (e.g. `@hypaware/central`) so they can deliver a + * downloaded config document and report poll liveness. Deliberately + * narrow: plugins never see probation state, slot paths, or rollback + * bookkeeping. + */ +export interface ConfigControlFacade { + /** + * Deliver a downloaded config document (parsed JSON) plus the ETag it + * was served under. The kernel validates, installs pinned plugins, + * persists, swaps, and requests a staged restart. Resolves before the + * restart happens; callers should treat `{ ok: true }` as "apply + * committed, restart pending". + */ + stage(document: unknown, etag: string): Promise + /** + * Report a successful authenticated config poll (200 or 304). Clears + * the post-apply probation window when one is active; a no-op + * otherwise. + */ + confirmPoll(): void + /** ETag of the *running* config, for `If-None-Match`. Undefined when the operative config was never applied from the server (e.g. seed). 
*/ + runningEtag(): string | undefined +} + +export type ConfigStageResult = + | { ok: true, action: 'applied' | 'noop_same_etag' | 'skipped_bad_etag' } + | { ok: false, errorKind: ConfigApplyErrorKind, message: string } + +export type ConfigApplyErrorKind = + | 'config_invalid' + | 'plugin_install_failed' + | 'artifact_hash_mismatch' + | 'bundled_version_mismatch' + | 'document_too_large' + | 'apply_engine_not_ready' + | 'restart_pending' + | 'apply_io_error' + export interface PluginDeactivationContext { plugin: ActivePlugin log: PluginLogger @@ -405,6 +455,26 @@ export interface PluginConfigInstance { name: PluginName enabled?: boolean config?: JsonObject + /** + * Pinned plugin version. Set by centrally-served configs (LLP 0023): + * the apply engine refuses a config whose pins it cannot satisfy. + * For bundled first-party plugins the pin is checked strictly against + * the bundled version; for fetched plugins it selects the artifact. + */ + version?: SemverVersion + /** + * Pinned artifact content hash for fetched plugins. The apply engine + * verifies the fetched artifact against this before committing the + * install; a mismatch is an apply failure. Ignored (not checked) for + * plugins bundled with the running kernel. + */ + artifact_hash?: string + /** + * Optional explicit install source (raw source string accepted by the + * plugin installer). Defaults to the plugin name, which the resolver + * maps to its canonical git source. 
+ */ + source?: string } /** diff --git a/hypaware-core/plugins-workspace/central/index.js b/hypaware-core/plugins-workspace/central/index.js index 1200a7d..77e3e9b 100644 --- a/hypaware-core/plugins-workspace/central/index.js +++ b/hypaware-core/plugins-workspace/central/index.js @@ -3,6 +3,7 @@ import path from 'node:path' import { validateCentralConfig } from './src/config.js' +import { createConfigPullLoop } from './src/config_client.js' import { IdentityClient } from './src/identity_client.js' import { createForwardSink } from './src/sink.js' @@ -26,6 +27,10 @@ import { createForwardSink } from './src/sink.js' export async function activate(ctx) { const query = ctx.query const storage = ctx.storage + // Present only in daemon mode. Without an apply engine there is no + // one to hand a pulled document to, so the pull loop stays off (CLI + // boots must not fire config polls as a side effect of `hyp status`). + const configControl = ctx.configControl ctx.sinks.register({ name: 'forward', @@ -55,13 +60,35 @@ export async function activate(ctx) { hyp_identity_source: source, }) - return createForwardSink({ + const sink = createForwardSink({ config, identityClient, query, storage, log: sinkCtx.log, }) + + if (!configControl) return sink + + // @ref LLP 0023#config-pull-loop [implements] — pull immediately on bootstrap success, then on the steady timer + const pullLoop = createConfigPullLoop({ + centralUrl: config.url, + identityClient, + configControl, + ...(config.poll_interval_seconds !== undefined + ? 
{ pollIntervalSeconds: config.poll_interval_seconds } + : {}), + log: sinkCtx.log, + }) + pullLoop.start() + + return { + ...sink, + async close() { + await pullLoop.stop() + await sink.close() + }, + } }, }) } diff --git a/hypaware-core/plugins-workspace/central/proto.md b/hypaware-core/plugins-workspace/central/proto.md index e97eb20..1403b78 100644 --- a/hypaware-core/plugins-workspace/central/proto.md +++ b/hypaware-core/plugins-workspace/central/proto.md @@ -102,8 +102,10 @@ artifact content hash**; the gateway verifies the artifact hash on install and treats a mismatch as an apply failure (LLP 0023). `ETag: ` accompanies every 200 response. Clients persist the etag -in a sidecar (`/config-etag.json`) so a restart -short-circuits to 304 instead of re-pulling and re-validating. +of the *running* config in kernel-managed state (it transitions +atomically with the operative config on apply and rollback — LLP 0023) +so a restart short-circuits to 304 instead of re-pulling and +re-validating. Response 304: no body. The gateway keeps its current config. 
diff --git a/hypaware-core/plugins-workspace/central/src/config.js b/hypaware-core/plugins-workspace/central/src/config.js index e39043e..a9ab971 100644 --- a/hypaware-core/plugins-workspace/central/src/config.js +++ b/hypaware-core/plugins-workspace/central/src/config.js @@ -58,10 +58,6 @@ export function validateCentralConfig(value) { } } - if (cfg.config_etag_path !== undefined && typeof cfg.config_etag_path !== 'string') { - return invalid('central.config_etag_path must be a string when set') - } - return { ok: true, config: /** @type {CentralSinkConfig} */ (/** @type {unknown} */ (cfg)) } } diff --git a/hypaware-core/plugins-workspace/central/src/config_client.js b/hypaware-core/plugins-workspace/central/src/config_client.js new file mode 100644 index 0000000..da5d5a4 --- /dev/null +++ b/hypaware-core/plugins-workspace/central/src/config_client.js @@ -0,0 +1,284 @@ +// @ts-check + +/** + * @import { ConfigControlFacade, PluginLogger } from '../../../../collectivus-plugin-kernel-types.d.ts' + * @import { IdentityClient } from './identity_client.js' + */ + +/** + * Default pull cadence when `poll_interval_seconds` is not configured. + * Mirrors the kernel apply engine's `DEFAULT_POLL_INTERVAL_SECONDS` + * (it sizes the probation window from the same number). + */ +export const DEFAULT_POLL_INTERVAL_SECONDS = 300 + +/** + * Transport-level cap on a pulled config body. Mirrors the kernel's + * `MAX_CONFIG_DOCUMENT_BYTES` — the apply engine enforces it again, + * but an oversized body should be dropped before it is buffered whole. + */ +export const MAX_CONFIG_DOCUMENT_BYTES = 1024 * 1024 + +/** Linear backoff ladder (seconds) for 429/503/transport failures, per proto.md. */ +const RETRY_BACKOFF_SECONDS = [30, 60, 120, 300] + +/** Polite backoff (seconds) for the legacy 404 branch, per proto.md. 
*/ +const LEGACY_404_BACKOFF_SECONDS = 300 + +/** + * The config pull loop: poll `GET /v1/config` with `If-None-Match` set + * to the *running* config's etag, confirm successful polls to the + * kernel (clearing post-apply probation), and hand 200 bodies to the + * apply facade. Transport only — validation, persistence, restart, + * probation, and rollback are all kernel-owned behind `configControl`. + * + * The loop is a self-rescheduling timeout rather than an interval so + * backoff (404 / 429 / 503 / transport errors) can stretch a single + * gap without skewing the steady cadence. Identity refresh needs no + * timer of its own: every poll goes through `getCurrentJwt()`, which + * eagerly refreshes inside the 24h window, and the poll cadence is + * capped at one hour. + * + * Timers are deliberately *not* unref'd: in seed-config mode (central + * sink only, no sources) this loop is the daemon's only live handle, + * and that polling idle state is a legitimate steady state, not an + * exit condition. + * + * @param {{ + * centralUrl: string, + * identityClient: IdentityClient, + * configControl: ConfigControlFacade, + * pollIntervalSeconds?: number, + * log: PluginLogger, + * fetchFn?: typeof fetch, + * }} args + * @ref LLP 0023#config-pull-loop [implements] — immediate pull on bootstrap success, then a steady plugin-internal timer + */ +export function createConfigPullLoop(args) { + const { centralUrl, identityClient, configControl, log } = args + const fetchFn = args.fetchFn ?? fetch + const pollIntervalSeconds = args.pollIntervalSeconds ?? 
DEFAULT_POLL_INTERVAL_SECONDS + + /** @type {NodeJS.Timeout | null} */ + let timer = null + let stopped = false + let consecutiveFailures = 0 + /** @type {Promise | null} */ + let inFlight = null + + /** @param {number} delaySeconds */ + function schedule(delaySeconds) { + if (stopped) return + timer = setTimeout(() => { + timer = null + inFlight = pollOnce().finally(() => { inFlight = null }) + }, delaySeconds * 1000) + } + + /** @returns {Promise} */ + async function pollOnce() { + let nextDelay = pollIntervalSeconds + try { + const outcome = await pull() + if (outcome === 'retry_backoff') { + nextDelay = RETRY_BACKOFF_SECONDS[ + Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 + ] ?? RETRY_BACKOFF_SECONDS[RETRY_BACKOFF_SECONDS.length - 1] + } else if (outcome === 'legacy_404') { + nextDelay = Math.max(LEGACY_404_BACKOFF_SECONDS, pollIntervalSeconds) + } else if (typeof outcome === 'number') { + nextDelay = outcome + } + } catch (err) { + consecutiveFailures += 1 + const message = err instanceof Error ? err.message : String(err) + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_error', + consecutive_failures: consecutiveFailures, + message, + }) + nextDelay = RETRY_BACKOFF_SECONDS[ + Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 + ] + } + schedule(nextDelay) + } + + /** + * One poll. Returns `'ok'`, `'retry_backoff'`, `'legacy_404'`, or an + * explicit next-delay in seconds (server-provided `Retry-After`). + * + * @returns {Promise<'ok' | 'retry_backoff' | 'legacy_404' | number>} + */ + async function pull() { + const url = joinUrl(centralUrl, '/v1/config') + const runningEtag = configControl.runningEtag() + + let response = await doFetch(url, runningEtag) + if (response.status === 401) { + // One-shot refresh + retry; a second 401 escalates as an auth + // failure (proto.md "Refresh window"). 
+ await identityClient.refresh() + response = await doFetch(url, runningEtag) + if (response.status === 401) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_poll_auth_failed', + http_status: 401, + }) + return 'retry_backoff' + } + } + + if (response.status === 304) { + consecutiveFailures = 0 + configControl.confirmPoll() + log.info('central.config.poll', { + hyp_operation: 'config.pull', + http_status: 304, + status: 'ok', + }) + return 'ok' + } + + if (response.status === 200) { + const etag = response.headers.get('etag') + const body = await response.text() + if (Buffer.byteLength(body, 'utf8') > MAX_CONFIG_DOCUMENT_BYTES) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_document_too_large', + http_status: 200, + body_bytes: Buffer.byteLength(body, 'utf8'), + }) + return 'retry_backoff' + } + if (!etag) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_missing_etag', + http_status: 200, + }) + return 'retry_backoff' + } + /** @type {unknown} */ + let document + try { + document = JSON.parse(body) + } catch (err) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_invalid_json', + http_status: 200, + message: err instanceof Error ? err.message : String(err), + }) + return 'retry_backoff' + } + consecutiveFailures = 0 + // The 200 itself is a successful authenticated poll: it clears + // any active probation before the new revision stages its own. + // A probation-clearing poll returning a newer revision chains + // into the next apply by design. + configControl.confirmPoll() + const staged = await configControl.stage(document, etag) + log.info('central.config.poll', { + hyp_operation: 'config.pull', + http_status: 200, + config_etag: etag, + apply_action: staged.ok ? staged.action : 'failed', + ...(staged.ok ? {} : { error_kind: staged.errorKind }), + status: staged.ok ? 
'ok' : 'failed', + }) + return 'ok' + } + + if (response.status === 404) { + // Legacy-only branch: servers that mint tokens without a config. + if (consecutiveFailures === 0) { + log.warn('central.config.poll', { + hyp_operation: 'config.pull', + http_status: 404, + status: 'skipped', + hyp_reason: 'no_config_registered_legacy', + }) + } + consecutiveFailures += 1 + return 'legacy_404' + } + + if (response.status === 429 || response.status === 503) { + consecutiveFailures += 1 + const retryAfter = parseRetryAfter(response.headers.get('retry-after')) + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_throttled', + http_status: response.status, + ...(retryAfter !== undefined ? { retry_after_seconds: retryAfter } : {}), + }) + return retryAfter !== undefined ? retryAfter : 'retry_backoff' + } + + consecutiveFailures += 1 + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_http_error', + http_status: response.status, + }) + return 'retry_backoff' + } + + /** + * @param {string} url + * @param {string | undefined} runningEtag + */ + async function doFetch(url, runningEtag) { + const jwt = await identityClient.getCurrentJwt() + return fetchFn(url, { + method: 'GET', + headers: { + authorization: `Bearer ${jwt}`, + // If-None-Match always reflects the *running* config — the + // server reads it as the fleet-convergence signal, so a + // gateway mid-apply keeps presenting its old etag. + ...(runningEtag ? { 'if-none-match': runningEtag } : {}), + }, + }) + } + + return { + /** Pull immediately, then settle into the steady cadence. */ + start() { + if (stopped || timer || inFlight) return + inFlight = pollOnce().finally(() => { inFlight = null }) + }, + /** Stop polling; resolves after any in-flight poll settles. 
*/ + async stop() { + stopped = true + if (timer) { + clearTimeout(timer) + timer = null + } + if (inFlight) await inFlight + }, + } +} + +/** + * @param {string | null} value + * @returns {number | undefined} + */ +function parseRetryAfter(value) { + if (!value) return undefined + const seconds = Number.parseInt(value, 10) + if (Number.isInteger(seconds) && seconds >= 0) return seconds + const date = Date.parse(value) + if (!Number.isNaN(date)) return Math.max(0, Math.round((date - Date.now()) / 1000)) + return undefined +} + +/** + * @param {string} base + * @param {string} suffix + */ +function joinUrl(base, suffix) { + const baseWithSlash = base.endsWith('/') ? base : `${base}/` + return new URL(suffix.replace(/^\//, ''), baseWithSlash).toString() +} diff --git a/hypaware-core/plugins-workspace/central/src/sink.js b/hypaware-core/plugins-workspace/central/src/sink.js index 1d0b560..451acad 100644 --- a/hypaware-core/plugins-workspace/central/src/sink.js +++ b/hypaware-core/plugins-workspace/central/src/sink.js @@ -99,8 +99,9 @@ export function createForwardSink(args) { }, async close() { - // No background loops to stop in the V1 forward sink; identity - // refresh and config pull live on their own timers when wired in. + // No background loops to stop here: the config pull loop wraps + // this sink's close() in index.js, and identity refresh is lazy + // (every authenticated call refreshes inside the 24h window). }, } } diff --git a/hypaware-core/plugins-workspace/central/src/types.d.ts b/hypaware-core/plugins-workspace/central/src/types.d.ts index 482e507..0131873 100644 --- a/hypaware-core/plugins-workspace/central/src/types.d.ts +++ b/hypaware-core/plugins-workspace/central/src/types.d.ts @@ -47,17 +47,12 @@ export interface CentralSinkConfig { persisted_path?: string } /** - * Override the etag sidecar path used by the config-pull loop. Defaults - * to `/config-etag.json`. The loop itself is opt-in. + * Poll cadence (seconds) for the config-pull loop. 
Default 300s + * (5 minutes) — 304s are cheap, and propagation latency equals this + * cadence (no push channel in V1). The running config's etag is + * kernel-managed (LLP 0023); the plugin reads it through the + * `configControl` facade, so there is no plugin-side sidecar path. */ - config_etag_path?: string - /** Poll cadence (seconds) for the config-pull loop. Default 30s. */ poll_interval_seconds?: number } -/** Payload of the `config-changed` event emitted by `ConfigClient`. */ -export interface ConfigChangedEvent { - newConfig: unknown - etag: string - fetchedAt: string -} diff --git a/hypaware-core/smoke/flows/join_flow_remote_config.js b/hypaware-core/smoke/flows/join_flow_remote_config.js new file mode 100644 index 0000000..f282b50 --- /dev/null +++ b/hypaware-core/smoke/flows/join_flow_remote_config.js @@ -0,0 +1,405 @@ +// @ts-check + +import fs from 'node:fs/promises' +import http from 'node:http' +import path from 'node:path' +import process from 'node:process' + +import { installObservability } from '../../../src/core/observability/index.js' +import { defaultConfigPath } from '../../../src/core/config/schema.js' +import { readConfigControlStatus } from '../../../src/core/config/apply.js' +import { DAEMON_RESTART_EXIT_CODE, runDaemon } from '../../../src/core/daemon/runtime.js' +import { dispatch } from '../../../src/core/cli/dispatch.js' + +/** + * Join-flow smoke (LLP 0023): drives the full remote-config lifecycle + * against a stub central server — + * + * join (seed write) → seed boot → identity bootstrap → config pull + * (200) → kernel apply → staged restart → relaunch on the served + * config → probation cleared by the first successful poll (304). + * + * The daemon runs in-process and the smoke plays the role of the + * foreground invoker: it relaunches `runDaemon` when `handle.done` + * resolves with the restart exit code, exactly as a dev shell or the + * service manager would. 
+ * + * Asserted signals (Log-Driven Development): + * - external: operative config replaced wholesale (token retired), + * seed preserved as the rollback slot, otlp source running on the + * served config, `If-None-Match` convergence transitions on the + * stub server. + * - internal: `config.apply` span (status=ok), `config.applied` and + * `config.probation_cleared` log rows, `join.run` span. + * + * @param {{ harness: any, expect: any }} args + * @ref LLP 0023#the-join-sequence [tests] — seed → bootstrap → pull → apply → restart → operational, end to end against a stub server + */ +export async function run({ harness, expect }) { + const obs = installObservability() + if (!obs.tracer.provider) { + throw new Error( + 'join_flow_remote_config: tracer provider not installed — expected HYP_DEV_TELEMETRY=1' + ) + } + + process.env.HYP_HOME = harness.hypHome + delete process.env.HYP_CONFIG + const configPath = defaultConfigPath(harness.hypHome) + const stateRoot = path.join(harness.hypHome, 'hypaware') + + // ----- smoke_step: stub_server_up ----- + const server = await startStubCentralServer() + try { + // The served revision: a full v2 config. The otel pin exercises + // the bundled-plugin strict version check on the apply path. 
+ const otelManifest = JSON.parse(await fs.readFile( + path.join( + path.dirname(new URL(import.meta.url).pathname), + '..', '..', 'plugins-workspace', 'otel', 'hypaware.plugin.json' + ), + 'utf8' + )) + server.setConfig({ + version: 2, + plugins: [ + { name: '@hypaware/central' }, + { name: '@hypaware/otel', version: otelManifest.version, config: { listen_host: '127.0.0.1', listen_port: 0 } }, + ], + sinks: { + central: { + plugin: '@hypaware/central', + config: { + url: server.baseUrl, + identity: {}, + schedule: '0 * * * *', + poll_interval_seconds: 5, + }, + }, + }, + query: { cache: { retention: { default_days: 30 } } }, + }, 'rev-1') + + // ----- smoke_step: join (write seed + skip daemon install) ----- + const joinOut = makeBuf() + const joinErr = makeBuf() + const joinCode = await dispatch( + ['join', server.baseUrl, 'policy-token-smoke', '--no-daemon'], + { + stdout: joinOut, + stderr: joinErr, + env: { ...process.env, HYP_HOME: harness.hypHome }, + } + ) + expect.that( + `join: exits 0 (stderr: ${joinErr.text()})`, + joinCode, + (v) => v === 0 + ) + const seed = JSON.parse(await fs.readFile(configPath, 'utf8')) + expect.that( + 'join: seed config carries the policy token', + seed.sinks?.central?.config?.identity?.bootstrap_token, + (v) => v === 'policy-token-smoke' + ) + + // ----- smoke_step: seed_boot (bootstrap → pull → apply → restart) ----- + const first = await runDaemon({ + hypHome: harness.hypHome, + env: process.env, + runId: harness.devRunId, + tickIntervalMs: 0, + installSignalHandlers: false, + }) + const firstExit = await withTimeout( + first.done, + 30_000, + 'seed boot did not request a staged restart within 30s' + ) + expect.that( + `seed boot: daemon exited with the restart code (got ${firstExit})`, + firstExit, + (v) => v === DAEMON_RESTART_EXIT_CODE + ) + + // The apply replaced the operative config wholesale and preserved + // the seed as the rollback slot. 
+ const operative = JSON.parse(await fs.readFile(configPath, 'utf8')) + expect.that( + 'apply: operative config no longer carries the policy token', + operative.sinks?.central?.config?.identity?.bootstrap_token, + (v) => v === undefined + ) + expect.that( + 'apply: operative config names the otel plugin from the served revision', + operative.plugins?.some((/** @type {any} */ p) => p.name === '@hypaware/otel'), + (v) => v === true + ) + const slotA = JSON.parse( + await fs.readFile(path.join(stateRoot, 'config-control', 'config.a.json'), 'utf8') + ) + expect.that( + 'apply: the seed survives in the rollback slot', + slotA.sinks?.central?.config?.identity?.bootstrap_token, + (v) => v === 'policy-token-smoke' + ) + const midStatus = readConfigControlStatus({ stateRoot, configPath }) + expect.that( + 'apply: probation marker armed for the served revision', + midStatus.probation?.etag, + (v) => v === 'rev-1' + ) + + // ----- smoke_step: relaunch (service-manager role) ----- + const second = await runDaemon({ + hypHome: harness.hypHome, + env: process.env, + runId: harness.devRunId, + tickIntervalMs: 0, + installSignalHandlers: false, + }) + try { + // Probation clears on the first successful poll (304 here). 
+ await waitFor( + () => readConfigControlStatus({ stateRoot, configPath }).probation === null, + 10_000, + 'probation did not clear within 10s of relaunch' + ) + const cleared = readConfigControlStatus({ stateRoot, configPath }) + expect.that( + 'probation: cleared with the served revision running', + cleared.runningEtag, + (v) => v === 'rev-1' + ) + expect.that( + 'probation: no rollback was recorded', + cleared.lastRollback, + (v) => v === null + ) + + const snapshot = second.snapshot() + expect.that( + `relaunch: daemon state is healthy (got ${snapshot.state})`, + snapshot.state, + (v) => v === 'healthy' + ) + expect.that( + 'relaunch: otlp source from the served config is started', + snapshot.sources.find((/** @type {any} */ s) => s.name === 'otlp')?.state, + (v) => v === 'started' + ) + + // Convergence semantics on the wire: the first GET presented no + // etag (seed has none), the post-apply GET presented rev-1 and + // was answered 304. + const configGets = server.requests.filter((r) => r.path === '/v1/config') + expect.that( + `stub server: at least two config pulls observed (got ${configGets.length})`, + configGets.length, + (v) => typeof v === 'number' && v >= 2 + ) + expect.that( + 'stub server: the seed-boot pull presented no If-None-Match', + configGets[0]?.ifNoneMatch, + (v) => v === undefined + ) + expect.that( + 'stub server: a post-apply pull presented the running etag and converged', + configGets.some((r) => r.ifNoneMatch === 'rev-1' && r.responseStatus === 304), + (v) => v === true + ) + expect.that( + 'stub server: exactly one bootstrap happened (policy token not re-spent)', + server.requests.filter((r) => r.path === '/v1/identity/bootstrap').length, + (v) => v === 1 + ) + } finally { + await second.stop() + await second.done + } + } finally { + await server.close() + } + + await obs.shutdown() + + // ----- smoke_step: telemetry ----- + const traces = await expect.traces() + const applySpans = traces.filter((/** @type {any} */ t) => t.name === 
'config.apply') + expect.that( + 'traces: a config.apply span was emitted with status=ok and apply_action=applied', + applySpans.some((/** @type {any} */ s) => + s.attributes?.status === 'ok' && s.attributes?.apply_action === 'applied' + ), + (v) => v === true + ) + expect.that( + 'traces: a join.run span was emitted', + traces.some((/** @type {any} */ t) => t.name === 'join.run'), + (v) => v === true + ) + + const logs = await expect.logs() + expect.that( + 'logs: config.applied recorded for rev-1', + logs.some((/** @type {any} */ l) => + l.body === 'config.applied' && l.attributes?.config_etag === 'rev-1' + ), + (v) => v === true + ) + expect.that( + 'logs: config.probation_cleared recorded for rev-1', + logs.some((/** @type {any} */ l) => + l.body === 'config.probation_cleared' && l.attributes?.config_etag === 'rev-1' + ), + (v) => v === true + ) + expect.that( + 'logs: central.config.poll observed both a 200 and a 304', + [200, 304].every((status) => + logs.some((/** @type {any} */ l) => + l.body === 'central.config.poll' && l.attributes?.http_status === status + ) + ), + (v) => v === true + ) +} + +/* ---------- stub central server ---------- */ + +/** + * Minimal `@hypaware/server` stand-in: identity bootstrap/refresh, + * etag-aware config serving, and an ingest acceptor. Every request is + * recorded for convergence assertions. + */ +async function startStubCentralServer() { + /** @type {Array<{ method: string, path: string, ifNoneMatch?: string, responseStatus: number }>} */ + const requests = [] + /** @type {unknown} */ + let configDoc = null + /** @type {string} */ + let configEtag = '' + + const jwt = buildFakeJwt('gateway-smoke-1') + const expiresAt = Math.floor(Date.now() / 1000) + 30 * 24 * 60 * 60 + + const server = http.createServer((req, res) => { + const url = new URL(req.url ?? 
'/', 'http://localhost') + /** @param {number} status @param {Record} headers @param {string} [body] */ + function reply(status, headers, body) { + requests.push({ + method: req.method ?? '', + path: url.pathname, + ...(req.headers['if-none-match'] + ? { ifNoneMatch: String(req.headers['if-none-match']) } + : {}), + responseStatus: status, + }) + res.writeHead(status, headers) + res.end(body ?? '') + } + + if (req.method === 'POST' && (url.pathname === '/v1/identity/bootstrap' || url.pathname === '/v1/identity/refresh')) { + reply(200, { 'content-type': 'application/json' }, JSON.stringify({ jwt, expires_at: expiresAt })) + return + } + if (req.method === 'GET' && url.pathname === '/v1/config') { + if (!configDoc) { + reply(404, { 'content-type': 'application/json' }, JSON.stringify({ error: 'no_config' })) + return + } + if (req.headers['if-none-match'] === configEtag) { + reply(304, { etag: configEtag }) + return + } + reply( + 200, + { 'content-type': 'application/json', etag: configEtag }, + JSON.stringify(configDoc) + ) + return + } + if (req.method === 'POST' && url.pathname.startsWith('/v1/ingest/')) { + reply(202, {}) + return + } + reply(404, { 'content-type': 'application/json' }, JSON.stringify({ error: 'not_found' })) + }) + + await new Promise((resolve) => server.listen(0, '127.0.0.1', () => resolve(undefined))) + const address = /** @type {import('node:net').AddressInfo} */ (server.address()) + + return { + baseUrl: `http://127.0.0.1:${address.port}`, + requests, + /** @param {unknown} doc @param {string} etag */ + setConfig(doc, etag) { + configDoc = doc + configEtag = etag + }, + close() { + return new Promise((resolve) => server.close(() => resolve(undefined))) + }, + } +} + +/** + * Unsigned JWT with the `sub` claim the identity client decodes. The + * gateway never verifies signatures (it trusts TLS), so a fake + * signature is wire-faithful enough for the smoke. 
+ * + * @param {string} sub + */ +function buildFakeJwt(sub) { + /** @param {object} obj */ + const b64 = (obj) => Buffer.from(JSON.stringify(obj)).toString('base64url') + return `${b64({ alg: 'none', typ: 'JWT' })}.${b64({ sub })}.smoke` +} + +/* ---------- helpers ---------- */ + +function makeBuf() { + let value = '' + return { + /** @param {string} chunk */ + write(chunk) { + value += String(chunk) + return true + }, + text() { + return value + }, + } +} + +/** + * @template T + * @param {Promise} promise + * @param {number} ms + * @param {string} message + * @returns {Promise} + */ +function withTimeout(promise, ms, message) { + /** @type {NodeJS.Timeout} */ + let timer + return Promise.race([ + promise.finally(() => clearTimeout(timer)), + new Promise((_resolve, reject) => { + timer = setTimeout(() => reject(new Error(`join_flow_remote_config: ${message}`)), ms) + }), + ]) +} + +/** + * @param {() => boolean} predicate + * @param {number} ms + * @param {string} message + */ +async function waitFor(predicate, ms, message) { + const deadline = Date.now() + ms + while (Date.now() < deadline) { + if (predicate()) return + await new Promise((resolve) => setTimeout(resolve, 50)) + } + throw new Error(`join_flow_remote_config: ${message}`) +} diff --git a/llp/0017-daemon-runtime.decision.md b/llp/0017-daemon-runtime.decision.md index ed6631d..8b41262 100644 --- a/llp/0017-daemon-runtime.decision.md +++ b/llp/0017-daemon-runtime.decision.md @@ -48,7 +48,9 @@ Consequences: (`KeepAlive` / `Restart=always`). This is now a requirement of the installers, not a nicety. - A foreground (non-service) daemon cannot relaunch itself: it exits with a - distinct restart exit code and the invoker (smoke harness, dev shell) loops. + distinct restart exit code — **75** (`EX_TEMPFAIL`, + `DAEMON_RESTART_EXIT_CODE`) — and the invoker (smoke harness, dev shell) + loops on that code. 
- Same-shape reload ([LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) remains the path for in-place source config changes; there are exactly two paths, distinguished by whether the plugin set / plugin code changed. diff --git a/llp/0023-remote-config-join-flow.spec.md b/llp/0023-remote-config-join-flow.spec.md index 9109b71..56b4b10 100644 --- a/llp/0023-remote-config-join-flow.spec.md +++ b/llp/0023-remote-config-join-flow.spec.md @@ -53,10 +53,16 @@ the LLP 0014 sink contract. `@hypaware/central`'s `src/sink.js` notes that refresh and config pull "live on their own timers when wired in" — this spec wires the config pull: -- Pull **immediately on bootstrap success**, then on a steady timer (minutes; - 304s are cheap — the server ETag is a content hash of the served revision). -- The `proto.md` ETag/304/404/429 semantics are unchanged. The etag sidecar - (`config-etag.json`) behavior stands. +- Pull **immediately on bootstrap success**, then on a steady timer + (`poll_interval_seconds`, default **300 s** — 304s are cheap; the server + ETag is a content hash of the served revision). +- The `proto.md` ETag/304/404/429 semantics are unchanged. The running + config's etag persists across restarts so a relaunch short-circuits to + 304; it is kernel-managed state read through the facade (below). +- A pulled 200 body above **1 MiB** is dropped — enforced at both the + transport (before the body is fully buffered, let alone parsed) and the apply + engine. Wholesale-replace means an authenticated 200 goes straight into + memory and onto disk; the stated cap is one line of defense-in-depth.
- **`If-None-Match` must reflect the *running* config, never a downloaded-but-not-yet-applied one.** The server reads this header to track fleet convergence (it lands in the queryable `gateways` dataset), so a @@ -110,15 +116,23 @@ Recommended persistence idiom: **A/B slots** — write each config to its own path and flip an atomic pointer (symlink or one-line file) as the last step before exit. Same semantics as "file swap," but a crash between persist and restart can never leave an ambiguous operative config, and last-known-good -is crash-safe by construction. +is crash-safe by construction. As implemented: slot files live under +`<state-root>/config-control/`, the operative config path becomes a relative +symlink to the active slot (replaced atomically via tmp + rename), and each +slot carries its served etag in a per-slot sidecar written before the flip — +so the document and its etag commit on the same rename, in both directions. ### Apply engine is kernel surface The central plugin is **transport only**: pull, ETag bookkeeping, auth. It -hands a downloaded document to a narrow kernel facade (shape TBD at -implementation, e.g. `ctx.configControl.stage(document)`); the **kernel** -owns validate → install pinned plugins → persist last-known-good → swap → -restart, and the rollback bookkeeping. Recorded in +hands a downloaded document to a narrow kernel facade — +`ctx.configControl.stage(document, etag)`, plus `confirmPoll()` (poll +liveness) and `runningEtag()` (for `If-None-Match`); the **kernel** owns +validate → install pinned plugins → persist last-known-good → swap → +restart, and the rollback bookkeeping. The facade exists only where an +apply engine runs (the daemon); plain CLI boots leave `ctx.configControl` +undefined and the plugin keeps its pull loop off — `hyp status` must not +fire config polls as a side effect. Recorded in [LLP 0003](./0003-core-vs-plugin-surface.spec.md#core-owns).
Why kernel-side: rollback state must survive the restart and pairs with the @@ -129,14 +143,15 @@ future second management channel reuses it. Consequently state** ([LLP 0004](./0004-activation-and-paths.spec.md#state-directories)), not the central plugin's state dir. -The `config-etag.json` sidecar must transition **atomically with the -operative config, in both directions**: it carries the etag of the *running* -config, so apply moves it forward and rollback reverts it (otherwise a -rolled-back gateway would present a converged etag while running -last-known-good). Since every sidecar change coincides with an apply or -rollback, the facade takes the etag alongside the document and the **apply -engine stages the sidecar with the swap**; the central plugin only reads it -(at boot, to populate `If-None-Match`). +The etag sidecar must transition **atomically with the operative config, in +both directions**: it carries the etag of the *running* config, so apply +moves it forward and rollback reverts it (otherwise a rolled-back gateway +would present a converged etag while running last-known-good). Since every +sidecar change coincides with an apply or rollback, the facade takes the +etag alongside the document and the **apply engine stages the sidecar with +the swap** (realized as the per-slot etag files above — flipping the +pointer flips the etag); the central plugin only reads it, through +`configControl.runningEtag()`, to populate `If-None-Match`. Identity state (`identity.json`, JWT, gateway id) is **not config** and is never touched by config application. @@ -227,9 +242,14 @@ A probation-clearing poll may itself return 200 with a newer revision; that triggers an immediate next apply, with its own probation. This chaining is correct — do not serialize or suppress it. -W must comfortably exceed one poll interval plus retry backoff (e.g. 
-`max(3 × poll_interval_seconds, floor)` rather than a fixed constant), so a -slow operator-chosen poll cadence cannot make every apply roll back. +W must comfortably exceed one poll interval plus retry backoff: +`W = max(3 × poll_interval_seconds, 120 s)` — a formula, not a fixed +constant, so a slow operator-chosen poll cadence cannot make every apply +roll back, and the 120 s floor leaves room for relaunch + identity refresh ++ one retry even at the fastest cadence. The interval is taken from the +*staged* document's central sink block (that is the sink that will, or +won't, confirm the poll); the kernel falls back to the 300 s default when +the block doesn't set one. Rollback from the **first** applied config lands back on the seed config — fine by construction: seed-config mode is a legitimate polling steady state, @@ -269,17 +289,18 @@ mint-requires-config, serving, convergence columns) and ships dark. Nothing server-side is blocked on the client; nothing client-side is blocked on the server except end-to-end testing. +## Settled at implementation (2026-06-12) + +Three knobs the draft left open were fixed when the client landed: + +- **Poll cadence default: 300 s** (5 minutes). Validated range stays + 5–3600 s. +- **Maximum config document size: 1 MiB**, enforced at both the transport + and the apply engine. +- **Probation floor: 120 s** (`W = max(3 × poll_interval_seconds, 120 s)`). + ## Open questions -- Exact poll cadence default (the spec says "minutes"; pick a number when - wiring the timer). -- Maximum accepted config document size. Wholesale-replace means an - authenticated 200 of arbitrary size goes straight into memory and onto - disk; a stated cap is one line of defense-in-depth. Pick a generous bound - when wiring the pull. -- Exact probation window formula (the *signal* and the - `max(3 × poll_interval_seconds, floor)` shape are decided; pick the floor - when wiring). 
- **Strict version pins for bundled plugins vs rolling kernel upgrades.** The strict check (above) means a kernel upgrade that bumps bundled plugin versions de-converges the fleet until the central config's pins are diff --git a/src/core/cli/core_commands.js b/src/core/cli/core_commands.js index 85f4118..02bba64 100644 --- a/src/core/cli/core_commands.js +++ b/src/core/cli/core_commands.js @@ -215,6 +215,12 @@ function buildCoreCommands() { usage: 'hyp init [preset]', run: runInit, }, + { + name: 'join', + summary: 'Join a centrally-managed fleet (write seed config + install daemon)', + usage: 'hyp join [token] [--token-file ] [--bin ] [--no-daemon]', + run: runJoin, + }, { name: 'attach', summary: 'Attach an AI client to the local gateway', @@ -504,6 +510,22 @@ function renderStatusJson({ report, clientNames, datasets, cacheRoot }) { oldest_partition_date: report.cache.oldestDate, }, recent_error_count: report.recentErrorCount, + // Remote-config apply state (LLP 0023). All-null until the gateway + // applies its first centrally-served config. + remote_config: report.remoteConfig + ? { + running_etag: report.remoteConfig.runningEtag, + probation: report.remoteConfig.probation + ? { + etag: report.remoteConfig.probation.etag, + applied_at: report.remoteConfig.probation.applied_at, + until: report.remoteConfig.probation.until, + } + : null, + last_rollback: report.remoteConfig.lastRollback, + bad_etag: report.remoteConfig.badEtag, + } + : null, diagnostics: report.diagnostics.map((d) => ({ severity: d.severity, kind: d.kind, @@ -596,6 +618,23 @@ function renderStatusText({ report, clientNames, datasets, cacheRoot, stdout }) stdout.write(` datasets: ${datasets.length}\n`) stdout.write(` recent errors: ${report.recentErrorCount}\n`) + // Remote-config section appears only once the gateway has state to + // show — a never-joined install keeps the V1 status surface. 
+ const rc = report.remoteConfig + if (rc && (rc.runningEtag || rc.probation || rc.lastRollback || rc.badEtag)) { + stdout.write(' remote config:\n') + if (rc.runningEtag) stdout.write(` running etag: ${rc.runningEtag}\n`) + if (rc.probation) { + stdout.write(` probation: ${rc.probation.etag} until ${rc.probation.until}\n`) + } + if (rc.lastRollback) { + stdout.write(` last rollback: ${rc.lastRollback.etag} at ${rc.lastRollback.at} (${rc.lastRollback.reason})\n`) + } + if (rc.badEtag) { + stdout.write(` bad etag: ${rc.badEtag.etag} (${rc.badEtag.reason})\n`) + } + } + if (report.diagnostics.length > 0) { stdout.write(' diagnostics:\n') for (const d of report.diagnostics) { @@ -2753,6 +2792,193 @@ async function runInitFromFile(flags, ctx) { return 0 } +/** + * `hyp join [token]` — join a centrally-managed fleet. Pure + * sugar over two existing steps: write the seed config (an ordinary v2 + * config containing exactly the central plugin) and run the + * non-interactive daemon install. Doing those two steps by hand is + * specified to be exactly equivalent. + * + * Because a policy token is a multi-use fleet-wide credential, the + * token can (and for MDM scripts, should) arrive via `--token-file` + * or stdin instead of argv — a bare argv token lands in shell history + * and process listings. The seed config is written mode 0600. 
+ * + * @param {string[]} argv + * @param {CommandRunContext} ctx + * @ref LLP 0023#seed-config-mode [implements] — join = write-seed-config + daemon install; a wrapper, not a second code path + */ +async function runJoin(argv, ctx) { + const parsed = parseJoinArgs(argv) + if (parsed.help) { + ctx.stdout.write('usage: hyp join [token] [--token-file ] [--bin ] [--no-daemon]\n') + ctx.stdout.write(' token sources (pick one): positional argument, --token-file, or stdin\n') + return 0 + } + if (parsed.error) { + ctx.stderr.write(`hyp join: ${parsed.error}\n`) + return 2 + } + + try { + const url = new URL(/** @type {string} */ (parsed.url)) + if (url.protocol !== 'http:' && url.protocol !== 'https:') { + ctx.stderr.write(`hyp join: url must be http(s); got ${url.protocol}\n`) + return 2 + } + } catch { + ctx.stderr.write(`hyp join: not a valid URL: ${parsed.url}\n`) + return 2 + } + + /** @type {string | undefined} */ + let token = parsed.token + if (token === undefined && parsed.tokenFile !== undefined) { + try { + token = (await fs.readFile(parsed.tokenFile, 'utf8')).trim() + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err) + ctx.stderr.write(`hyp join: --token-file: ${message}\n`) + return 1 + } + } + if (token === undefined) { + if (isTty(ctx.stdin)) { + ctx.stderr.write('hyp join: no token given — pass it as an argument, via --token-file, or on stdin\n') + return 2 + } + token = (await readAllStdin(ctx.stdin)).trim() + } + if (token.length === 0) { + ctx.stderr.write('hyp join: token is empty\n') + return 2 + } + + /** @type {HypAwareV2Config} */ + const seed = { + version: 2, + plugins: [{ name: '@hypaware/central' }], + sinks: { + central: { + plugin: '@hypaware/central', + config: { + url: /** @type {string} */ (parsed.url), + identity: { bootstrap_token: token }, + }, + }, + }, + } + + const catalogCtx = await buildKnownPluginsForCtx(ctx) + const validation = await validateConfig(seed, { + knownPlugins: catalogCtx.knownPlugins, + knownDatasets: catalogCtx.knownDatasets, + }) + if (!validation.ok) { + for (const err of validation.errors) { + ctx.stderr.write(`hyp join: [${err.errorKind}] ${err.pointer || ''}: ${err.message}\n`) + } + return 1 + } + + const obsEnv = readObservabilityEnv(ctx.env) + const targetPath = ctx.env.HYP_CONFIG + ? path.resolve(ctx.env.HYP_CONFIG) + : defaultConfigPath(obsEnv.hypHome) + + return withSpan( + 'join.run', + { + [Attr.COMPONENT]: 'join', + [Attr.OPERATION]: 'join.run', + config_path: targetPath, + install_daemon: !parsed.noDaemon, + status: 'ok', + }, + async (span) => { + // The token is the only credential on disk until the first + // bootstrap, so the seed write is atomic and mode 0600. 
+ await fs.mkdir(path.dirname(targetPath), { recursive: true }) + const tmp = `${targetPath}.tmp.${process.pid}.${Date.now()}` + await fs.writeFile(tmp, JSON.stringify(seed, null, 2) + '\n', { mode: 0o600 }) + await fs.rename(tmp, targetPath) + ctx.stdout.write(`✓ Wrote seed config ${targetPath}\n`) + + if (parsed.noDaemon) { + ctx.stdout.write(' daemon install skipped (--no-daemon); run `hyp daemon install` to finish joining\n') + return 0 + } + + const installArgv = parsed.binPath !== undefined ? ['--bin', parsed.binPath] : [] + const code = await runDaemonInstall(installArgv, ctx) + if (code !== 0) { + span.setAttribute('status', 'failed') + span.setAttribute('error_kind', 'daemon_install_failed') + return code + } + ctx.stdout.write('✓ Joined — the daemon will pull its configuration from the server\n') + return 0 + }, + { component: 'join' } + ) +} + +/** + * @param {string[]} argv + * @returns {{ help?: boolean, error?: string, url?: string, token?: string, tokenFile?: string, binPath?: string, noDaemon?: boolean }} + */ +function parseJoinArgs(argv) { + /** @type {{ help?: boolean, error?: string, url?: string, token?: string, tokenFile?: string, binPath?: string, noDaemon?: boolean }} */ + const r = {} + /** @type {string[]} */ + const positional = [] + for (let i = 0; i < argv.length; i += 1) { + const token = argv[i] + if (token === '--help' || token === '-h') { r.help = true; return r } + if (token === '--no-daemon') { r.noDaemon = true; continue } + if (token === '--token-file' || token.startsWith('--token-file=')) { + const value = token === '--token-file' ? argv[++i] : token.slice('--token-file='.length) + if (!value) return { error: '--token-file: requires a path' } + r.tokenFile = value + continue + } + if (token === '--bin' || token.startsWith('--bin=')) { + const value = token === '--bin' ? 
argv[++i] : token.slice('--bin='.length) + if (!value) return { error: '--bin: requires a path' } + r.binPath = value + continue + } + if (token.startsWith('-') && token !== '-') { + return { error: `unknown argument: ${token}` } + } + positional.push(token) + } + if (positional.length === 0) return { error: 'missing (see hyp join --help)' } + if (positional.length > 2) return { error: `unexpected argument: ${positional[2]}` } + r.url = positional[0] + // '-' as the token positional means "read from stdin", same as + // omitting it on a piped invocation. + if (positional.length === 2 && positional[1] !== '-') r.token = positional[1] + if (r.token !== undefined && r.tokenFile !== undefined) { + return { error: 'pass the token either as an argument or via --token-file, not both' } + } + return r +} + +/** + * @param {unknown} stdin + * @returns {Promise} + */ +async function readAllStdin(stdin) { + const stream = /** @type {AsyncIterable | undefined} */ (stdin) + if (!stream || typeof (/** @type {any} */ (stream))[Symbol.asyncIterator] !== 'function') return '' + let out = '' + for await (const chunk of stream) { + out += typeof chunk === 'string' ? 
chunk : chunk.toString('utf8') + } + return out +} + /** @param {unknown} stream */ function isTty(stream) { return !!stream && typeof stream === 'object' && /** @type {{ isTTY?: boolean }} */ (stream).isTTY === true diff --git a/src/core/config/apply.js b/src/core/config/apply.js new file mode 100644 index 0000000..1655cb4 --- /dev/null +++ b/src/core/config/apply.js @@ -0,0 +1,549 @@ +// @ts-check + +import fs from 'node:fs' +import path from 'node:path' + +import { Attr, getLogger, withSpan } from '../observability/index.js' + +/** + * @import { ConfigControlFacade, ConfigStageResult, HypAwareV2Config, PluginConfigInstance } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { + * ConfigApplyDeps, + * ConfigControl, + * ConfigControlState, + * ConfigControlStatus, + * ConfigRollbackReason, + * ConfigSlot, + * CreateConfigControlOptions, + * ProbationMarker, + * } from './types.d.ts' + */ + +/** + * Maximum accepted config document size in bytes. A pulled 200 body is + * parsed and persisted wholesale, so a stated cap bounds memory and + * disk regardless of what an authenticated server sends. 1 MiB is + * orders of magnitude above any real config. + * @ref LLP 0023#config-pull-loop [implements] — max accepted config document size, settled at 1 MiB + */ +export const MAX_CONFIG_DOCUMENT_BYTES = 1024 * 1024 + +/** + * Default config pull cadence (seconds) when the staged document's + * central sink does not set `poll_interval_seconds`. Mirrors the + * central plugin's own default — the kernel needs the value to size + * the probation window without asking the plugin. + */ +export const DEFAULT_POLL_INTERVAL_SECONDS = 300 + +/** + * Probation window floor (seconds). The window is + * `max(3 × poll_interval_seconds, floor)` so a fast poll cadence still + * leaves room for daemon relaunch + identity refresh + one retry. 
+ * @ref LLP 0023#post-apply-probation [implements] — window formula with the floor settled at 120s + */ +export const PROBATION_FLOOR_SECONDS = 120 + +const CONTROL_DIRNAME = 'config-control' +const STATE_BASENAME = 'state.json' + +/** + * Build the kernel config apply engine: validate → install pinned + * plugins → persist to an A/B slot → flip the operative pointer → + * staged restart, plus probation and last-known-good rollback. + * + * Persistence idiom: each applied config is written to its own slot + * file under `<stateRoot>/config-control/`, with the served ETag in a + * per-slot sidecar written *before* the flip. The operative config + * path becomes a symlink to the active slot, replaced atomically via + * tmp+rename — so the config document and its etag transition together + * in both directions (apply and rollback), and last-known-good is + * crash-safe by construction (the previous slot is never modified). + * + * @param {CreateConfigControlOptions} opts + * @returns {ConfigControl} + * @ref LLP 0023#apply-engine-is-kernel-surface [implements] — the engine is kernel-owned; plugins only see the narrow facade + */ +export function createConfigControl(opts) { + const { stateRoot, configPath, requestRestart } = opts + const now = opts.now ??
Date.now + const log = getLogger('config-control') + const controlDir = path.join(stateRoot, CONTROL_DIRNAME) + const statePath = path.join(controlDir, STATE_BASENAME) + + /** @type {ConfigApplyDeps | null} */ + let applyDeps = null + /** @type {NodeJS.Timeout | null} */ + let watchdog = null + let restartPending = false + + /** @returns {ConfigControlState} */ + function readState() { + return readControlState(statePath) + } + + /** @param {ConfigControlState} state */ + function writeState(state) { + fs.mkdirSync(controlDir, { recursive: true, mode: 0o700 }) + const tmp = `${statePath}.tmp.${process.pid}.${now()}` + fs.writeFileSync(tmp, JSON.stringify(state, null, 2) + '\n', { mode: 0o600 }) + fs.renameSync(tmp, statePath) + } + + /** @param {ConfigSlot} slot */ + function slotPath(slot) { + return path.join(controlDir, `config.${slot}.json`) + } + + /** @param {ConfigSlot} slot */ + function slotEtagPath(slot) { + return path.join(controlDir, `config.${slot}.etag`) + } + + /** @returns {ConfigSlot | null} */ + function activeSlot() { + return readActiveSlot(controlDir, configPath) + } + + /** + * Atomically point the operative config path at `slot`. A relative + * symlink is created at a tmp path and renamed over the config path, + * so a crash leaves either the old or the new pointer — never + * neither. + * + * @param {ConfigSlot} slot + */ + function flipPointer(slot) { + const target = path.relative(path.dirname(configPath), slotPath(slot)) + const tmp = `${configPath}.tmp.${process.pid}.${now()}` + fs.symlinkSync(target, tmp) + fs.renameSync(tmp, configPath) + } + + /** @returns {string | undefined} */ + function runningEtag() { + return readRunningEtag(controlDir, configPath) + } + + /** + * Revert to the previous operative config: flip the pointer back + * (the per-slot etag sidecar reverts with it), clear probation, + * remember the bad etag, and record the structured rollback reason. 
+ * + * @param {ProbationMarker} marker + * @param {ConfigRollbackReason} reason + * @param {string} [detail] + * @ref LLP 0023#last-known-good-rollback [implements] — flip back + remembered bad etag + structured reason, recorded client-side from day one + */ + function rollback(marker, reason, detail) { + if (marker.previous_slot) { + flipPointer(marker.previous_slot) + } + const at = new Date(now()).toISOString() + const state = readState() + delete state.probation + state.bad_etag = { etag: marker.etag, reason, recorded_at: at } + state.last_rollback = { + etag: marker.etag, + reason, + at, + ...(detail ? { detail } : {}), + } + writeState(state) + log.warn('config.rollback', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.rollback', + [Attr.ERROR_KIND]: reason, + config_etag: marker.etag, + rolled_back_to_slot: marker.previous_slot ?? 'none', + ...(detail ? { detail } : {}), + status: 'ok', + }) + } + + function disarmProbationWatchdog() { + if (watchdog) { + clearTimeout(watchdog) + watchdog = null + } + } + + /** + * Arm the in-process probation timer for the active marker, if any. + * Expiry rolls back and requests a staged restart onto + * last-known-good. The kernel owns this timer — a wedged central + * sink is exactly the failure probation must catch. 
+ * @ref LLP 0023#post-apply-probation [implements] — kernel-owned watchdog, independent of the central plugin functioning + */ + function armProbationWatchdog() { + disarmProbationWatchdog() + const state = readState() + const marker = state.probation + if (!marker) return + const remainingMs = Math.max(0, Date.parse(marker.until) - now()) + watchdog = setTimeout(() => { + watchdog = null + const current = readState().probation + if (!current || current.etag !== marker.etag) return + log.error('config.probation_expired', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.probation_expired', + config_etag: marker.etag, + status: 'failed', + }) + rollback(current, 'probation_expired') + restartPending = true + requestRestart('probation_expired') + }, remainingMs) + if (typeof watchdog.unref === 'function') watchdog.unref() + } + + /** + * Boot-time probation evaluation, run before plugin activation: a + * kernel-killing-but-valid config can crashloop under the service + * manager faster than any in-process timer fires, so each relaunch + * checks the marker first. + * @ref LLP 0023#post-apply-probation [implements] — probation expiry is evaluated at boot, before plugin activation + */ + async function evaluateAtBoot() { + const state = readState() + const marker = state.probation + if (!marker) return { action: /** @type {const} */ ('none') } + + // A marker whose slot is not the operative pointer means the apply + // crashed between persisting the marker and flipping — the new + // config never took effect, so there is nothing to probe. 
+ if (activeSlot() !== marker.slot) { + delete state.probation + writeState(state) + log.warn('config.probation_orphaned', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.probation_orphaned', + config_etag: marker.etag, + status: 'ok', + }) + return { action: /** @type {const} */ ('cleared_orphan') } + } + + if (Date.parse(marker.until) <= now()) { + rollback(marker, 'probation_expired') + return { action: /** @type {const} */ ('rolled_back') } + } + return { action: /** @type {const} */ ('none') } + } + + function confirmPoll() { + const state = readState() + if (!state.probation) return + const etag = state.probation.etag + delete state.probation + writeState(state) + disarmProbationWatchdog() + log.info('config.probation_cleared', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.probation_cleared', + config_etag: etag, + status: 'ok', + }) + } + + /** + * @param {unknown} document + * @param {string} etag + * @returns {Promise} + */ + async function stage(document, etag) { + return withSpan( + 'config.apply', + { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.apply', + config_etag: etag, + status: 'ok', + }, + async (span) => { + /** @param {import('../../../collectivus-plugin-kernel-types.d.ts').ConfigApplyErrorKind} errorKind @param {string} message */ + function fail(errorKind, message) { + span.setAttribute('status', 'failed') + span.setAttribute('error_kind', errorKind) + log.error('config.apply_failed', { + [Attr.COMPONENT]: 'config-control', + [Attr.ERROR_KIND]: errorKind, + config_etag: etag, + message, + }) + return /** @type {ConfigStageResult} */ ({ ok: false, errorKind, message }) + } + + if (restartPending) { + return fail('restart_pending', 'a staged restart is already pending') + } + if (!applyDeps) { + return fail('apply_engine_not_ready', 'apply engine has no validator/installer attached') + } + if (typeof etag !== 'string' || etag.length === 0) { + return fail('config_invalid', 'stage() 
requires the served etag') + } + if (etag === runningEtag()) { + span.setAttribute('apply_action', 'noop_same_etag') + return { ok: true, action: 'noop_same_etag' } + } + + const state = readState() + // Re-apply backoff: one remembered bad etag, skipped until the + // server serves a different revision. Re-polling is fine; an + // apply-crash loop is not. + if (state.bad_etag && state.bad_etag.etag === etag) { + span.setAttribute('apply_action', 'skipped_bad_etag') + log.warn('config.apply_skipped', { + [Attr.COMPONENT]: 'config-control', + config_etag: etag, + hyp_reason: 'bad_etag_backoff', + }) + return { ok: true, action: 'skipped_bad_etag' } + } + + const serialized = JSON.stringify(document, null, 2) + '\n' + if (Buffer.byteLength(serialized, 'utf8') > MAX_CONFIG_DOCUMENT_BYTES) { + return fail('document_too_large', `config document exceeds ${MAX_CONFIG_DOCUMENT_BYTES} bytes`) + } + + const validation = await applyDeps.validateDocument(document) + if (!validation.ok) { + const first = validation.errors[0] + rememberBadEtag(etag, 'validation_failed') + return fail( + 'config_invalid', + first ? `${first.pointer || ''}: ${first.message}` : 'config validation failed' + ) + } + const config = /** @type {HypAwareV2Config} */ (document) + + const install = await applyDeps.installPinnedPlugins(config.plugins ?? []) + if (!install.ok) { + rememberBadEtag( + etag, + install.errorKind === 'artifact_hash_mismatch' + ? 'artifact_hash_mismatch' + : install.errorKind === 'bundled_version_mismatch' + ? 'bundled_version_mismatch' + : 'plugin_install_failed' + ) + return fail(install.errorKind, install.message) + } + + try { + commit(config, serialized, etag) + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err) + return fail('apply_io_error', message) + } + + span.setAttribute('apply_action', 'applied') + log.info('config.applied', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.apply', + config_etag: etag, + status: 'ok', + }) + restartPending = true + requestRestart('config_applied') + return /** @type {ConfigStageResult} */ ({ ok: true, action: 'applied' }) + }, + { component: 'config-control' } + ) + } + + /** + * Remember a rejected revision so re-polls don't become an + * apply-fail loop. Pre-flip failures (validation, install) record + * only the bad etag + reason; `last_rollback` is reserved for actual + * reverts of an applied config. + * + * @param {string} etag + * @param {ConfigRollbackReason} reason + */ + function rememberBadEtag(etag, reason) { + const state = readState() + state.bad_etag = { etag, reason, recorded_at: new Date(now()).toISOString() } + writeState(state) + } + + /** + * Persist `serialized` to the inactive slot, write its etag sidecar + * and the probation marker, then flip the pointer as the last step. + * Ordering is the crash-safety argument: everything before the flip + * is invisible to boot; the flip itself is atomic; the marker's + * `slot` field lets `evaluateAtBoot` discard a marker whose flip + * never happened. + * + * @param {HypAwareV2Config} config + * @param {string} serialized + * @param {string} etag + * @ref LLP 0023#apply-semantics-staged-restart [implements] — A/B slots with an atomic pointer; never live-mutate; restart does the activation + */ + function commit(config, serialized, etag) { + fs.mkdirSync(controlDir, { recursive: true, mode: 0o700 }) + const current = activeSlot() + + /** @type {ConfigSlot | null} */ + let previousSlot = current + if (current === null) { + // First apply over a regular file (the seed, or a hand-written + // config): preserve its bytes in slot 'a' so rollback lands back + // on it. 
Seed-config mode is a legitimate steady state, so this + // is a safe rollback target by construction. + let seedRaw = null + try { + seedRaw = fs.readFileSync(configPath, 'utf8') + } catch (err) { + if (/** @type {NodeJS.ErrnoException} */ (err).code !== 'ENOENT') throw err + } + if (seedRaw !== null) { + fs.writeFileSync(slotPath('a'), seedRaw, { mode: 0o600 }) + fs.rmSync(slotEtagPath('a'), { force: true }) + previousSlot = 'a' + } + } + + /** @type {ConfigSlot} */ + const target = previousSlot === 'b' ? 'a' : 'b' + fs.writeFileSync(slotPath(target), serialized, { mode: 0o600 }) + fs.writeFileSync(slotEtagPath(target), etag + '\n', { mode: 0o600 }) + + const pollSeconds = pollIntervalFromConfig(config) + const windowSeconds = Math.max(3 * pollSeconds, PROBATION_FLOOR_SECONDS) + const state = readState() + state.probation = { + etag, + applied_at: new Date(now()).toISOString(), + until: new Date(now() + windowSeconds * 1000).toISOString(), + slot: target, + previous_slot: previousSlot, + } + writeState(state) + + flipPointer(target) + } + + /** @returns {Promise} */ + async function status() { + return readConfigControlStatus({ stateRoot, configPath }) + } + + return { + stage, + confirmPoll, + runningEtag, + evaluateAtBoot, + attachApplyDeps(deps) { applyDeps = deps }, + armProbationWatchdog, + disarmProbationWatchdog, + status, + } +} + +/* ---------- shared read-only helpers ---------- */ + +/** + * @param {string} statePath + * @returns {ConfigControlState} + */ +function readControlState(statePath) { + let raw + try { + raw = fs.readFileSync(statePath, 'utf8') + } catch (err) { + if (/** @type {NodeJS.ErrnoException} */ (err).code === 'ENOENT') return {} + throw err + } + const parsed = JSON.parse(raw) + return parsed && typeof parsed === 'object' ? parsed : {} +} + +/** + * Which slot the operative config symlink points at, or null when it + * is a regular file (seed / hand-written config) or missing. 
+ * + * @param {string} controlDir + * @param {string} configPath + * @returns {ConfigSlot | null} + */ +function readActiveSlot(controlDir, configPath) { + let target + try { + target = fs.readlinkSync(configPath) + } catch { + return null + } + const resolved = path.resolve(path.dirname(configPath), target) + if (resolved === path.join(controlDir, 'config.a.json')) return 'a' + if (resolved === path.join(controlDir, 'config.b.json')) return 'b' + return null +} + +/** + * @param {string} controlDir + * @param {string} configPath + * @returns {string | undefined} + */ +function readRunningEtag(controlDir, configPath) { + const slot = readActiveSlot(controlDir, configPath) + if (!slot) return undefined + try { + const etag = fs.readFileSync(path.join(controlDir, `config.${slot}.etag`), 'utf8').trim() + return etag.length > 0 ? etag : undefined + } catch { + return undefined + } +} + +/** + * Read-only view of the apply engine's state for `hypaware status` — + * usable from any process (the CLI is not the daemon), so it never + * constructs the engine or takes its hooks. + * + * @param {{ stateRoot: string, configPath: string }} args + * @returns {ConfigControlStatus} + * @ref LLP 0023#last-known-good-rollback [implements] — operator-visible probation/rollback/bad-etag state without log spelunking + */ +export function readConfigControlStatus({ stateRoot, configPath }) { + const controlDir = path.join(stateRoot, CONTROL_DIRNAME) + /** @type {ConfigControlState} */ + let state = {} + try { + state = readControlState(path.join(controlDir, STATE_BASENAME)) + } catch { + // unreadable state surfaces as empty — status is best-effort + } + return { + probation: state.probation ?? null, + lastRollback: state.last_rollback ?? null, + badEtag: state.bad_etag ?? null, + runningEtag: readRunningEtag(controlDir, configPath) ?? null, + } +} + +/** + * Extract the config pull cadence from the staged document's central + * sink block to size the probation window. 
The window must track the + * *new* config's cadence — that is the sink that will (or won't) + * confirm the poll. Knowing the first-party plugin name here mirrors + * the client-descriptor precedent in `plugin_catalog.js`. + * + * @param {HypAwareV2Config} config + * @returns {number} + */ +function pollIntervalFromConfig(config) { + let min = Infinity + for (const sink of Object.values(config.sinks ?? {})) { + if (!('plugin' in sink) || sink.plugin !== '@hypaware/central') continue + const v = sink.config?.poll_interval_seconds + if (typeof v === 'number' && Number.isFinite(v) && v > 0) { + min = Math.min(min, v) + } else { + min = Math.min(min, DEFAULT_POLL_INTERVAL_SECONDS) + } + } + return Number.isFinite(min) ? min : DEFAULT_POLL_INTERVAL_SECONDS +} diff --git a/src/core/config/apply_deps.js b/src/core/config/apply_deps.js new file mode 100644 index 0000000..e8db6d1 --- /dev/null +++ b/src/core/config/apply_deps.js @@ -0,0 +1,144 @@ +// @ts-check + +import { Attr, getLogger } from '../observability/index.js' +import { parseConfigShape } from './schema.js' +import { validateConfig } from './validate.js' +import { buildPluginCatalog } from '../plugin_catalog.js' +import { discoverBundledPlugins } from '../runtime/bundled.js' +import { discoverInstalledPlugins } from '../runtime/installed.js' +import { installPlugin, loadLock } from '../plugin_install/install.js' +import { getEntry } from '../plugin_install/lock.js' + +/** + * @import { PluginConfigInstance, PluginName, ValidationError } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { ConfigApplyDeps, PinnedInstallResult } from './types.d.ts' + */ + +/** + * Build the apply-time dependencies the config apply engine needs: + * full-document validation against the live plugin catalog, and + * hash-pinned plugin installation through the LLP 0007 install path. 
+ * Constructed by the daemon after kernel boot (the catalog needs the + * bundled manifest set) and attached via + * `configControl.attachApplyDeps()`. + * + * @param {{ stateRoot: string, workspaceDir?: string }} opts + * @returns {ConfigApplyDeps} + */ +export function buildConfigApplyDeps(opts) { + const { stateRoot, workspaceDir } = opts + const log = getLogger('config-control') + + /** + * Discover bundled + installed manifests fresh per apply: an apply + * may have just installed a plugin, and a stale catalog would reject + * the very config that named it. + */ + async function discover() { + const bundled = await discoverBundledPlugins( + workspaceDir !== undefined ? { workspaceDir } : {} + ) + const installed = await discoverInstalledPlugins({ stateDir: stateRoot }) + return { bundled, installed } + } + + /** @param {unknown} document */ + async function validateDocument(document) { + const shape = parseConfigShape(document) + if (!shape.ok) { + return { ok: false, errors: shape.errors } + } + const { bundled, installed } = await discover() + const catalog = buildPluginCatalog( + [...bundled.loaded, ...bundled.excluded], + installed.loaded + ) + const result = await validateConfig(shape.config, { + knownPlugins: catalog.pluginMetadata, + knownDatasets: catalog.knownDatasets, + }) + return { ok: result.ok, errors: /** @type {ValidationError[]} */ (result.errors) } + } + + /** + * Install every pinned plugin the staged config names. Bundled + * first-party plugins satisfy the pin by strict version equality and + * skip the hash check (bundled code is inside the kernel's own trust + * boundary); everything else goes through the regular fetch path, + * with the artifact hash verified before the install commits. 
+ * + * @param {PluginConfigInstance[]} entries + * @returns {Promise} + * @ref LLP 0023#install-on-config-hash-pinned [implements] — existing LLP 0007 install path; hash mismatch is an apply failure + */ + async function installPinnedPlugins(entries) { + const { bundled, installed } = await discover() + /** @type {Map} */ + const bundledVersions = new Map() + for (const m of [...bundled.loaded, ...bundled.excluded]) { + bundledVersions.set(m.manifest.name, m.manifest.version) + } + const installedNames = new Set(installed.loaded.map((m) => m.manifest.name)) + const lock = await loadLock(stateRoot) + + for (const entry of entries) { + if (entry.enabled === false) continue + + const bundledVersion = bundledVersions.get(entry.name) + if (bundledVersion !== undefined) { + // @ref LLP 0023#bundled-first-party-plugins [implements] — version checked strictly, artifact hash not checked for bundled plugins + if (entry.version !== undefined && entry.version !== bundledVersion) { + return { + ok: false, + errorKind: 'bundled_version_mismatch', + message: `plugin ${entry.name}: config pins version ${entry.version} but the bundled version is ${bundledVersion}`, + } + } + continue + } + + const locked = getEntry(lock, /** @type {PluginName} */ (entry.name)) + const satisfied = locked + && installedNames.has(entry.name) + && (entry.version === undefined || locked.version === entry.version) + && (entry.artifact_hash === undefined || locked.content_hash === entry.artifact_hash) + if (satisfied) continue + + const result = await installPlugin({ + rawSource: entry.source ?? entry.name, + stateDir: stateRoot, + ...(entry.version !== undefined ? { opts: { ref: `v${entry.version}` } } : {}), + // The hash pin is verified against the staged artifact before + // the install commits — nothing may substitute code after the + // config was authored. 
+ confirm: async (staged) => { + if (entry.artifact_hash !== undefined && staged.contentHash !== entry.artifact_hash) { + log.error('config.pin_hash_mismatch', { + [Attr.COMPONENT]: 'config-control', + [Attr.PLUGIN]: entry.name, + [Attr.ERROR_KIND]: 'artifact_hash_mismatch', + pinned_hash: entry.artifact_hash, + fetched_hash: staged.contentHash, + }) + return { proceed: false, outcome: 'rejected' } + } + if (entry.version !== undefined && staged.manifest.version !== entry.version) { + return { proceed: false, outcome: 'rejected' } + } + return { proceed: true, outcome: 'auto_yes' } + }, + }) + if (!result.ok) { + const hashRejected = result.errorKind === 'remote_install_rejected' + return { + ok: false, + errorKind: hashRejected ? 'artifact_hash_mismatch' : 'plugin_install_failed', + message: `plugin ${entry.name}: ${result.message}`, + } + } + } + return { ok: true } + } + + return { validateDocument, installPinnedPlugins } +} diff --git a/src/core/config/schema.js b/src/core/config/schema.js index 777f561..fb28557 100644 --- a/src/core/config/schema.js +++ b/src/core/config/schema.js @@ -343,10 +343,20 @@ function parsePluginEntry(entry, pointer, errors) { if (obj.config !== undefined && !isPlainObject(obj.config)) { errors.push({ pointer: `${pointer}/config`, message: 'config must be an object when present' }) } + // Pin fields set by centrally-served configs (LLP 0023). Optional in + // hand-written configs; the apply engine enforces them when present. 
+ for (const key of /** @type {const} */ (['version', 'artifact_hash', 'source'])) { + if (obj[key] !== undefined && !isNonEmptyString(obj[key])) { + errors.push({ pointer: `${pointer}/${key}`, message: `${key} must be a non-empty string when present` }) + } + } /** @type {PluginConfigInstance} */ const out = { name: obj.name } if (typeof obj.enabled === 'boolean') out.enabled = obj.enabled if (isPlainObject(obj.config)) out.config = /** @type {JsonObject} */ (obj.config) + if (isNonEmptyString(obj.version)) out.version = obj.version + if (isNonEmptyString(obj.artifact_hash)) out.artifact_hash = obj.artifact_hash + if (isNonEmptyString(obj.source)) out.source = obj.source return out } diff --git a/src/core/config/types.d.ts b/src/core/config/types.d.ts index 8f968c8..c430bd3 100644 --- a/src/core/config/types.d.ts +++ b/src/core/config/types.d.ts @@ -1,5 +1,9 @@ import type { + ConfigApplyErrorKind, + ConfigControlFacade, + ConfigStageResult, HypAwareV2Config, + PluginConfigInstance, PluginName, CapabilityName, ConfigRegistry, @@ -88,3 +92,116 @@ export interface ValidateResult { pluginCount: number sinkCount: number } + +// ============================================================================= +// Config apply engine (LLP 0023) +// ============================================================================= + +/** Structured rollback reason recorded by the apply engine. */ +export type ConfigRollbackReason = + | 'validation_failed' + | 'plugin_install_failed' + | 'artifact_hash_mismatch' + | 'bundled_version_mismatch' + | 'probation_expired' + +/** A/B slot identifier for persisted config documents. */ +export type ConfigSlot = 'a' | 'b' + +/** + * Probation marker persisted before the staged restart and read back at + * the next boot. `slot` is the slot the apply flipped to; rollback + * flips to `previousSlot` (or back to the pre-apply regular file + * content preserved in that slot). 
+ */ +export interface ProbationMarker { + /** ETag of the applied revision under probation. */ + etag: string + applied_at: string + /** ISO time after which an unconfirmed apply rolls back. */ + until: string + slot: ConfigSlot + previous_slot: ConfigSlot | null +} + +export interface ConfigRollbackRecord { + etag: string + reason: ConfigRollbackReason + at: string + detail?: string +} + +export interface RememberedBadEtag { + etag: string + reason: ConfigRollbackReason + recorded_at: string +} + +/** + * Kernel-managed apply bookkeeping, persisted atomically as one file + * under `/config-control/state.json`. + */ +export interface ConfigControlState { + probation?: ProbationMarker + bad_etag?: RememberedBadEtag + last_rollback?: ConfigRollbackRecord +} + +/** Result of installing one pinned plugin entry during apply. */ +export type PinnedInstallResult = + | { ok: true } + | { ok: false, errorKind: ConfigApplyErrorKind, message: string } + +/** + * Apply-time dependencies the daemon attaches once the kernel has + * booted (the validator needs the plugin catalog; the installer needs + * the bundled manifest set). Both are injectable so the engine state + * machine is testable without HTTP, git, or a real kernel boot. + */ +export interface ConfigApplyDeps { + /** Full document validation: shape + cross-plugin. */ + validateDocument(document: unknown): Promise<{ ok: boolean, errors: ValidationError[] }> + /** Install every pinned plugin the config names; verify pins. */ + installPinnedPlugins(entries: PluginConfigInstance[]): Promise +} + +/** Public status surface for `hypaware status`. */ +export interface ConfigControlStatus { + probation: ProbationMarker | null + lastRollback: ConfigRollbackRecord | null + badEtag: RememberedBadEtag | null + runningEtag: string | null +} + +/** + * Kernel-internal handle to the apply engine. The plugin-facing subset + * is `ConfigControlFacade`; everything else is daemon-only. 
+ */ +export interface ConfigControl extends ConfigControlFacade { + /** + * Evaluate probation state before plugin activation: discard + * orphaned markers (apply never committed), roll back expired ones + * (flips the operative config in place; no restart needed since the + * kernel has not loaded it yet). + */ + evaluateAtBoot(): Promise<{ action: 'none' | 'cleared_orphan' | 'rolled_back' }> + /** Attach post-boot apply dependencies; `stage()` fails before this. */ + attachApplyDeps(deps: ConfigApplyDeps): void + /** Arm the in-process probation watchdog timer when a marker is active. */ + armProbationWatchdog(): void + /** Cancel the watchdog timer (daemon shutdown). */ + disarmProbationWatchdog(): void + status(): Promise +} + +export interface CreateConfigControlOptions { + /** Kernel state root (`/hypaware`). */ + stateRoot: string + /** Operative config path the daemon booted with. */ + configPath: string + /** Staged restart hook; the daemon exits with the restart code. */ + requestRestart(reason: string): void + now?: () => number +} + +export type { ConfigStageResult, ConfigApplyErrorKind } diff --git a/src/core/daemon/runtime.js b/src/core/daemon/runtime.js index cee9936..e0931ec 100644 --- a/src/core/daemon/runtime.js +++ b/src/core/daemon/runtime.js @@ -12,7 +12,9 @@ import { } from '../observability/index.js' import { readObservabilityEnv } from '../observability/env.js' import { loadConfigFile } from '../config/schema.js' -import { bootKernel } from '../runtime/boot.js' +import { createConfigControl } from '../config/apply.js' +import { buildConfigApplyDeps } from '../config/apply_deps.js' +import { bootKernel, resolveConfigPath } from '../runtime/boot.js' import { createSinkDriver } from '../sinks/driver.js' import { materializeSinks } from '../sinks/materialize.js' import { @@ -45,6 +47,16 @@ import { statusFilePath, writeStatusFile } from './status.js' const DEFAULT_TICK_INTERVAL_MS = 60_000 const MIN_TICK_INTERVAL_MS = 25 +/** + * Exit code a 
foreground daemon uses to request its own relaunch after + * a staged config apply or rollback (EX_TEMPFAIL — "try again"). The + * service managers relaunch on any exit (`KeepAlive` / + * `Restart=always`); foreground invokers (smoke harness, dev shells) + * loop on this specific code. + * @ref LLP 0017#staged-restart-for-config-replacement [implements] — a foreground daemon cannot relaunch itself; the invoker loops on this code + */ +export const DAEMON_RESTART_EXIT_CODE = 75 + /** * Boot the kernel, start every configured source, and run sink ticks * on a fixed cadence. Returns a `DaemonHandle` the caller can use to @@ -114,7 +126,7 @@ export async function runDaemon(opts = {}) { const sinkSnapshots = new Map() /** @type {NodeJS.Timeout | null} */ let tickHandle = null - /** @type {((reason: 'signal'|'manual') => Promise) | null} */ + /** @type {((reason: 'signal'|'manual'|'restart') => Promise) | null} */ let triggerShutdown = null let shutdownInFlight = false /** @type {((value: number) => void) | null} */ @@ -138,6 +150,37 @@ export async function runDaemon(opts = {}) { writeStatusFile(stateRoot, status) fileLog.info('daemon.starting', { config_path: opts.configPath ?? null }) + // ----- Config apply engine (LLP 0023) ----- + // Created before bootKernel so probation expiry is evaluated before + // any plugin activates: a kernel-killing-but-valid config that + // crashloops under the service manager may never live long enough + // for an in-process timer to fire. + const operativeConfigPath = resolveConfigPath({ + ...(opts.configPath !== undefined ? { explicit: opts.configPath } : {}), + env, + hypHome, + }) + // An apply can land while the daemon is still wiring up (the pull + // loop's immediate pull races the tail of runDaemon), so a restart + // request before triggerShutdown exists is parked, not dropped. 
+ let pendingRestart = false + const configControl = createConfigControl({ + stateRoot, + configPath: operativeConfigPath, + requestRestart: (reason) => { + fileLog.info('daemon.restart_requested', { hyp_reason: reason }) + if (triggerShutdown) { + void triggerShutdown('restart') + } else { + pendingRestart = true + } + }, + }) + const bootEval = await configControl.evaluateAtBoot() + if (bootEval.action !== 'none') { + fileLog.warn('daemon.config_probation_boot_action', { action: bootEval.action }) + } + /** * Persist the status snapshot to disk and update the gauge. * @param {Partial} [patch] @@ -173,6 +216,7 @@ export async function runDaemon(opts = {}) { mode: 'daemon', runId, env, + configControl, }) /** @type {Map} */ const sourcePluginByName = new Map() @@ -213,6 +257,13 @@ export async function runDaemon(opts = {}) { status.healthyAt = new Date(healthyAtMs).toISOString() } + // Attach apply-time deps before any sink materializes: the central + // sink's pull loop may deliver a document immediately after its + // bootstrap, and `stage()` refuses to run without a validator. The + // watchdog re-arms here on every relaunch that boots mid-probation. 
+ configControl.attachApplyDeps(buildConfigApplyDeps({ stateRoot })) + configControl.armProbationWatchdog() + // ----- Materialize config-backed sinks ----- const sinkResult = await materializeSinks(boot.runtime, boot.config, { stateRoot, @@ -328,10 +379,11 @@ export async function runDaemon(opts = {}) { } // ----- Shutdown ----- - /** @param {'signal'|'manual'} reason */ + /** @param {'signal'|'manual'|'restart'} reason */ async function shutdown(reason) { if (shutdownInFlight) return done shutdownInFlight = true + configControl.disarmProbationWatchdog() if (tickHandle) { clearInterval(tickHandle) tickHandle = null @@ -366,6 +418,7 @@ export async function runDaemon(opts = {}) { for (const snap of status.sources) { snap.state = 'stopped' } + await closeAllSinks({ runtime: boot.runtime, fileLog }) }, { component: 'daemon' } ).catch((err) => { @@ -382,7 +435,8 @@ export async function runDaemon(opts = {}) { if (installSignals) { removeSignalHandlers() } - resolveDone?.(0) + // @ref LLP 0017#staged-restart-for-config-replacement [implements] — the daemon exits and the service manager (or looping invoker) relaunches it + resolveDone?.(reason === 'restart' ? DAEMON_RESTART_EXIT_CODE : 0) return done } triggerShutdown = shutdown @@ -458,6 +512,10 @@ export async function runDaemon(opts = {}) { process.on('SIGHUP', sigHupHandler) } + if (pendingRestart) { + void shutdown('restart') + } + return { done, stop: () => shutdown('manual'), @@ -549,6 +607,23 @@ async function startConfiguredSources({ runtime, log, fileLog, sourcePluginByNam return snapshots } +/** + * Close every materialized sink instance. The central plugin's config + * pull and identity refresh timers stop in its `close()`, so shutdown + * must reach it even though sinks have no started/stopped lifecycle of + * their own. 
+ * + * @param {{ runtime: KernelRuntime, fileLog: ReturnType }} args + */ +async function closeAllSinks({ runtime, fileLog }) { + try { + await runtime.sinks.closeAll() + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + fileLog.error('daemon.sink_close_failed', { message }) + } +} + /** * Stop every started source. Returns the list of names that failed * so the daemon can surface them as warnings on the final status diff --git a/src/core/daemon/status.js b/src/core/daemon/status.js index 8da3a40..5cea50f 100644 --- a/src/core/daemon/status.js +++ b/src/core/daemon/status.js @@ -6,6 +6,7 @@ import path from 'node:path' import process from 'node:process' import { defaultConfigPath, loadConfigFile } from '../config/schema.js' +import { readConfigControlStatus } from '../config/apply.js' import { devTelemetryDir, readObservabilityEnv } from '../observability/env.js' import { diagnoseV1Config, validateConfig } from '../config/validate.js' import { discoverInstalledPlugins } from '../runtime/installed.js' @@ -32,7 +33,7 @@ import { /** * @import { HypAwareV2Config } from '../../../collectivus-plugin-kernel-types.d.ts' - * @import { ConfigValidationError, V1Diagnostic } from '../config/types.d.ts' + * @import { ConfigControlStatus, ConfigValidationError, V1Diagnostic } from '../config/types.d.ts' * @import { ClientAttachReport, CollectStatusOptions, DaemonState, DaemonStatus, HypAwareStatusReport, ServiceState, SinkSnapshot, SourceSnapshot, StatusDiagnostic, StatusDiagnosticKind } from './types.d.ts' * @import { Dirent } from 'node:fs' * @import { PluginCatalog, ClientDescriptor } from '../plugin_catalog.js' @@ -404,6 +405,21 @@ export async function collectHypAwareStatus(opts = {}) { const cacheRoot = opts.runtime?.storage?.cacheRoot ?? 
path.join(stateRoot, 'cache') const cache = await measureCacheStats(cacheRoot) + // ----- remote config apply state (LLP 0023) ----- + /** @type {ConfigControlStatus | null} */ + let remoteConfig = null + try { + remoteConfig = readConfigControlStatus({ stateRoot, configPath }) + } catch { /* best-effort probe */ } + if (remoteConfig?.lastRollback) { + diagnostics.push({ + severity: 'warning', + kind: 'remote_config_rolled_back', + message: `remote config ${remoteConfig.lastRollback.etag} rolled back at ${remoteConfig.lastRollback.at} (${remoteConfig.lastRollback.reason})`, + repair: ['fix the central config revision; the gateway re-applies when the served etag changes'], + }) + } + // ----- recent errors ----- const recentErrorCount = await countRecentErrors(devTelemetryDir(stateRoot)) if (recentErrorCount > 0) { @@ -441,6 +457,7 @@ export async function collectHypAwareStatus(opts = {}) { recentErrorCount, diagnostics, overall, + remoteConfig, } } diff --git a/src/core/daemon/types.d.ts b/src/core/daemon/types.d.ts index 8320084..35d1e80 100644 --- a/src/core/daemon/types.d.ts +++ b/src/core/daemon/types.d.ts @@ -3,7 +3,7 @@ import type { CapabilityRegistry, QueryRegistry, } from '../../../collectivus-plugin-kernel-types.d.ts' -import type { V1Diagnostic, ConfigValidationError } from '../config/types.d.ts' +import type { ConfigControlStatus, V1Diagnostic, ConfigValidationError } from '../config/types.d.ts' import type { ExtendedSourceRegistry } from '../registry/sources.js' import type { ExtendedSinkRegistry } from '../registry/sinks.js' import type { KernelRuntime } from '../runtime/activation.js' @@ -71,6 +71,7 @@ export type StatusDiagnosticKind = | 'daemon_loaded_no_pid' | 'client_attach_missing' | 'recent_errors' + | 'remote_config_rolled_back' /** * Diagnostic surfaced by `hyp status`. 
Carries a severity, the @@ -139,6 +140,13 @@ export interface HypAwareStatusReport { recentErrorCount: number diagnostics: StatusDiagnostic[] overall: 'healthy' | 'degraded' + /** + * Remote-config apply state (LLP 0023): probation, last rollback + + * structured reason, remembered bad etag, and the running config's + * etag. Null only when the probe itself failed; a gateway that has + * never applied a remote config reports all-null fields. + */ + remoteConfig: ConfigControlStatus | null } export interface CollectStatusOptions { diff --git a/src/core/runtime/activation.d.ts b/src/core/runtime/activation.d.ts index 9575670..c5ffea6 100644 --- a/src/core/runtime/activation.d.ts +++ b/src/core/runtime/activation.d.ts @@ -4,6 +4,7 @@ import type { BackfillMaterializerRegistry, BackfillRegistry, CommandRegistry, + ConfigControlFacade, ConfigRegistry, InitPresetRegistry, JsonObject, @@ -43,6 +44,12 @@ export interface KernelRuntime { backfills: BackfillRegistry backfillMaterializers: BackfillMaterializerRegistry activationContexts: Map + /** + * Plugin-facing facade of the daemon's config apply engine. Set only + * when the host process runs one (daemon mode); CLI boots leave it + * undefined so transport plugins skip their pull loops. 
+ */ + configControl?: ConfigControlFacade } export interface CreateKernelRuntimeArgs { @@ -55,6 +62,7 @@ export interface CreateKernelRuntimeArgs { backfillMaterializerRegistry?: BackfillMaterializerRegistry storage?: ExtendedQueryStorageService cacheRoot?: string + configControl?: ConfigControlFacade } export interface CreateActivationContextArgs { diff --git a/src/core/runtime/activation.js b/src/core/runtime/activation.js index 4d3ae16..e8ff044 100644 --- a/src/core/runtime/activation.js +++ b/src/core/runtime/activation.js @@ -15,7 +15,7 @@ import { createQueryStorageService } from '../cache/storage.js' import { isSafeContributionName } from './contribution_names.js' /** - * @import { ActivePlugin, AgentContribution, AgentRegistry, BackfillMaterializerRegistry, BackfillRegistry, CapabilityName, CapabilityRegistry, CommandRegistry, ConfigRegistry, InitPresetContribution, InitPresetRegistry, JsonObject, PermissionContext, PluginActivationContext, PluginLogger, PluginManifest, PluginName, PluginPaths, PluginPermission, QueryRegistry, SemverRange, SemverVersion, SinkRegistry, SkillContribution, SkillRegistry, SourceRegistry } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { ActivePlugin, AgentContribution, AgentRegistry, BackfillMaterializerRegistry, BackfillRegistry, CapabilityName, CapabilityRegistry, CommandRegistry, ConfigControlFacade, ConfigRegistry, InitPresetContribution, InitPresetRegistry, JsonObject, PermissionContext, PluginActivationContext, PluginLogger, PluginManifest, PluginName, PluginPaths, PluginPermission, QueryRegistry, SemverRange, SemverVersion, SinkRegistry, SkillContribution, SkillRegistry, SourceRegistry } from '../../../collectivus-plugin-kernel-types.d.ts' * @import { ExtendedQueryStorageService } from '../cache/types.d.ts' * @import { KernelRuntime } from './activation.d.ts' */ @@ -41,6 +41,7 @@ import { isSafeContributionName } from './contribution_names.js' * backfillMaterializerRegistry?: 
BackfillMaterializerRegistry, * storage?: ExtendedQueryStorageService, * cacheRoot?: string, + * configControl?: ConfigControlFacade, * }} [opts] * @returns {KernelRuntime} * @ref LLP 0003#intrinsic-not-plugin-provided [implements] — query + storage are wired in as intrinsic services, not plugin contributions @@ -53,6 +54,7 @@ export function createKernelRuntime(opts = {}) { getDeclaration: (dataset) => query.getDataset(dataset)?.cachePartitioning, }) return { + ...(opts.configControl ? { configControl: opts.configControl } : {}), capabilities: opts.capabilityRegistry ?? createCapabilityRegistry(), commands: opts.commandRegistry ?? createCommandRegistry(), configRegistry: createConfigRegistry(), @@ -121,6 +123,8 @@ export function createActivationContext({ runtime, plugin, paths, config, env }) initPresets: runtime.initPresets, backfills: runtime.backfills, backfillMaterializers: runtime.backfillMaterializers, + // @ref LLP 0023#apply-engine-is-kernel-surface [implements] — plugins reach the apply engine only through this narrow facade; absent outside the daemon + ...(runtime.configControl ? { configControl: runtime.configControl } : {}), /** * @template T * @param {CapabilityName} name diff --git a/src/core/runtime/boot.js b/src/core/runtime/boot.js index e23d580..aee6524 100644 --- a/src/core/runtime/boot.js +++ b/src/core/runtime/boot.js @@ -77,7 +77,11 @@ export async function bootKernel(opts = {}) { }, async (span) => { const commandRegistry = opts.commandRegistry ?? createCommandRegistry() - const runtime = createKernelRuntime({ commandRegistry, cacheRoot }) + const runtime = createKernelRuntime({ + commandRegistry, + cacheRoot, + ...(opts.configControl ? { configControl: opts.configControl } : {}), + }) const discovered = await discoverBundledPlugins({ workspaceDir: opts.workspaceDir }) span.setAttribute('bundled_available', discovered.loaded.length) @@ -250,10 +254,13 @@ export async function bootKernel(opts = {}) { * 2. `env.HYP_CONFIG` * 3. 
`/hypaware-config.json` * + * Exported so the daemon can resolve the same operative path for the + * config apply engine before `bootKernel` runs. + * * @param {{ explicit?: string, env: NodeJS.ProcessEnv, hypHome: string }} args * @returns {string} */ -function resolveConfigPath({ explicit, env, hypHome }) { +export function resolveConfigPath({ explicit, env, hypHome }) { if (explicit) return path.resolve(explicit) if (env.HYP_CONFIG) return path.resolve(env.HYP_CONFIG) return defaultConfigPath(hypHome) diff --git a/src/core/runtime/types.d.ts b/src/core/runtime/types.d.ts index 3936e42..01f960a 100644 --- a/src/core/runtime/types.d.ts +++ b/src/core/runtime/types.d.ts @@ -1,5 +1,6 @@ import type { ActivePlugin, + ConfigControlFacade, HypAwareV2Config, PluginLockEntry, PluginName, @@ -47,6 +48,8 @@ export interface BootKernelOptions { env?: NodeJS.ProcessEnv /** Override OS temp root (tests). */ tmpRoot?: string + /** Apply-engine facade to expose on activation contexts (daemon only). */ + configControl?: ConfigControlFacade } export interface BootKernelResult { diff --git a/test/core/config-apply.test.js b/test/core/config-apply.test.js new file mode 100644 index 0000000..1f76071 --- /dev/null +++ b/test/core/config-apply.test.js @@ -0,0 +1,421 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs' +import fsp from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' + +import { + DEFAULT_POLL_INTERVAL_SECONDS, + MAX_CONFIG_DOCUMENT_BYTES, + PROBATION_FLOOR_SECONDS, + createConfigControl, + readConfigControlStatus, +} from '../../src/core/config/apply.js' +import { parseConfigShape } from '../../src/core/config/schema.js' + +/** + * @import { ConfigApplyDeps } from '../../src/core/config/types.d.ts' + */ + +const SEED_CONFIG = { + version: 2, + plugins: [{ name: '@hypaware/central' }], + sinks: { + central: { + plugin: '@hypaware/central', + config: { url: 
'https://central.example', identity: { bootstrap_token: 'tok' } }, + }, + }, +} + +const REMOTE_CONFIG = { + version: 2, + plugins: [{ name: '@hypaware/central' }, { name: '@hypaware/otel' }], + sinks: { + central: { + plugin: '@hypaware/central', + config: { url: 'https://central.example', identity: {} }, + }, + }, +} + +async function makeFixture() { + const tmp = await fsp.mkdtemp(path.join(os.tmpdir(), 'hyp-config-apply-')) + const stateRoot = path.join(tmp, 'hypaware') + await fsp.mkdir(stateRoot, { recursive: true }) + const configPath = path.join(tmp, 'hypaware-config.json') + await fsp.writeFile(configPath, JSON.stringify(SEED_CONFIG, null, 2) + '\n') + return { tmp, stateRoot, configPath } +} + +/** + * @param {{ validateOk?: boolean, installResult?: import('../../src/core/config/types.d.ts').PinnedInstallResult }} [opts] + * @returns {ConfigApplyDeps & { validateCalls: number, installCalls: number }} + */ +function makeDeps(opts = {}) { + const deps = { + validateCalls: 0, + installCalls: 0, + /** @param {unknown} _document */ + async validateDocument(_document) { + deps.validateCalls += 1 + return opts.validateOk === false + ? { ok: false, errors: [{ pointer: '/plugins/0', message: 'nope' }] } + : { ok: true, errors: [] } + }, + /** @param {import('../../../collectivus-plugin-kernel-types.d.ts').PluginConfigInstance[]} _entries */ + async installPinnedPlugins(_entries) { + deps.installCalls += 1 + return opts.installResult ?? { ok: true } + }, + } + return deps +} + +/** + * @param {{ stateRoot: string, configPath: string, now?: () => number }} args + */ +function makeControl({ stateRoot, configPath, now }) { + /** @type {string[]} */ + const restarts = [] + const control = createConfigControl({ + stateRoot, + configPath, + requestRestart: (reason) => { restarts.push(reason) }, + ...(now ? 
{ now } : {}), + }) + return { control, restarts } +} + +test('stage applies a document: slot persisted, pointer flipped, etag staged, probation armed, restart requested', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + + const result = await control.stage(REMOTE_CONFIG, 'etag-1') + assert.deepEqual(result, { ok: true, action: 'applied' }) + assert.deepEqual(restarts, ['config_applied']) + + // Operative config is now a symlink whose content is the new doc. + const stat = await fsp.lstat(configPath) + assert.ok(stat.isSymbolicLink()) + const operative = JSON.parse(await fsp.readFile(configPath, 'utf8')) + assert.deepEqual(operative.plugins, REMOTE_CONFIG.plugins) + + // The seed was preserved as the rollback target. + const slotA = JSON.parse( + await fsp.readFile(path.join(stateRoot, 'config-control', 'config.a.json'), 'utf8') + ) + assert.deepEqual(slotA, SEED_CONFIG) + + assert.equal(control.runningEtag(), 'etag-1') + const status = await control.status() + assert.equal(status.probation?.etag, 'etag-1') + assert.equal(status.probation?.slot, 'b') + assert.equal(status.probation?.previous_slot, 'a') +}) + +test('probation window is max(3 × poll interval, floor) from the staged document', async () => { + const { stateRoot, configPath } = await makeFixture() + const t0 = Date.parse('2026-06-12T00:00:00.000Z') + const { control } = makeControl({ stateRoot, configPath, now: () => t0 }) + control.attachApplyDeps(makeDeps()) + + // No poll_interval_seconds in the doc → default cadence. + await control.stage(REMOTE_CONFIG, 'etag-1') + let status = await control.status() + assert.equal( + Date.parse(/** @type {string} */ (status.probation?.until)) - t0, + 3 * DEFAULT_POLL_INTERVAL_SECONDS * 1000 + ) + + // A fast cadence is floored. Fresh engine: the first stage left a + // restart pending in the old one. 
+ const relaunch = makeControl({ stateRoot, configPath, now: () => t0 }) + relaunch.control.attachApplyDeps(makeDeps()) + relaunch.control.confirmPoll() + const fastDoc = { + ...REMOTE_CONFIG, + sinks: { + central: { + plugin: '@hypaware/central', + config: { url: 'https://central.example', identity: {}, poll_interval_seconds: 5 }, + }, + }, + } + await relaunch.control.stage(fastDoc, 'etag-2') + status = await relaunch.control.status() + assert.equal( + Date.parse(/** @type {string} */ (status.probation?.until)) - t0, + PROBATION_FLOOR_SECONDS * 1000 + ) +}) + +test('stage before attachApplyDeps fails closed', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + const result = await control.stage(REMOTE_CONFIG, 'etag-1') + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'apply_engine_not_ready') + assert.deepEqual(restarts, []) +}) + +test('validation failure remembers the bad etag and leaves the config untouched', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps({ validateOk: false })) + + const result = await control.stage(REMOTE_CONFIG, 'etag-bad') + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'config_invalid') + assert.deepEqual(restarts, []) + + // Still the seed, still a regular file. 
+ const stat = await fsp.lstat(configPath) + assert.ok(!stat.isSymbolicLink()) + const status = await control.status() + assert.equal(status.badEtag?.etag, 'etag-bad') + assert.equal(status.badEtag?.reason, 'validation_failed') + assert.equal(status.runningEtag, null) +}) + +test('a remembered bad etag backs off re-apply until the etag changes', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps({ validateOk: false }) + control.attachApplyDeps(deps) + + await control.stage(REMOTE_CONFIG, 'etag-bad') + assert.equal(deps.validateCalls, 1) + + // Same etag again: skipped without re-validating. + const skipped = await control.stage(REMOTE_CONFIG, 'etag-bad') + assert.deepEqual(skipped, { ok: true, action: 'skipped_bad_etag' }) + assert.equal(deps.validateCalls, 1) + + // A different etag proceeds (and fails validation again here). + const retried = await control.stage(REMOTE_CONFIG, 'etag-fixed') + assert.equal(retried.ok, false) + assert.equal(deps.validateCalls, 2) +}) + +test('pinned install hash mismatch is an apply failure with a structured reason', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps({ + installResult: { ok: false, errorKind: 'artifact_hash_mismatch', message: 'hash differs' }, + })) + + const result = await control.stage(REMOTE_CONFIG, 'etag-hash') + assert.equal(!result.ok && result.errorKind, 'artifact_hash_mismatch') + assert.deepEqual(restarts, []) + const status = await control.status() + assert.equal(status.badEtag?.reason, 'artifact_hash_mismatch') +}) + +test('oversized documents are rejected before validation', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps() + control.attachApplyDeps(deps) + + const huge = 
{ ...REMOTE_CONFIG, padding: 'x'.repeat(MAX_CONFIG_DOCUMENT_BYTES) } + const result = await control.stage(huge, 'etag-huge') + assert.equal(!result.ok && result.errorKind, 'document_too_large') + assert.equal(deps.validateCalls, 0) +}) + +test('staging the running etag is a no-op', async () => { + const { stateRoot, configPath } = await makeFixture() + const first = makeControl({ stateRoot, configPath }) + first.control.attachApplyDeps(makeDeps()) + await first.control.stage(REMOTE_CONFIG, 'etag-1') + + // Relaunch: a fresh engine over the same state. + const second = makeControl({ stateRoot, configPath }) + second.control.attachApplyDeps(makeDeps()) + const result = await second.control.stage(REMOTE_CONFIG, 'etag-1') + assert.deepEqual(result, { ok: true, action: 'noop_same_etag' }) +}) + +test('a second stage in the same process is refused while a restart is pending', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + const result = await control.stage(REMOTE_CONFIG, 'etag-2') + assert.equal(!result.ok && result.errorKind, 'restart_pending') +}) + +test('confirmPoll clears probation', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + control.confirmPoll() + const status = await control.status() + assert.equal(status.probation, null) + assert.equal(status.runningEtag, 'etag-1') + // Idempotent. 
+ control.confirmPoll() +}) + +test('chained applies alternate slots and roll back one revision', async () => { + const { stateRoot, configPath } = await makeFixture() + + const first = makeControl({ stateRoot, configPath }) + first.control.attachApplyDeps(makeDeps()) + await first.control.stage(REMOTE_CONFIG, 'etag-1') + + // Relaunch, probation clears, a newer revision arrives. + const second = makeControl({ stateRoot, configPath }) + second.control.attachApplyDeps(makeDeps()) + second.control.confirmPoll() + const doc2 = { ...REMOTE_CONFIG, plugins: [{ name: '@hypaware/central' }] } + await second.control.stage(doc2, 'etag-2') + assert.equal(second.control.runningEtag(), 'etag-2') + const status = await second.control.status() + assert.equal(status.probation?.slot, 'a') + assert.equal(status.probation?.previous_slot, 'b') + + // Expired probation at the next boot rolls back to etag-1, not the seed. + const future = Date.now() + 10 * 24 * 60 * 60 * 1000 + const third = makeControl({ stateRoot, configPath, now: () => future }) + const evaluated = await third.control.evaluateAtBoot() + assert.equal(evaluated.action, 'rolled_back') + assert.equal(third.control.runningEtag(), 'etag-1') + const rolled = await third.control.status() + assert.equal(rolled.lastRollback?.etag, 'etag-2') + assert.equal(rolled.lastRollback?.reason, 'probation_expired') + assert.equal(rolled.badEtag?.etag, 'etag-2') +}) + +test('evaluateAtBoot rolls an expired first apply back onto the seed', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const future = Date.now() + 10 * 24 * 60 * 60 * 1000 + const relaunch = makeControl({ stateRoot, configPath, now: () => future }) + const evaluated = await relaunch.control.evaluateAtBoot() + assert.equal(evaluated.action, 'rolled_back') + + const operative = JSON.parse(await 
fsp.readFile(configPath, 'utf8')) + assert.deepEqual(operative, SEED_CONFIG) + assert.equal(relaunch.control.runningEtag(), undefined) +}) + +test('evaluateAtBoot keeps an unexpired probation marker', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const relaunch = makeControl({ stateRoot, configPath }) + const evaluated = await relaunch.control.evaluateAtBoot() + assert.equal(evaluated.action, 'none') + const status = await relaunch.control.status() + assert.equal(status.probation?.etag, 'etag-1') +}) + +test('evaluateAtBoot discards a probation marker whose flip never committed', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + // Simulate a crash between the marker write and the pointer flip by + // pointing the marker at the slot that is NOT active. + const statePath = path.join(stateRoot, 'config-control', 'state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf8')) + state.probation.slot = 'a' + fs.writeFileSync(statePath, JSON.stringify(state)) + + const relaunch = makeControl({ stateRoot, configPath }) + const evaluated = await relaunch.control.evaluateAtBoot() + assert.equal(evaluated.action, 'cleared_orphan') + const status = await relaunch.control.status() + assert.equal(status.probation, null) + // The operative config is untouched by orphan cleanup. 
+ assert.equal(relaunch.control.runningEtag(), 'etag-1') +}) + +test('the probation watchdog rolls back and requests a restart on expiry', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + assert.deepEqual(restarts, ['config_applied']) + + // Shrink the live marker's window so the real timer fires fast. + const statePath = path.join(stateRoot, 'config-control', 'state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf8')) + state.probation.until = new Date(Date.now() + 20).toISOString() + fs.writeFileSync(statePath, JSON.stringify(state)) + + control.armProbationWatchdog() + await new Promise((resolve) => setTimeout(resolve, 100)) + + assert.deepEqual(restarts, ['config_applied', 'probation_expired']) + const status = await control.status() + assert.equal(status.lastRollback?.reason, 'probation_expired') + assert.equal(status.runningEtag, null) + const operative = JSON.parse(await fsp.readFile(configPath, 'utf8')) + assert.deepEqual(operative, SEED_CONFIG) +}) + +test('a confirmed poll disarms the watchdog before it fires', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const statePath = path.join(stateRoot, 'config-control', 'state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf8')) + state.probation.until = new Date(Date.now() + 30).toISOString() + fs.writeFileSync(statePath, JSON.stringify(state)) + + control.armProbationWatchdog() + control.confirmPoll() + await new Promise((resolve) => setTimeout(resolve, 100)) + + assert.deepEqual(restarts, ['config_applied']) + assert.equal(control.runningEtag(), 'etag-1') +}) + +test('readConfigControlStatus reads without an engine and 
tolerates a fresh install', async () => { + const { stateRoot, configPath } = await makeFixture() + const empty = readConfigControlStatus({ stateRoot, configPath }) + assert.deepEqual(empty, { probation: null, lastRollback: null, badEtag: null, runningEtag: null }) + + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const status = readConfigControlStatus({ stateRoot, configPath }) + assert.equal(status.runningEtag, 'etag-1') + assert.equal(status.probation?.etag, 'etag-1') +}) + +test('parseConfigShape accepts and validates plugin pin fields', () => { + const ok = parseConfigShape({ + version: 2, + plugins: [{ name: '@x/y', version: '1.2.3', artifact_hash: 'abc123', source: 'github:x/y' }], + }) + assert.ok(ok.ok) + assert.equal(ok.ok && ok.config.plugins?.[0].version, '1.2.3') + assert.equal(ok.ok && ok.config.plugins?.[0].artifact_hash, 'abc123') + assert.equal(ok.ok && ok.config.plugins?.[0].source, 'github:x/y') + + const bad = parseConfigShape({ + version: 2, + plugins: [{ name: '@x/y', version: 7 }], + }) + assert.ok(!bad.ok) + assert.ok(!bad.ok && bad.errors.some((e) => e.pointer === '/plugins/0/version')) +}) diff --git a/test/core/daemon.test.js b/test/core/daemon.test.js index 38814d9..dd4ff7c 100644 --- a/test/core/daemon.test.js +++ b/test/core/daemon.test.js @@ -173,6 +173,32 @@ test('renderDaemonInstall renders a deterministic LaunchAgent dry-run payload', ]) }) +test('installers default to relaunch-on-exit (staged restart requirement, LLP 0017)', () => { + // Defaults — no keepAlive/restart override. The service manager MUST + // relaunch the daemon after a staged config-apply exit. 
+ const launchd = renderDaemonInstall({ + platform: 'darwin', + homeDir: '/Users/hyp', + binPath: '/opt/hypaware/bin/hypaware.js', + nodePath: '/usr/local/bin/node', + }) + assert.match(launchd.content, /KeepAlive<\/key>\n /) + + const systemd = renderDaemonInstall({ + platform: 'linux', + homeDir: '/home/hyp', + binPath: '/opt/hypaware/bin/hypaware.js', + nodePath: '/usr/local/bin/node', + }) + assert.match(systemd.content, /^Restart=always$/m) +}) + +test('the staged-restart exit code is distinct from success and error exits', async () => { + const { DAEMON_RESTART_EXIT_CODE } = await import('../../src/core/daemon/runtime.js') + assert.equal(typeof DAEMON_RESTART_EXIT_CODE, 'number') + assert.ok(DAEMON_RESTART_EXIT_CODE !== 0 && DAEMON_RESTART_EXIT_CODE !== 1 && DAEMON_RESTART_EXIT_CODE !== 2) +}) + test('runDaemon reload refreshes plugin config before source.reload', async () => { const hypHome = await fs.mkdtemp(path.join(os.tmpdir(), 'hypaware-daemon-reload-config-')) let handle diff --git a/test/core/join-command.test.js b/test/core/join-command.test.js new file mode 100644 index 0000000..44bf23d --- /dev/null +++ b/test/core/join-command.test.js @@ -0,0 +1,131 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' + +import { dispatch } from '../../src/core/cli/dispatch.js' + +function makeBuf() { + let value = '' + return { + write(chunk) { + value += String(chunk) + return true + }, + text() { + return value + }, + } +} + +/** @param {string} [stdinText] */ +async function makeDispatchOpts(stdinText) { + const hypHome = await fs.mkdtemp(path.join(os.tmpdir(), 'hyp-join-test-')) + const stdout = makeBuf() + const stderr = makeBuf() + /** @type {any} */ + let stdin + if (stdinText !== undefined) { + stdin = { + isTTY: false, + async *[Symbol.asyncIterator]() { yield stdinText }, + } + } else { + stdin = { isTTY: true } + } + return { 
+ hypHome, + stdout, + stderr, + opts: { stdout, stderr, stdin, env: { ...process.env, HYP_HOME: hypHome, HYP_CONFIG: '' } }, + } +} + +test('join writes a seed config (mode 0600) and skips daemon install with --no-daemon', async () => { + const { hypHome, stdout, opts } = await makeDispatchOpts() + const code = await dispatch( + ['join', 'https://central.example', 'policy-token-1', '--no-daemon'], + opts + ) + assert.equal(code, 0, stdout.text()) + + const configPath = path.join(hypHome, 'hypaware-config.json') + const stat = await fs.stat(configPath) + assert.equal(stat.mode & 0o777, 0o600) + + const seed = JSON.parse(await fs.readFile(configPath, 'utf8')) + assert.equal(seed.version, 2) + assert.deepEqual(seed.plugins, [{ name: '@hypaware/central' }]) + assert.equal(seed.sinks.central.plugin, '@hypaware/central') + assert.equal(seed.sinks.central.config.url, 'https://central.example') + assert.equal(seed.sinks.central.config.identity.bootstrap_token, 'policy-token-1') + assert.match(stdout.text(), /daemon install skipped/) +}) + +test('join reads the token from --token-file', async () => { + const { hypHome, opts } = await makeDispatchOpts() + const tokenFile = path.join(hypHome, 'token.txt') + await fs.writeFile(tokenFile, 'file-token\n') + + const code = await dispatch( + ['join', 'https://central.example', '--token-file', tokenFile, '--no-daemon'], + opts + ) + assert.equal(code, 0) + const seed = JSON.parse( + await fs.readFile(path.join(hypHome, 'hypaware-config.json'), 'utf8') + ) + assert.equal(seed.sinks.central.config.identity.bootstrap_token, 'file-token') +}) + +test('join reads the token from stdin when piped', async () => { + const { hypHome, opts } = await makeDispatchOpts('stdin-token\n') + const code = await dispatch(['join', 'https://central.example', '--no-daemon'], opts) + assert.equal(code, 0) + const seed = JSON.parse( + await fs.readFile(path.join(hypHome, 'hypaware-config.json'), 'utf8') + ) + 
assert.equal(seed.sinks.central.config.identity.bootstrap_token, 'stdin-token') +}) + +test('join rejects missing url, bad url, missing token, and conflicting token sources', async () => { + { + const { stderr, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join'], opts), 2) + assert.match(stderr.text(), /missing /) + } + { + const { stderr, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join', 'ftp://x', 'tok', '--no-daemon'], opts), 2) + assert.match(stderr.text(), /http\(s\)/) + } + { + // TTY stdin and no token anywhere. + const { stderr, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join', 'https://central.example', '--no-daemon'], opts), 2) + assert.match(stderr.text(), /no token given/) + } + { + const { hypHome, stderr, opts } = await makeDispatchOpts() + const tokenFile = path.join(hypHome, 'token.txt') + await fs.writeFile(tokenFile, 'x') + assert.equal( + await dispatch( + ['join', 'https://central.example', 'tok', '--token-file', tokenFile, '--no-daemon'], + opts + ), + 2 + ) + assert.match(stderr.text(), /not both/) + } +}) + +test('join help exits 0 and documents token sources', async () => { + const { stdout, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join', '--help'], opts), 0) + assert.match(stdout.text(), /--token-file/) + assert.match(stdout.text(), /stdin/) +}) diff --git a/test/plugins/central-config-pull.test.js b/test/plugins/central-config-pull.test.js new file mode 100644 index 0000000..a3036cb --- /dev/null +++ b/test/plugins/central-config-pull.test.js @@ -0,0 +1,233 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' + +import { + MAX_CONFIG_DOCUMENT_BYTES, + createConfigPullLoop, +} from '../../hypaware-core/plugins-workspace/central/src/config_client.js' + +function makeLog() { + /** @type {Array<{ level: string, message: string, fields: Record }>} */ + const rows = [] + /** @param {string} level */ + const emit = 
(level) => + /** @param {string} message @param {Record} [fields] */ + (message, fields) => { rows.push({ level, message, fields: fields ?? {} }) } + return { + rows, + debug: emit('debug'), + info: emit('info'), + warn: emit('warn'), + error: emit('error'), + } +} + +/** @param {{ runningEtag?: string }} [opts] */ +function makeControl(opts = {}) { + /** @type {Array<{ document: unknown, etag: string }>} */ + const staged = [] + let confirms = 0 + return { + staged, + get confirms() { return confirms }, + /** @param {unknown} document @param {string} etag */ + async stage(document, etag) { + staged.push({ document, etag }) + return /** @type {const} */ ({ ok: true, action: 'applied' }) + }, + confirmPoll() { confirms += 1 }, + runningEtag() { return opts.runningEtag }, + } +} + +/** @param {Array<{ status: number, headers?: Record, body?: string }>} responses */ +function makeFetch(responses) { + /** @type {Array<{ url: string, headers: Record }>} */ + const requests = [] + /** @type {typeof fetch} */ + const fetchFn = async (url, init) => { + requests.push({ + url: String(url), + headers: /** @type {Record} */ (init?.headers ?? {}), + }) + const next = responses.shift() ?? { status: 500 } + const headers = new Headers(next.headers ?? {}) + return /** @type {Response} */ (/** @type {unknown} */ ({ + status: next.status, + ok: next.status >= 200 && next.status < 300, + headers, + async text() { return next.body ?? 
'' }, + })) + } + return { fetchFn, requests } +} + +function makeIdentity() { + let refreshes = 0 + return { + get refreshes() { return refreshes }, + async getCurrentJwt() { return 'jwt-1' }, + async refresh() { refreshes += 1 }, + } +} + +/** + * @param {object} overrides + */ +function makeLoop(overrides) { + const log = makeLog() + const args = /** @type {any} */ ({ + centralUrl: 'https://central.example', + identityClient: makeIdentity(), + pollIntervalSeconds: 3600, + log, + ...overrides, + }) + return { loop: createConfigPullLoop(args), log } +} + +test('start pulls immediately; a 200 confirms the poll and stages the document with its etag', async () => { + const control = makeControl() + const { fetchFn, requests } = makeFetch([ + { status: 200, headers: { etag: 'rev-1' }, body: JSON.stringify({ version: 2 }) }, + ]) + const { loop } = makeLoop({ configControl: control, fetchFn }) + + loop.start() + await loop.stop() + + assert.equal(requests.length, 1) + assert.ok(requests[0].url.endsWith('/v1/config')) + assert.equal(requests[0].headers.authorization, 'Bearer jwt-1') + // No running config etag → no If-None-Match (first 200 must happen). 
+ assert.equal('if-none-match' in requests[0].headers, false) + assert.equal(control.confirms, 1) + assert.deepEqual(control.staged, [{ document: { version: 2 }, etag: 'rev-1' }]) +}) + +test('If-None-Match always presents the running config etag', async () => { + const control = makeControl({ runningEtag: 'rev-current' }) + const { fetchFn, requests } = makeFetch([{ status: 304 }]) + const { loop } = makeLoop({ configControl: control, fetchFn }) + + loop.start() + await loop.stop() + + assert.equal(requests[0].headers['if-none-match'], 'rev-current') + assert.equal(control.confirms, 1) + assert.deepEqual(control.staged, []) +}) + +test('401 refreshes the JWT and retries once; a second 401 escalates without staging', async () => { + const control = makeControl() + const identityClient = makeIdentity() + const ok = makeFetch([ + { status: 401 }, + { status: 304 }, + ]) + const { loop } = makeLoop({ configControl: control, identityClient, fetchFn: ok.fetchFn }) + loop.start() + await loop.stop() + assert.equal(identityClient.refreshes, 1) + assert.equal(ok.requests.length, 2) + assert.equal(control.confirms, 1) + + const identity2 = makeIdentity() + const bad = makeFetch([{ status: 401 }, { status: 401 }]) + const second = makeLoop({ configControl: control, identityClient: identity2, fetchFn: bad.fetchFn }) + second.loop.start() + await second.loop.stop() + assert.equal(identity2.refreshes, 1) + assert.equal(control.confirms, 1) + assert.deepEqual(control.staged, []) +}) + +test('a 200 without an etag header is dropped, not staged', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([ + { status: 200, body: JSON.stringify({ version: 2 }) }, + ]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.equal(control.confirms, 0) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_missing_etag')) +}) + +test('an oversized 200 body is 
dropped before parsing', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([ + { status: 200, headers: { etag: 'rev-big' }, body: 'x'.repeat(MAX_CONFIG_DOCUMENT_BYTES + 1) }, + ]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_document_too_large')) +}) + +test('invalid JSON in a 200 body is dropped', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([ + { status: 200, headers: { etag: 'rev-1' }, body: '{nope' }, + ]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_invalid_json')) +}) + +test('404 takes the legacy backoff branch without confirming probation', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([{ status: 404 }]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(control.confirms, 0) + assert.ok(log.rows.some((r) => r.fields.hyp_reason === 'no_config_registered_legacy')) +}) + +test('the steady timer keeps polling on the configured cadence', async () => { + const control = makeControl() + const { fetchFn, requests } = makeFetch([ + { status: 304 }, { status: 304 }, { status: 304 }, { status: 304 }, + ]) + // Sub-second cadence is rejected by config validation but accepted + // by the loop itself — that's what makes this test fast. 
+ const { loop } = makeLoop({ configControl: control, fetchFn, pollIntervalSeconds: 0.02 }) + loop.start() + await new Promise((resolve) => setTimeout(resolve, 120)) + await loop.stop() + assert.ok(requests.length >= 2, `expected repeat polls, saw ${requests.length}`) + assert.ok(control.confirms >= 2, `expected repeat confirms, saw ${control.confirms}`) +}) + +test('stop prevents any further polls', async () => { + const control = makeControl() + const { fetchFn, requests } = makeFetch([{ status: 304 }, { status: 304 }]) + const { loop } = makeLoop({ configControl: control, fetchFn, pollIntervalSeconds: 0.01 }) + loop.start() + await loop.stop() + const seen = requests.length + await new Promise((resolve) => setTimeout(resolve, 60)) + assert.equal(requests.length, seen) +}) + +test('transport errors back off and keep the loop alive', async () => { + const control = makeControl() + let calls = 0 + /** @type {typeof fetch} */ + const fetchFn = async () => { + calls += 1 + throw new Error('connection refused') + } + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(calls, 1) + assert.ok(log.rows.some((r) => r.message === 'central.config.poll_failed')) +}) From ed28cbf3c90e6f9585fc9d075422f2ce5cdfa0f5 Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 13:44:58 -0700 Subject: [PATCH 4/7] Flip LLP 0023 to Active Co-Authored-By: Claude Fable 5 --- llp/0023-remote-config-join-flow.spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llp/0023-remote-config-join-flow.spec.md b/llp/0023-remote-config-join-flow.spec.md index 56b4b10..08d0202 100644 --- a/llp/0023-remote-config-join-flow.spec.md +++ b/llp/0023-remote-config-join-flow.spec.md @@ -1,7 +1,7 @@ # LLP 0023: Remote Config and Join Flow **Type:** Spec -**Status:** Draft +**Status:** Active **Systems:** Config, Sinks, Plugins **Author:** Phil / Claude **Date:** 2026-06-12 From 
17665e7cd3f115789a39972c566d4435e4e7582f Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 14:27:34 -0700 Subject: [PATCH 5/7] Address dual-review findings on remote config (PR #98) Apply engine (LLP 0023): - Reorder stage() to shape-check -> install pinned plugins -> full validation, so a served config can name a not-yet-installed plugin (catalog-backed validation only knows a plugin once it is installed). LLP 0023 install-on-config section updated with the ordering rationale. Config pull transport: - Enforce the 1 MiB cap before buffering: oversized Content-Length is rejected without reading, chunked bodies stream through a byte counter that cancels at the cap. - Per-poll AbortController with a 30s request deadline covering request and body read; stop() aborts an in-flight poll after a 1s drain grace so a stalled config GET cannot wedge daemon shutdown. Tests: - New test/core/config-apply-deps.test.js covering the real pin enforcement: bundled version mismatch, bundled hash exemption, satisfied-lock skip, artifact hash mismatch via a local git fixture, and install-then-validate over a fresh catalog. - Pull-loop tests for Content-Length pre-reject, chunked cap cancel, stop() abort, request deadline, 429/503 Retry-After, and parseRetryAfter (now exported). - Engine ordering test (install before validate) and shape-gate test. Typecheck and style: - Fix broken inline type-import path and literal exit-code comparisons that broke npm run typecheck; replace remaining inline import() types with top-of-file @import blocks. - Reword closeAllSinks JSDoc: identity refresh is lazy and has no timer. 
Co-Authored-By: Claude Fable 5 --- .../central/src/config_client.js | 208 +++++++++++-- .../smoke/flows/join_flow_remote_config.js | 6 +- llp/0023-remote-config-join-flow.spec.md | 31 +- src/core/config/apply.js | 40 ++- src/core/daemon/runtime.js | 6 +- test/core/config-apply-deps.test.js | 293 ++++++++++++++++++ test/core/config-apply.test.js | 41 ++- test/core/daemon.test.js | 4 +- test/plugins/central-config-pull.test.js | 153 ++++++++- 9 files changed, 718 insertions(+), 64 deletions(-) create mode 100644 test/core/config-apply-deps.test.js diff --git a/hypaware-core/plugins-workspace/central/src/config_client.js b/hypaware-core/plugins-workspace/central/src/config_client.js index da5d5a4..bb23c0c 100644 --- a/hypaware-core/plugins-workspace/central/src/config_client.js +++ b/hypaware-core/plugins-workspace/central/src/config_client.js @@ -15,10 +15,28 @@ export const DEFAULT_POLL_INTERVAL_SECONDS = 300 /** * Transport-level cap on a pulled config body. Mirrors the kernel's * `MAX_CONFIG_DOCUMENT_BYTES` — the apply engine enforces it again, - * but an oversized body should be dropped before it is buffered whole. + * but an oversized body is dropped before it is buffered whole: an + * oversized `Content-Length` is rejected without reading, and a + * chunked body is read through a byte counter that cancels the stream + * the moment it crosses the cap. */ export const MAX_CONFIG_DOCUMENT_BYTES = 1024 * 1024 +/** + * Hard deadline (seconds) on a single poll, covering the request and + * the body read. Bounds how long `stop()` can wait on an in-flight + * poll even when the caller's `fetchFn` ignores abort signals. + */ +export const DEFAULT_REQUEST_TIMEOUT_SECONDS = 30 + +/** + * How long `stop()` lets an in-flight poll drain before aborting it + * (seconds). A healthy poll finishes in this window — a mid-flight + * apply should commit rather than be cancelled — while a stalled + * request is cut off so shutdown stays prompt. 
+ */ +export const DEFAULT_STOP_GRACE_SECONDS = 1 + /** Linear backoff ladder (seconds) for 429/503/transport failures, per proto.md. */ const RETRY_BACKOFF_SECONDS = [30, 60, 120, 300] @@ -44,11 +62,19 @@ const LEGACY_404_BACKOFF_SECONDS = 300 * and that polling idle state is a legitimate steady state, not an * exit condition. * + * Every poll runs under its own `AbortController` with a hard + * deadline: a stalled config GET must not be able to wedge `stop()` — + * and through it daemon shutdown or a staged restart — so a poll that + * outlives the deadline is aborted, and `stop()` aborts an in-flight + * poll after a short drain grace. + * * @param {{ * centralUrl: string, * identityClient: IdentityClient, * configControl: ConfigControlFacade, * pollIntervalSeconds?: number, + * requestTimeoutSeconds?: number, + * stopGraceSeconds?: number, * log: PluginLogger, * fetchFn?: typeof fetch, * }} args @@ -58,6 +84,8 @@ export function createConfigPullLoop(args) { const { centralUrl, identityClient, configControl, log } = args const fetchFn = args.fetchFn ?? fetch const pollIntervalSeconds = args.pollIntervalSeconds ?? DEFAULT_POLL_INTERVAL_SECONDS + const requestTimeoutSeconds = args.requestTimeoutSeconds ?? DEFAULT_REQUEST_TIMEOUT_SECONDS + const stopGraceSeconds = args.stopGraceSeconds ?? 
DEFAULT_STOP_GRACE_SECONDS /** @type {NodeJS.Timeout | null} */ let timer = null @@ -65,6 +93,8 @@ export function createConfigPullLoop(args) { let consecutiveFailures = 0 /** @type {Promise | null} */ let inFlight = null + /** @type {AbortController | null} */ + let activeController = null /** @param {number} delaySeconds */ function schedule(delaySeconds) { @@ -77,9 +107,16 @@ export function createConfigPullLoop(args) { /** @returns {Promise} */ async function pollOnce() { + const controller = new AbortController() + activeController = controller + const deadline = setTimeout( + () => controller.abort(new Error(`config poll exceeded ${requestTimeoutSeconds}s`)), + requestTimeoutSeconds * 1000 + ) + if (typeof deadline.unref === 'function') deadline.unref() let nextDelay = pollIntervalSeconds try { - const outcome = await pull() + const outcome = await pull(controller.signal) if (outcome === 'retry_backoff') { nextDelay = RETRY_BACKOFF_SECONDS[ Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 @@ -90,16 +127,22 @@ export function createConfigPullLoop(args) { nextDelay = outcome } } catch (err) { - consecutiveFailures += 1 - const message = err instanceof Error ? err.message : String(err) - log.warn('central.config.poll_failed', { - error_kind: 'config_poll_error', - consecutive_failures: consecutiveFailures, - message, - }) - nextDelay = RETRY_BACKOFF_SECONDS[ - Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 - ] + // An abort from stop() is the shutdown path, not a poll failure. + if (!(stopped && controller.signal.aborted)) { + consecutiveFailures += 1 + const message = err instanceof Error ? 
err.message : String(err) + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_error', + consecutive_failures: consecutiveFailures, + message, + }) + nextDelay = RETRY_BACKOFF_SECONDS[ + Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 + ] + } + } finally { + clearTimeout(deadline) + activeController = null } schedule(nextDelay) } @@ -108,18 +151,19 @@ export function createConfigPullLoop(args) { * One poll. Returns `'ok'`, `'retry_backoff'`, `'legacy_404'`, or an * explicit next-delay in seconds (server-provided `Retry-After`). * + * @param {AbortSignal} signal * @returns {Promise<'ok' | 'retry_backoff' | 'legacy_404' | number>} */ - async function pull() { + async function pull(signal) { const url = joinUrl(centralUrl, '/v1/config') const runningEtag = configControl.runningEtag() - let response = await doFetch(url, runningEtag) + let response = await doFetch(url, runningEtag, signal) if (response.status === 401) { // One-shot refresh + retry; a second 401 escalates as an auth // failure (proto.md "Refresh window"). 
await identityClient.refresh() - response = await doFetch(url, runningEtag) + response = await doFetch(url, runningEtag, signal) if (response.status === 401) { consecutiveFailures += 1 log.error('central.config.poll_failed', { @@ -143,16 +187,17 @@ export function createConfigPullLoop(args) { if (response.status === 200) { const etag = response.headers.get('etag') - const body = await response.text() - if (Buffer.byteLength(body, 'utf8') > MAX_CONFIG_DOCUMENT_BYTES) { + const read = await readBodyCapped(response, MAX_CONFIG_DOCUMENT_BYTES, signal) + if (!read.ok) { consecutiveFailures += 1 log.error('central.config.poll_failed', { error_kind: 'config_document_too_large', http_status: 200, - body_bytes: Buffer.byteLength(body, 'utf8'), + body_bytes: read.bytesRead, }) return 'retry_backoff' } + const body = read.body if (!etag) { consecutiveFailures += 1 log.error('central.config.poll_failed', { @@ -228,19 +273,24 @@ export function createConfigPullLoop(args) { /** * @param {string} url * @param {string | undefined} runningEtag + * @param {AbortSignal} signal */ - async function doFetch(url, runningEtag) { + async function doFetch(url, runningEtag, signal) { const jwt = await identityClient.getCurrentJwt() - return fetchFn(url, { - method: 'GET', - headers: { - authorization: `Bearer ${jwt}`, - // If-None-Match always reflects the *running* config — the - // server reads it as the fleet-convergence signal, so a - // gateway mid-apply keeps presenting its old etag. - ...(runningEtag ? { 'if-none-match': runningEtag } : {}), - }, - }) + return abortable( + fetchFn(url, { + method: 'GET', + signal, + headers: { + authorization: `Bearer ${jwt}`, + // If-None-Match always reflects the *running* config — the + // server reads it as the fleet-convergence signal, so a + // gateway mid-apply keeps presenting its old etag. + ...(runningEtag ? 
{ 'if-none-match': runningEtag } : {}), + }, + }), + signal + ) } return { @@ -249,23 +299,117 @@ export function createConfigPullLoop(args) { if (stopped || timer || inFlight) return inFlight = pollOnce().finally(() => { inFlight = null }) }, - /** Stop polling; resolves after any in-flight poll settles. */ + /** + * Stop polling. Lets an in-flight poll drain for a short grace + * (a mid-flight apply should commit, not be cancelled), then + * aborts it — so the wait is bounded even against a stalled + * server or a `fetchFn` that ignores abort signals. + */ async stop() { stopped = true if (timer) { clearTimeout(timer) timer = null } - if (inFlight) await inFlight + if (inFlight) { + const grace = setTimeout(() => { + activeController?.abort(new Error('config pull loop stopped')) + }, stopGraceSeconds * 1000) + if (typeof grace.unref === 'function') grace.unref() + try { + await inFlight + } finally { + clearTimeout(grace) + } + } }, } } /** + * Read a response body under a hard byte cap without ever buffering + * past it: an oversized `Content-Length` is rejected before any read, + * and a chunked body is streamed through a byte counter that cancels + * the moment it crosses the cap. Responses without a readable stream + * (e.g. test doubles) fall back to `text()` with a post-hoc check. + * + * @param {Response} response + * @param {number} maxBytes + * @param {AbortSignal} signal + * @returns {Promise<{ ok: true, body: string } | { ok: false, bytesRead: number }>} + */ +async function readBodyCapped(response, maxBytes, signal) { + const contentLength = Number(response.headers.get('content-length')) + if (Number.isFinite(contentLength) && contentLength > maxBytes) { + if (response.body) await response.body.cancel().catch(() => {}) + return { ok: false, bytesRead: contentLength } + } + if (!response.body) { + const text = await abortable(response.text(), signal) + const bytes = Buffer.byteLength(text, 'utf8') + return bytes > maxBytes ? 
{ ok: false, bytesRead: bytes } : { ok: true, body: text } + } + const reader = response.body.getReader() + /** @type {Uint8Array[]} */ + const chunks = [] + let total = 0 + for (;;) { + const { done, value } = await abortable(reader.read(), signal) + if (done) break + total += value.byteLength + if (total > maxBytes) { + reader.cancel().catch(() => {}) + return { ok: false, bytesRead: total } + } + chunks.push(value) + } + return { ok: true, body: Buffer.concat(chunks).toString('utf8') } +} + +/** + * Await `promise`, but reject as soon as `signal` aborts — even when + * the underlying promise never settles. A misbehaving `fetchFn` (or a + * server that stalls mid-body) must not be able to wedge `stop()`, + * and through it daemon shutdown. + * + * @template T + * @param {Promise} promise + * @param {AbortSignal} signal + * @returns {Promise} + */ +function abortable(promise, signal) { + if (signal.aborted) return Promise.reject(abortReason(signal)) + return new Promise((resolve, reject) => { + const onAbort = () => reject(abortReason(signal)) + signal.addEventListener('abort', onAbort, { once: true }) + promise.then( + (value) => { + signal.removeEventListener('abort', onAbort) + resolve(value) + }, + (err) => { + signal.removeEventListener('abort', onAbort) + reject(err) + } + ) + }) +} + +/** @param {AbortSignal} signal */ +function abortReason(signal) { + return signal.reason instanceof Error ? signal.reason : new Error(String(signal.reason ?? 'aborted')) +} + +/** + * Parse a `Retry-After` header into whole seconds: delta-seconds or an + * HTTP-date, anything unparseable → `undefined` (callers fall back to + * the backoff ladder — a garbage header must not produce a zero-delay + * poll loop). Exported for direct unit tests. 
+ * * @param {string | null} value * @returns {number | undefined} */ -function parseRetryAfter(value) { +export function parseRetryAfter(value) { if (!value) return undefined const seconds = Number.parseInt(value, 10) if (Number.isInteger(seconds) && seconds >= 0) return seconds diff --git a/hypaware-core/smoke/flows/join_flow_remote_config.js b/hypaware-core/smoke/flows/join_flow_remote_config.js index f282b50..65ede1b 100644 --- a/hypaware-core/smoke/flows/join_flow_remote_config.js +++ b/hypaware-core/smoke/flows/join_flow_remote_config.js @@ -11,6 +11,10 @@ import { readConfigControlStatus } from '../../../src/core/config/apply.js' import { DAEMON_RESTART_EXIT_CODE, runDaemon } from '../../../src/core/daemon/runtime.js' import { dispatch } from '../../../src/core/cli/dispatch.js' +/** + * @import { AddressInfo } from 'node:net' + */ + /** * Join-flow smoke (LLP 0023): drives the full remote-config lifecycle * against a stub central server — @@ -327,7 +331,7 @@ async function startStubCentralServer() { }) await new Promise((resolve) => server.listen(0, '127.0.0.1', () => resolve(undefined))) - const address = /** @type {import('node:net').AddressInfo} */ (server.address()) + const address = /** @type {AddressInfo} */ (server.address()) return { baseUrl: `http://127.0.0.1:${address.port}`, diff --git a/llp/0023-remote-config-join-flow.spec.md b/llp/0023-remote-config-join-flow.spec.md index 08d0202..a40c40d 100644 --- a/llp/0023-remote-config-join-flow.spec.md +++ b/llp/0023-remote-config-join-flow.spec.md @@ -60,9 +60,17 @@ on their own timers when wired in" — this spec wires the config pull: config's etag persists across restarts so a relaunch short-circuits to 304; it is kernel-managed state read through the facade (below). - A pulled 200 body above **1 MiB** is dropped — enforced at both the - transport (before buffering completes its way into a parse) and the apply - engine. 
Wholesale-replace means an authenticated 200 goes straight into - memory and onto disk; the stated cap is one line of defense-in-depth. + transport and the apply engine. The transport check is a genuine memory + bound, not a post-hoc one: an oversized `Content-Length` is rejected + without reading, and a chunked body is streamed through a byte counter + that cancels the moment it crosses the cap. Wholesale-replace means an + authenticated 200 goes straight into memory and onto disk; the stated cap + is one line of defense-in-depth. +- Every poll runs under its own abort controller with a **hard request + deadline (30 s)** covering the request and the body read, and the loop's + `stop()` aborts an in-flight poll after a short drain grace (1 s) — a + stalled config GET must not be able to wedge daemon shutdown or a staged + restart behind it. - **`If-None-Match` must reflect the *running* config, never a downloaded-but-not-yet-applied one.** The server reads this header to track fleet convergence (it lands in the queryable `gateways` dataset), so a @@ -128,8 +136,8 @@ The central plugin is **transport only**: pull, ETag bookkeeping, auth. It hands a downloaded document to a narrow kernel facade — `ctx.configControl.stage(document, etag)`, plus `confirmPoll()` (poll liveness) and `runningEtag()` (for `If-None-Match`); the **kernel** owns -validate → install pinned plugins → persist last-known-good → swap → -restart, and the rollback bookkeeping. The facade exists only where an +shape-check → install pinned plugins → validate → persist last-known-good → +swap → restart, and the rollback bookkeeping. The facade exists only where an apply engine runs (the daemon); plain CLI boots leave `ctx.configControl` undefined and the plugin keeps its pull loop off — `hyp status` must not fire config polls as a side effect. Recorded in @@ -169,6 +177,17 @@ the artifact hash and treat a mismatch as an apply failure (→ rollback, below). 
The config names exactly one artifact; nothing may substitute code after authoring. +Install runs **before full validation**: catalog-backed validation can only +know a plugin once it is installed, so validating first would reject the +very config that names a not-yet-installed plugin. The apply engine instead +shape-checks the document (including the pin fields' types), installs the +pinned plugins it names, and only then runs full validation against the +freshly rebuilt catalog. Acting on a not-yet-fully-validated document is +bounded by the shape gate and the hash pin — an install can only bring in +the exact artifact the config authored — and plugin trees installed for a +config that then fails validation stay on disk by the same rule as rollback +(the lock records what is installed, not what is active). + ### Bundled first-party plugins First-party plugins ship bundled in the kernel package @@ -298,6 +317,8 @@ Three knobs the draft left open were fixed when the client landed: - **Maximum config document size: 1 MiB**, enforced at both the transport and the apply engine. - **Probation floor: 120 s** (`W = max(3 × poll_interval_seconds, 120 s)`). +- **Poll request deadline: 30 s** per poll (request + body read), with a + **1 s drain grace** before `stop()` aborts an in-flight poll. 
## Open questions diff --git a/src/core/config/apply.js b/src/core/config/apply.js index 1655cb4..1dc18b8 100644 --- a/src/core/config/apply.js +++ b/src/core/config/apply.js @@ -4,9 +4,10 @@ import fs from 'node:fs' import path from 'node:path' import { Attr, getLogger, withSpan } from '../observability/index.js' +import { parseConfigShape } from './schema.js' /** - * @import { ConfigControlFacade, ConfigStageResult, HypAwareV2Config, PluginConfigInstance } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { ConfigApplyErrorKind, ConfigControlFacade, ConfigStageResult, HypAwareV2Config, PluginConfigInstance } from '../../../collectivus-plugin-kernel-types.d.ts' * @import { * ConfigApplyDeps, * ConfigControl, @@ -48,9 +49,10 @@ const CONTROL_DIRNAME = 'config-control' const STATE_BASENAME = 'state.json' /** - * Build the kernel config apply engine: validate → install pinned - * plugins → persist to an A/B slot → flip the operative pointer → - * staged restart, plus probation and last-known-good rollback. + * Build the kernel config apply engine: shape-check → install pinned + * plugins → validate against the post-install catalog → persist to an + * A/B slot → flip the operative pointer → staged restart, plus + * probation and last-known-good rollback. 
* * Persistence idiom: each applied config is written to its own slot * file under `/config-control/`, with the served ETag in a @@ -262,7 +264,7 @@ export function createConfigControl(opts) { status: 'ok', }, async (span) => { - /** @param {import('../../../collectivus-plugin-kernel-types.d.ts').ConfigApplyErrorKind} errorKind @param {string} message */ + /** @param {ConfigApplyErrorKind} errorKind @param {string} message */ function fail(errorKind, message) { span.setAttribute('status', 'failed') span.setAttribute('error_kind', errorKind) @@ -308,16 +310,24 @@ export function createConfigControl(opts) { return fail('document_too_large', `config document exceeds ${MAX_CONFIG_DOCUMENT_BYTES} bytes`) } - const validation = await applyDeps.validateDocument(document) - if (!validation.ok) { - const first = validation.errors[0] + // Shape-gate, then install, then full validation. Catalog-backed + // validation can only know a plugin once it is installed, so a + // served config naming a not-yet-installed plugin must install + // first — but install must not act on an arbitrary document, so + // the shape (including the pin fields' types) is checked before + // anything is fetched, and the hash pin bounds what an install + // can bring in. + // @ref LLP 0023#install-on-config-hash-pinned [implements] — shape-gate → install pinned plugins → validate against the post-install catalog + const shape = parseConfigShape(document) + if (!shape.ok) { + const first = shape.errors[0] rememberBadEtag(etag, 'validation_failed') return fail( 'config_invalid', - first ? `${first.pointer || ''}: ${first.message}` : 'config validation failed' + first ? `${first.pointer || ''}: ${first.message}` : 'config shape invalid' ) } - const config = /** @type {HypAwareV2Config} */ (document) + const config = shape.config const install = await applyDeps.installPinnedPlugins(config.plugins ?? 
[]) if (!install.ok) { @@ -332,6 +342,16 @@ export function createConfigControl(opts) { return fail(install.errorKind, install.message) } + const validation = await applyDeps.validateDocument(document) + if (!validation.ok) { + const first = validation.errors[0] + rememberBadEtag(etag, 'validation_failed') + return fail( + 'config_invalid', + first ? `${first.pointer || ''}: ${first.message}` : 'config validation failed' + ) + } + try { commit(config, serialized, etag) } catch (err) { diff --git a/src/core/daemon/runtime.js b/src/core/daemon/runtime.js index e0931ec..76f5922 100644 --- a/src/core/daemon/runtime.js +++ b/src/core/daemon/runtime.js @@ -609,9 +609,9 @@ async function startConfiguredSources({ runtime, log, fileLog, sourcePluginByNam /** * Close every materialized sink instance. The central plugin's config - * pull and identity refresh timers stop in its `close()`, so shutdown - * must reach it even though sinks have no started/stopped lifecycle of - * their own. + * pull loop stops in its `close()` (identity refresh is lazy and has + * no timer), so shutdown must reach it even though sinks have no + * started/stopped lifecycle of their own. 
* * @param {{ runtime: KernelRuntime, fileLog: ReturnType }} args */ diff --git a/test/core/config-apply-deps.test.js b/test/core/config-apply-deps.test.js new file mode 100644 index 0000000..6049875 --- /dev/null +++ b/test/core/config-apply-deps.test.js @@ -0,0 +1,293 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import { spawn } from 'node:child_process' +import fs from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' + +import { buildConfigApplyDeps } from '../../src/core/config/apply_deps.js' +import { loadLock } from '../../src/core/plugin_install/install.js' +import { getEntry, writeLock } from '../../src/core/plugin_install/lock.js' + +/** + * @import { PluginName } from '../../collectivus-plugin-kernel-types.d.ts' + */ + +/** + * Pin enforcement is the apply path's core security property — nothing + * may substitute code after the config was authored (LLP 0023 + * install-on-config). The apply-engine tests mock these deps away, so + * the real decisions are exercised here against real fixtures: a + * fixture bundled workspace, a lock-backed installed plugin, and a + * local git repo standing in for a served artifact. + */ + +const HASH_A = 'a'.repeat(64) + +/** @param {string} dir @param {string} name @param {string} version */ +async function writePluginDir(dir, name, version) { + await fs.mkdir(dir, { recursive: true }) + await fs.writeFile( + path.join(dir, 'hypaware.plugin.json'), + JSON.stringify({ + schema_version: 1, + name, + version, + hypaware_api: '^1.0.0', + runtime: 'node', + entrypoint: './index.js', + }) + ) + await fs.writeFile(path.join(dir, 'index.js'), 'export async function activate(){}\n') +} + +/** + * A temp HYP state root plus a fixture bundled workspace holding a + * fake `@hypaware/otel` at a controlled version, so the bundled-pin + * checks don't depend on the real workspace's version numbers. 
+ */ +async function makeFixture() { + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'hyp-apply-deps-')) + const stateRoot = path.join(tmpRoot, 'state') + const workspaceDir = path.join(tmpRoot, 'workspace') + await writePluginDir(path.join(workspaceDir, 'otel'), '@hypaware/otel', '9.9.9') + return { + tmpRoot, + stateRoot, + workspaceDir, + cleanup: () => fs.rm(tmpRoot, { recursive: true, force: true }), + } +} + +test('bundled plugin: pinned version mismatch is a bundled_version_mismatch failure', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const result = await deps.installPinnedPlugins([ + { name: '@hypaware/otel', version: '1.0.0' }, + ]) + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'bundled_version_mismatch') + assert.ok(!result.ok && /1\.0\.0/.test(result.message) && /9\.9\.9/.test(result.message)) + } finally { + await fx.cleanup() + } +}) + +test('bundled plugin: matching version pin is satisfied without an install; hash is not checked', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + // The artifact_hash refers to a git release artifact that + // legitimately differs from the npm-bundled tree — a garbage hash + // must not fail a bundled pin (LLP 0023 bundled-first-party). 
+ const result = await deps.installPinnedPlugins([ + { name: '@hypaware/otel', version: '9.9.9', artifact_hash: 'f'.repeat(64) }, + ]) + assert.deepEqual(result, { ok: true }) + const lock = await loadLock(fx.stateRoot) + assert.equal(getEntry(lock, /** @type {PluginName} */ ('@hypaware/otel')), undefined) + } finally { + await fx.cleanup() + } +}) + +test('bundled plugin: an unpinned entry is satisfied by any bundled version', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const result = await deps.installPinnedPlugins([{ name: '@hypaware/otel' }]) + assert.deepEqual(result, { ok: true }) + } finally { + await fx.cleanup() + } +}) + +test('disabled entries are skipped entirely', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + // The unreachable source proves no install was attempted. 
+ const result = await deps.installPinnedPlugins([ + { name: '@third-party/off', enabled: false, source: `file://${fx.tmpRoot}/nonexistent` }, + ]) + assert.deepEqual(result, { ok: true }) + } finally { + await fx.cleanup() + } +}) + +test('an installed lock entry matching version + hash is satisfied without re-install', async () => { + const fx = await makeFixture() + try { + const installDir = path.join(fx.tmpRoot, 'installed-fixture') + await writePluginDir(installDir, '@third-party/installed-fixture', '1.0.0') + await writeLock(fx.stateRoot, { + schema_version: 1, + plugins: { + '@third-party/installed-fixture': { + name: '@third-party/installed-fixture', + version: '1.0.0', + source: { kind: 'local-dir', raw: installDir, path: installDir }, + install_dir: installDir, + content_hash: HASH_A, + manifest_hash: 'b'.repeat(64), + installed_at: '2026-06-12T00:00:00.000Z', + }, + }, + }) + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + // The unreachable source proves the satisfied entry never hits the + // install path. + const result = await deps.installPinnedPlugins([ + { + name: '@third-party/installed-fixture', + version: '1.0.0', + artifact_hash: HASH_A, + source: `file://${fx.tmpRoot}/nonexistent`, + }, + ]) + assert.deepEqual(result, { ok: true }) + + // A different pinned hash is NOT satisfied: the install path runs + // (and fails here on the unreachable source). 
+ const mismatched = await deps.installPinnedPlugins([ + { + name: '@third-party/installed-fixture', + version: '1.0.0', + artifact_hash: 'c'.repeat(64), + source: `file://${fx.tmpRoot}/nonexistent`, + }, + ]) + assert.equal(mismatched.ok, false) + assert.equal(!mismatched.ok && mismatched.errorKind, 'plugin_install_failed') + } finally { + await fx.cleanup() + } +}) + +test('a fetched artifact failing its hash pin is an artifact_hash_mismatch and nothing is installed', async () => { + const fx = await makeFixture() + const git = await buildGitPluginFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const result = await deps.installPinnedPlugins([ + { + name: '@third-party/pin-fixture', + source: git.sourceUrl, + version: '0.1.0', + artifact_hash: 'f'.repeat(64), + }, + ]) + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'artifact_hash_mismatch') + const lock = await loadLock(fx.stateRoot) + assert.equal(getEntry(lock, /** @type {PluginName} */ ('@third-party/pin-fixture')), undefined) + } finally { + await git.cleanup() + await fx.cleanup() + } +}) + +test('a correct hash pin installs, and validation then sees the plugin it could not know before', async () => { + // The install-before-validate ordering only works because a fresh + // catalog is discovered per call — this is the integration check for + // a served config naming a not-yet-installed plugin. 
+ const fx = await makeFixture() + const git = await buildGitPluginFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const document = { version: 2, plugins: [{ name: '@third-party/pin-fixture' }] } + + const before = await deps.validateDocument(document) + assert.equal(before.ok, false) + assert.ok(before.errors.some((e) => /pin-fixture/.test(e.message))) + + // Learn the artifact hash by installing unpinned once, then prove a + // config pinning that exact hash is accepted from a clean state. + const unpinned = await deps.installPinnedPlugins([ + { name: '@third-party/pin-fixture', source: git.sourceUrl, version: '0.1.0' }, + ]) + assert.deepEqual(unpinned, { ok: true }) + const lock = await loadLock(fx.stateRoot) + const entry = getEntry(lock, /** @type {PluginName} */ ('@third-party/pin-fixture')) + assert.ok(entry) + + const fresh = await makeFixture() + try { + const freshDeps = buildConfigApplyDeps({ + stateRoot: fresh.stateRoot, + workspaceDir: fresh.workspaceDir, + }) + const pinned = await freshDeps.installPinnedPlugins([ + { + name: '@third-party/pin-fixture', + source: git.sourceUrl, + version: '0.1.0', + artifact_hash: entry?.content_hash, + }, + ]) + assert.deepEqual(pinned, { ok: true }) + + const after = await freshDeps.validateDocument(document) + assert.equal(after.ok, true, JSON.stringify(after.errors)) + } finally { + await fresh.cleanup() + } + } finally { + await git.cleanup() + await fx.cleanup() + } +}) + +/* ---------- git fixture ---------- */ + +/** + * A bare local git repo serving a plugin tagged `v0.1.0`, standing in + * for a served config's pinned artifact source. 
+ */ +async function buildGitPluginFixture() { + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'hyp-apply-deps-git-')) + const workdir = path.join(tmpRoot, 'work') + await writePluginDir(workdir, '@third-party/pin-fixture', '0.1.0') + await runGit(['init', '-q', '-b', 'main'], { cwd: workdir }) + await runGit(['config', 'user.email', 'unit@hypaware.test'], { cwd: workdir }) + await runGit(['config', 'user.name', 'HypAware Test'], { cwd: workdir }) + await runGit(['add', '.'], { cwd: workdir }) + await runGit(['commit', '--quiet', '--no-gpg-sign', '-m', 'initial'], { cwd: workdir }) + await runGit(['tag', 'v0.1.0'], { cwd: workdir }) + + const bareRepoDir = path.join(tmpRoot, 'bare.git') + await runGit(['init', '--bare', '-q', '-b', 'main', bareRepoDir]) + await runGit(['remote', 'add', 'origin', bareRepoDir], { cwd: workdir }) + await runGit(['push', '--quiet', 'origin', 'main', '--tags'], { cwd: workdir }) + + return { + sourceUrl: `file://${bareRepoDir}`, + cleanup: () => fs.rm(tmpRoot, { recursive: true, force: true }), + } +} + +/** + * @param {string[]} args + * @param {{ cwd?: string }} [opts] + * @returns {Promise} + */ +function runGit(args, opts = {}) { + return new Promise((resolve, reject) => { + const child = spawn('git', args, { + cwd: opts.cwd, + env: { ...process.env, GIT_TERMINAL_PROMPT: '0' }, + stdio: ['ignore', 'ignore', 'pipe'], + }) + /** @type {Buffer[]} */ + const stderrChunks = [] + child.stderr.on('data', (chunk) => stderrChunks.push(chunk)) + child.on('close', (code) => { + if (code === 0) resolve() + else reject(new Error(`git ${args.join(' ')} exited ${code}: ${Buffer.concat(stderrChunks)}`)) + }) + }) +} diff --git a/test/core/config-apply.test.js b/test/core/config-apply.test.js index 1f76071..de94f8b 100644 --- a/test/core/config-apply.test.js +++ b/test/core/config-apply.test.js @@ -17,7 +17,8 @@ import { import { parseConfigShape } from '../../src/core/config/schema.js' /** - * @import { ConfigApplyDeps } from 
'../../src/core/config/types.d.ts' + * @import { PluginConfigInstance } from '../../collectivus-plugin-kernel-types.d.ts' + * @import { ConfigApplyDeps, PinnedInstallResult } from '../../src/core/config/types.d.ts' */ const SEED_CONFIG = { @@ -52,23 +53,27 @@ async function makeFixture() { } /** - * @param {{ validateOk?: boolean, installResult?: import('../../src/core/config/types.d.ts').PinnedInstallResult }} [opts] - * @returns {ConfigApplyDeps & { validateCalls: number, installCalls: number }} + * @param {{ validateOk?: boolean, installResult?: PinnedInstallResult }} [opts] + * @returns {ConfigApplyDeps & { validateCalls: number, installCalls: number, calls: string[] }} */ function makeDeps(opts = {}) { const deps = { validateCalls: 0, installCalls: 0, + /** @type {string[]} */ + calls: [], /** @param {unknown} _document */ async validateDocument(_document) { deps.validateCalls += 1 + deps.calls.push('validate') return opts.validateOk === false ? { ok: false, errors: [{ pointer: '/plugins/0', message: 'nope' }] } : { ok: true, errors: [] } }, - /** @param {import('../../../collectivus-plugin-kernel-types.d.ts').PluginConfigInstance[]} _entries */ + /** @param {PluginConfigInstance[]} _entries */ async installPinnedPlugins(_entries) { deps.installCalls += 1 + deps.calls.push('install') return opts.installResult ?? { ok: true } }, } @@ -182,6 +187,34 @@ test('validation failure remembers the bad etag and leaves the config untouched' assert.equal(status.runningEtag, null) }) +test('pinned plugins install before full validation, so a config can name a not-yet-installed plugin', async () => { + // Catalog-backed validation only knows a plugin once it is installed; + // install-on-config breaks if validation runs first (LLP 0023 + // install-on-config). The shape gate runs before install instead. 
+ const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps() + control.attachApplyDeps(deps) + + const result = await control.stage(REMOTE_CONFIG, 'etag-order') + assert.equal(result.ok, true) + assert.deepEqual(deps.calls, ['install', 'validate']) +}) + +test('a shape-invalid document is rejected before any install runs', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps() + control.attachApplyDeps(deps) + + const result = await control.stage({ version: 1 }, 'etag-shape') + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'config_invalid') + assert.equal(deps.installCalls, 0) + const status = await control.status() + assert.equal(status.badEtag?.reason, 'validation_failed') +}) + test('a remembered bad etag backs off re-apply until the etag changes', async () => { const { stateRoot, configPath } = await makeFixture() const { control } = makeControl({ stateRoot, configPath }) diff --git a/test/core/daemon.test.js b/test/core/daemon.test.js index dd4ff7c..0da5156 100644 --- a/test/core/daemon.test.js +++ b/test/core/daemon.test.js @@ -196,7 +196,9 @@ test('installers default to relaunch-on-exit (staged restart requirement, LLP 00 test('the staged-restart exit code is distinct from success and error exits', async () => { const { DAEMON_RESTART_EXIT_CODE } = await import('../../src/core/daemon/runtime.js') assert.equal(typeof DAEMON_RESTART_EXIT_CODE, 'number') - assert.ok(DAEMON_RESTART_EXIT_CODE !== 0 && DAEMON_RESTART_EXIT_CODE !== 1 && DAEMON_RESTART_EXIT_CODE !== 2) + /** @type {number} */ + const code = DAEMON_RESTART_EXIT_CODE + assert.ok(code !== 0 && code !== 1 && code !== 2) }) test('runDaemon reload refreshes plugin config before source.reload', async () => { diff --git a/test/plugins/central-config-pull.test.js 
b/test/plugins/central-config-pull.test.js index a3036cb..d2bf3f4 100644 --- a/test/plugins/central-config-pull.test.js +++ b/test/plugins/central-config-pull.test.js @@ -6,6 +6,7 @@ import assert from 'node:assert/strict' import { MAX_CONFIG_DOCUMENT_BYTES, createConfigPullLoop, + parseRetryAfter, } from '../../hypaware-core/plugins-workspace/central/src/config_client.js' function makeLog() { @@ -42,7 +43,12 @@ function makeControl(opts = {}) { } } -/** @param {Array<{ status: number, headers?: Record, body?: string }>} responses */ +/** + * Real `Response` objects so the transport path under test (streamed + * body reads, abort signals) matches what `fetch` actually returns. + * + * @param {Array<{ status: number, headers?: Record, body?: string }>} responses + */ function makeFetch(responses) { /** @type {Array<{ url: string, headers: Record }>} */ const requests = [] @@ -53,13 +59,9 @@ function makeFetch(responses) { headers: /** @type {Record} */ (init?.headers ?? {}), }) const next = responses.shift() ?? { status: 500 } - const headers = new Headers(next.headers ?? {}) - return /** @type {Response} */ (/** @type {unknown} */ ({ - status: next.status, - ok: next.status >= 200 && next.status < 300, - headers, - async text() { return next.body ?? '' }, - })) + // Response forbids a body on null-body statuses (204/304). + const body = next.status === 204 || next.status === 304 ? null : next.body ?? null + return new Response(body, { status: next.status, headers: next.headers ?? 
{} }) } return { fetchFn, requests } } @@ -231,3 +233,138 @@ test('transport errors back off and keep the loop alive', async () => { assert.equal(calls, 1) assert.ok(log.rows.some((r) => r.message === 'central.config.poll_failed')) }) + +test('an oversized Content-Length is rejected without reading the body', async () => { + const control = makeControl() + /** @type {typeof fetch} */ + const fetchFn = async () => { + // A stream that never produces and never closes: only the + // Content-Length pre-reject can finish this poll promptly — the + // streaming counter would wait on it until the deadline. + const stream = new ReadableStream({ pull() {} }) + const response = new Response(stream, { status: 200, headers: { etag: 'rev-huge' } }) + response.headers.set('content-length', String(MAX_CONFIG_DOCUMENT_BYTES + 1)) + return response + } + const { loop, log } = makeLoop({ configControl: control, fetchFn, requestTimeoutSeconds: 600 }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + const row = log.rows.find((r) => r.fields.error_kind === 'config_document_too_large') + assert.ok(row, 'expected the Content-Length pre-reject to fire') + // body_bytes reports the declared length — the streaming path could + // never have observed this number from an empty stream. + assert.equal(row?.fields.body_bytes, MAX_CONFIG_DOCUMENT_BYTES + 1) +}) + +test('a chunked oversized body is cancelled at the cap, not buffered whole', async () => { + const control = makeControl() + const chunk = new TextEncoder().encode('x'.repeat(64 * 1024)) + let chunksServed = 0 + /** @type {typeof fetch} */ + const fetchFn = async () => { + // Endless chunked stream with no Content-Length: only the byte + // counter can stop this one. 
+ const stream = new ReadableStream({ + pull(controller) { + chunksServed += 1 + controller.enqueue(chunk) + }, + }) + return new Response(stream, { status: 200, headers: { etag: 'rev-endless' } }) + } + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_document_too_large')) + // Reads stop within one chunk of the cap instead of draining forever. + assert.ok( + chunksServed <= MAX_CONFIG_DOCUMENT_BYTES / chunk.byteLength + 2, + `expected the read to stop at the cap, served ${chunksServed} chunks` + ) +}) + +test('stop() aborts a poll stuck on a never-resolving fetch after the drain grace', async () => { + const control = makeControl() + /** @type {typeof fetch} */ + const fetchFn = () => new Promise(() => {}) + // Long request timeout: the stop-grace abort, not the deadline, is + // what must unblock shutdown here. + const { loop } = makeLoop({ + configControl: control, + fetchFn, + requestTimeoutSeconds: 600, + stopGraceSeconds: 0.02, + }) + loop.start() + const before = Date.now() + await loop.stop() + assert.ok(Date.now() - before < 5000, 'stop() must not wait out the request timeout') + assert.deepEqual(control.staged, []) +}) + +test('the request deadline aborts a stalled poll and the loop stays alive', async () => { + const control = makeControl() + let aborted = false + /** @type {typeof fetch} */ + const fetchFn = (_url, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener('abort', () => { + aborted = true + reject(init.signal?.reason ?? 
new Error('aborted')) + }) + }) + const { loop, log } = makeLoop({ + configControl: control, + fetchFn, + requestTimeoutSeconds: 0.02, + }) + loop.start() + await new Promise((resolve) => setTimeout(resolve, 100)) + await loop.stop() + assert.equal(aborted, true) + const row = log.rows.find((r) => r.fields.error_kind === 'config_poll_error') + assert.ok(row, 'expected the timed-out poll to log a failure') + assert.match(String(row?.fields.message), /exceeded/) +}) + +test('429 with Retry-After schedules from the header without confirming the poll', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([{ status: 429, headers: { 'retry-after': '7' } }]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(control.confirms, 0) + assert.deepEqual(control.staged, []) + const row = log.rows.find((r) => r.fields.error_kind === 'config_poll_throttled') + assert.ok(row) + assert.equal(row?.fields.http_status, 429) + assert.equal(row?.fields.retry_after_seconds, 7) +}) + +test('503 with a garbage Retry-After falls back to the backoff ladder', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([{ status: 503, headers: { 'retry-after': 'soonish' } }]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(control.confirms, 0) + const row = log.rows.find((r) => r.fields.error_kind === 'config_poll_throttled') + assert.ok(row) + assert.equal(row?.fields.http_status, 503) + assert.equal('retry_after_seconds' in (row?.fields ?? {}), false) +}) + +test('parseRetryAfter: delta-seconds, HTTP-date, and garbage', () => { + assert.equal(parseRetryAfter('7'), 7) + assert.equal(parseRetryAfter('0'), 0) + // An HTTP-date resolves to a non-negative whole-second delay. 
+ const future = parseRetryAfter(new Date(Date.now() + 30_000).toUTCString()) + assert.ok(typeof future === 'number' && future >= 28 && future <= 31, `got ${future}`) + // A past date clamps to zero rather than going negative. + assert.equal(parseRetryAfter(new Date(Date.now() - 60_000).toUTCString()), 0) + assert.equal(parseRetryAfter('soonish'), undefined) + assert.equal(parseRetryAfter(''), undefined) + assert.equal(parseRetryAfter(null), undefined) +}) From 5f66b71b8dd9181888c004b3dc5eb1da10469a0c Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 14:31:35 -0700 Subject: [PATCH 6/7] Don't unref the poll deadline and stop-grace timers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An unref'd timer can't fire once a wedged fetch is the only live handle: the event loop drains, so the deadline/grace abort never happens — in CI this surfaced as node:test cancelling the pull-loop tests with 'Promise resolution is still pending but the event loop has already resolved'. Both timers are cleared as soon as the poll settles, and the loop's policy is no-unref anyway. Co-Authored-By: Claude Fable 5 --- .../plugins-workspace/central/src/config_client.js | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hypaware-core/plugins-workspace/central/src/config_client.js b/hypaware-core/plugins-workspace/central/src/config_client.js index bb23c0c..59ae1a7 100644 --- a/hypaware-core/plugins-workspace/central/src/config_client.js +++ b/hypaware-core/plugins-workspace/central/src/config_client.js @@ -109,11 +109,13 @@ export function createConfigPullLoop(args) { async function pollOnce() { const controller = new AbortController() activeController = controller + // Not unref'd (matching the loop's no-unref policy): the deadline + // must be able to fire while a wedged poll is the only live + // handle, and it is cleared as soon as the poll settles. 
const deadline = setTimeout( () => controller.abort(new Error(`config poll exceeded ${requestTimeoutSeconds}s`)), requestTimeoutSeconds * 1000 ) - if (typeof deadline.unref === 'function') deadline.unref() let nextDelay = pollIntervalSeconds try { const outcome = await pull(controller.signal) @@ -312,10 +314,12 @@ export function createConfigPullLoop(args) { timer = null } if (inFlight) { + // Not unref'd: against a fetch wedged on the last live handle, + // an unref'd grace timer would let the event loop drain before + // it ever fired, leaving stop() hanging on the poll forever. const grace = setTimeout(() => { activeController?.abort(new Error('config pull loop stopped')) }, stopGraceSeconds * 1000) - if (typeof grace.unref === 'function') grace.unref() try { await inFlight } finally { From 93f95cface74362096ec83a6fba3307b08f48098 Mon Sep 17 00:00:00 2001 From: Phillip Cunliffe Date: Fri, 12 Jun 2026 14:37:08 -0700 Subject: [PATCH 7/7] Renumber remote-config spec to LLP 0025 (0024 is claimed by open PR #99) Co-Authored-By: Claude Fable 5 --- .../plugins-workspace/central/index.js | 2 +- .../plugins-workspace/central/proto.md | 6 +++--- .../central/src/config_client.js | 2 +- .../plugins-workspace/central/src/types.d.ts | 2 +- .../smoke/flows/join_flow_remote_config.js | 4 ++-- llp/0000-hypaware.explainer.md | 2 +- llp/0003-core-vs-plugin-surface.spec.md | 2 +- llp/0011-setup-and-onboarding.decision.md | 2 +- llp/0017-daemon-runtime.decision.md | 4 ++-- ...md => 0025-remote-config-join-flow.spec.md} | 2 +- src/core/cli/core_commands.js | 4 ++-- src/core/config/apply.js | 18 +++++++++--------- src/core/config/apply_deps.js | 4 ++-- src/core/config/schema.js | 2 +- src/core/config/types.d.ts | 2 +- src/core/daemon/runtime.js | 2 +- src/core/daemon/status.js | 2 +- src/core/daemon/types.d.ts | 2 +- src/core/runtime/activation.js | 2 +- test/core/config-apply-deps.test.js | 4 ++-- test/core/config-apply.test.js | 2 +- 21 files changed, 36 insertions(+), 36 
deletions(-) rename llp/{0024-remote-config-join-flow.spec.md => 0025-remote-config-join-flow.spec.md} (99%) diff --git a/hypaware-core/plugins-workspace/central/index.js b/hypaware-core/plugins-workspace/central/index.js index 76dde7c..ba0916f 100644 --- a/hypaware-core/plugins-workspace/central/index.js +++ b/hypaware-core/plugins-workspace/central/index.js @@ -70,7 +70,7 @@ export async function activate(ctx) { if (!configControl) return sink - // @ref LLP 0024#config-pull-loop [implements] — pull immediately on bootstrap success, then on the steady timer + // @ref LLP 0025#config-pull-loop [implements] — pull immediately on bootstrap success, then on the steady timer const pullLoop = createConfigPullLoop({ centralUrl: config.url, identityClient, diff --git a/hypaware-core/plugins-workspace/central/proto.md b/hypaware-core/plugins-workspace/central/proto.md index 5e899bf..d714a5a 100644 --- a/hypaware-core/plugins-workspace/central/proto.md +++ b/hypaware-core/plugins-workspace/central/proto.md @@ -83,7 +83,7 @@ Headers (request): downloaded-but-not-yet-applied one. The server reads this header to track fleet convergence, so a gateway mid-install/mid-apply keeps presenting its old etag until the new config has taken effect -(LLP 0024). +(LLP 0025). Response 200: @@ -99,11 +99,11 @@ Response 200: The body is a full HypAware v2 config and replaces the gateway's operative config wholesale. Plugin entries are pinned by **version + artifact content hash**; the gateway verifies the artifact hash on -install and treats a mismatch as an apply failure (LLP 0024). +install and treats a mismatch as an apply failure (LLP 0025). `ETag: ` accompanies every 200 response. Clients persist the etag of the *running* config in kernel-managed state (it transitions -atomically with the operative config on apply and rollback — LLP 0024) +atomically with the operative config on apply and rollback — LLP 0025) so a restart short-circuits to 304 instead of re-pulling and re-validating. 
diff --git a/hypaware-core/plugins-workspace/central/src/config_client.js b/hypaware-core/plugins-workspace/central/src/config_client.js index 367720b..88869e8 100644 --- a/hypaware-core/plugins-workspace/central/src/config_client.js +++ b/hypaware-core/plugins-workspace/central/src/config_client.js @@ -78,7 +78,7 @@ const LEGACY_404_BACKOFF_SECONDS = 300 * log: PluginLogger, * fetchFn?: typeof fetch, * }} args - * @ref LLP 0024#config-pull-loop [implements] — immediate pull on bootstrap success, then a steady plugin-internal timer + * @ref LLP 0025#config-pull-loop [implements] — immediate pull on bootstrap success, then a steady plugin-internal timer */ export function createConfigPullLoop(args) { const { centralUrl, identityClient, configControl, log } = args diff --git a/hypaware-core/plugins-workspace/central/src/types.d.ts b/hypaware-core/plugins-workspace/central/src/types.d.ts index e30e68b..42706b7 100644 --- a/hypaware-core/plugins-workspace/central/src/types.d.ts +++ b/hypaware-core/plugins-workspace/central/src/types.d.ts @@ -50,7 +50,7 @@ export interface CentralSinkConfig { * Poll cadence (seconds) for the config-pull loop. Default 300s * (5 minutes) — 304s are cheap, and propagation latency equals this * cadence (no push channel in V1). The running config's etag is - * kernel-managed (LLP 0024); the plugin reads it through the + * kernel-managed (LLP 0025); the plugin reads it through the * `configControl` facade, so there is no plugin-side sidecar path. 
*/ poll_interval_seconds?: number diff --git a/hypaware-core/smoke/flows/join_flow_remote_config.js b/hypaware-core/smoke/flows/join_flow_remote_config.js index fe7bb84..a2195f3 100644 --- a/hypaware-core/smoke/flows/join_flow_remote_config.js +++ b/hypaware-core/smoke/flows/join_flow_remote_config.js @@ -16,7 +16,7 @@ import { dispatch } from '../../../src/core/cli/dispatch.js' */ /** - * Join-flow smoke (LLP 0024): drives the full remote-config lifecycle + * Join-flow smoke (LLP 0025): drives the full remote-config lifecycle * against a stub central server — * * join (seed write) → seed boot → identity bootstrap → config pull @@ -37,7 +37,7 @@ import { dispatch } from '../../../src/core/cli/dispatch.js' * `config.probation_cleared` log rows, `join.run` span. * * @param {{ harness: any, expect: any }} args - * @ref LLP 0024#the-join-sequence [tests] — seed → bootstrap → pull → apply → restart → operational, end to end against a stub server + * @ref LLP 0025#the-join-sequence [tests] — seed → bootstrap → pull → apply → restart → operational, end to end against a stub server */ export async function run({ harness, expect }) { const obs = installObservability() diff --git a/llp/0000-hypaware.explainer.md b/llp/0000-hypaware.explainer.md index 2412ac1..a249b69 100644 --- a/llp/0000-hypaware.explainer.md +++ b/llp/0000-hypaware.explainer.md @@ -72,7 +72,7 @@ plugin that registers a dataset gets query and formatting for free. 
| Observability & self-instrumentation | [0021](./0021-observability.spec.md) | Spec | | Iceberg export partitioning | [0022](./0022-iceberg-export-partitioning.spec.md) | Spec | | Context-graph T0 projection | [0023](./0023-context-graph-projection.decision.md) | Decision | -| Remote config & join flow | [0024](./0024-remote-config-join-flow.spec.md) | Spec | +| Remote config & join flow | [0025](./0025-remote-config-join-flow.spec.md) | Spec | ## Where to start diff --git a/llp/0003-core-vs-plugin-surface.spec.md b/llp/0003-core-vs-plugin-surface.spec.md index ff15d5a..8d8a5cf 100644 --- a/llp/0003-core-vs-plugin-surface.spec.md +++ b/llp/0003-core-vs-plugin-surface.spec.md @@ -32,7 +32,7 @@ copy-pasted into every plugin, it belongs in core. install pinned plugins, persist last-known-good, swap, staged restart, rollback bookkeeping. Exposed to plugins as a narrow context facade; the document's *transport* (e.g. `@hypaware/central`'s pull loop) is plugin - domain. See [LLP 0024](./0024-remote-config-join-flow.spec.md#apply-engine-is-kernel-surface). + domain. See [LLP 0025](./0025-remote-config-join-flow.spec.md#apply-engine-is-kernel-surface). ## Intrinsic, not plugin-provided diff --git a/llp/0011-setup-and-onboarding.decision.md b/llp/0011-setup-and-onboarding.decision.md index 2bf7d83..6142a39 100644 --- a/llp/0011-setup-and-onboarding.decision.md +++ b/llp/0011-setup-and-onboarding.decision.md @@ -60,4 +60,4 @@ For centrally-managed gateways, `hypaware join ` writes a seed config (central plugin only) and performs the non-interactive daemon install; the full config arrives from the server at join. It is sugar over "write the config file + install the daemon", not a separate path. See -[LLP 0024](./0024-remote-config-join-flow.spec.md#seed-config-mode). +[LLP 0025](./0025-remote-config-join-flow.spec.md#seed-config-mode). 
diff --git a/llp/0017-daemon-runtime.decision.md b/llp/0017-daemon-runtime.decision.md index 27b317f..ab5fb35 100644 --- a/llp/0017-daemon-runtime.decision.md +++ b/llp/0017-daemon-runtime.decision.md @@ -5,7 +5,7 @@ **Systems:** Daemon **Author:** Phil / Claude **Date:** 2026-06-01 -**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014, LLP 0024 +**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014, LLP 0025 > The primary daemon and how it is installed. Decomposed from the V1 finishing > plan (`finish-v1` Phases 3–4, now tombstoned) and `hypaware-design.md`. @@ -31,7 +31,7 @@ long-lived host that drives them together. ## Staged restart for config replacement When the operative config is **replaced wholesale** — remote config apply -([LLP 0024](./0024-remote-config-join-flow.spec.md#apply-semantics-staged-restart)), +([LLP 0025](./0025-remote-config-join-flow.spec.md#apply-semantics-staged-restart)), or any change to the plugin set or installed plugin code — the daemon does **not** reload in place. It persists the new config and **exits; the service manager relaunches it** onto the new config. diff --git a/llp/0024-remote-config-join-flow.spec.md b/llp/0025-remote-config-join-flow.spec.md similarity index 99% rename from llp/0024-remote-config-join-flow.spec.md rename to llp/0025-remote-config-join-flow.spec.md index 65e78d7..8c0ff12 100644 --- a/llp/0024-remote-config-join-flow.spec.md +++ b/llp/0025-remote-config-join-flow.spec.md @@ -1,4 +1,4 @@ -# LLP 0024: Remote Config and Join Flow +# LLP 0025: Remote Config and Join Flow **Type:** Spec **Status:** Active diff --git a/src/core/cli/core_commands.js b/src/core/cli/core_commands.js index b680102..23a5779 100644 --- a/src/core/cli/core_commands.js +++ b/src/core/cli/core_commands.js @@ -511,7 +511,7 @@ function renderStatusJson({ report, clientNames, datasets, cacheRoot }) { oldest_partition_date: report.cache.oldestDate, }, recent_error_count: report.recentErrorCount, - // Remote-config apply state (LLP 0024). 
All-null until the gateway + // Remote-config apply state (LLP 0025). All-null until the gateway // applies its first centrally-served config. remote_config: report.remoteConfig ? { @@ -2849,7 +2849,7 @@ async function runInitFromFile(flags, ctx) { * * @param {string[]} argv * @param {CommandRunContext} ctx - * @ref LLP 0024#seed-config-mode [implements] — join = write-seed-config + daemon install; a wrapper, not a second code path + * @ref LLP 0025#seed-config-mode [implements] — join = write-seed-config + daemon install; a wrapper, not a second code path */ async function runJoin(argv, ctx) { const parsed = parseJoinArgs(argv) diff --git a/src/core/config/apply.js b/src/core/config/apply.js index 2097945..4cbb5bb 100644 --- a/src/core/config/apply.js +++ b/src/core/config/apply.js @@ -25,7 +25,7 @@ import { parseConfigShape } from './schema.js' * parsed and persisted wholesale, so a stated cap bounds memory and * disk regardless of what an authenticated server sends. 1 MiB is * orders of magnitude above any real config. - * @ref LLP 0024#config-pull-loop [implements] — max accepted config document size, settled at 1 MiB + * @ref LLP 0025#config-pull-loop [implements] — max accepted config document size, settled at 1 MiB */ export const MAX_CONFIG_DOCUMENT_BYTES = 1024 * 1024 @@ -41,7 +41,7 @@ export const DEFAULT_POLL_INTERVAL_SECONDS = 300 * Probation window floor (seconds). The window is * `max(3 × poll_interval_seconds, floor)` so a fast poll cadence still * leaves room for daemon relaunch + identity refresh + one retry. 
- * @ref LLP 0024#post-apply-probation [implements] — window formula with the floor settled at 120s + * @ref LLP 0025#post-apply-probation [implements] — window formula with the floor settled at 120s */ export const PROBATION_FLOOR_SECONDS = 120 @@ -64,7 +64,7 @@ const STATE_BASENAME = 'state.json' * * @param {CreateConfigControlOptions} opts * @returns {ConfigControl} - * @ref LLP 0024#apply-engine-is-kernel-surface [implements] — the engine is kernel-owned; plugins only see the narrow facade + * @ref LLP 0025#apply-engine-is-kernel-surface [implements] — the engine is kernel-owned; plugins only see the narrow facade */ export function createConfigControl(opts) { const { stateRoot, configPath, requestRestart } = opts @@ -135,7 +135,7 @@ export function createConfigControl(opts) { * @param {ProbationMarker} marker * @param {ConfigRollbackReason} reason * @param {string} [detail] - * @ref LLP 0024#last-known-good-rollback [implements] — flip back + remembered bad etag + structured reason, recorded client-side from day one + * @ref LLP 0025#last-known-good-rollback [implements] — flip back + remembered bad etag + structured reason, recorded client-side from day one */ function rollback(marker, reason, detail) { if (marker.previous_slot) { @@ -175,7 +175,7 @@ export function createConfigControl(opts) { * Expiry rolls back and requests a staged restart onto * last-known-good. The kernel owns this timer — a wedged central * sink is exactly the failure probation must catch. 
- * @ref LLP 0024#post-apply-probation [implements] — kernel-owned watchdog, independent of the central plugin functioning + * @ref LLP 0025#post-apply-probation [implements] — kernel-owned watchdog, independent of the central plugin functioning */ function armProbationWatchdog() { disarmProbationWatchdog() @@ -205,7 +205,7 @@ export function createConfigControl(opts) { * kernel-killing-but-valid config can crashloop under the service * manager faster than any in-process timer fires, so each relaunch * checks the marker first. - * @ref LLP 0024#post-apply-probation [implements] — probation expiry is evaluated at boot, before plugin activation + * @ref LLP 0025#post-apply-probation [implements] — probation expiry is evaluated at boot, before plugin activation */ async function evaluateAtBoot() { const state = readState() @@ -317,7 +317,7 @@ export function createConfigControl(opts) { // the shape (including the pin fields' types) is checked before // anything is fetched, and the hash pin bounds what an install // can bring in. 
- // @ref LLP 0024#install-on-config-hash-pinned [implements] — shape-gate → install pinned plugins → validate against the post-install catalog + // @ref LLP 0025#install-on-config-hash-pinned [implements] — shape-gate → install pinned plugins → validate against the post-install catalog const shape = parseConfigShape(document) if (!shape.ok) { const first = shape.errors[0] @@ -400,7 +400,7 @@ export function createConfigControl(opts) { * @param {HypAwareV2Config} config * @param {string} serialized * @param {string} etag - * @ref LLP 0024#apply-semantics-staged-restart [implements] — A/B slots with an atomic pointer; never live-mutate; restart does the activation + * @ref LLP 0025#apply-semantics-staged-restart [implements] — A/B slots with an atomic pointer; never live-mutate; restart does the activation */ function commit(config, serialized, etag) { fs.mkdirSync(controlDir, { recursive: true, mode: 0o700 }) @@ -525,7 +525,7 @@ function readRunningEtag(controlDir, configPath) { * * @param {{ stateRoot: string, configPath: string }} args * @returns {ConfigControlStatus} - * @ref LLP 0024#last-known-good-rollback [implements] — operator-visible probation/rollback/bad-etag state without log spelunking + * @ref LLP 0025#last-known-good-rollback [implements] — operator-visible probation/rollback/bad-etag state without log spelunking */ export function readConfigControlStatus({ stateRoot, configPath }) { const controlDir = path.join(stateRoot, CONTROL_DIRNAME) diff --git a/src/core/config/apply_deps.js b/src/core/config/apply_deps.js index 285ec6a..6b303f9 100644 --- a/src/core/config/apply_deps.js +++ b/src/core/config/apply_deps.js @@ -69,7 +69,7 @@ export function buildConfigApplyDeps(opts) { * * @param {PluginConfigInstance[]} entries * @returns {Promise} - * @ref LLP 0024#install-on-config-hash-pinned [implements] — existing LLP 0007 install path; hash mismatch is an apply failure + * @ref LLP 0025#install-on-config-hash-pinned [implements] — existing LLP 0007 
install path; hash mismatch is an apply failure */ async function installPinnedPlugins(entries) { const { bundled, installed } = await discover() @@ -86,7 +86,7 @@ export function buildConfigApplyDeps(opts) { const bundledVersion = bundledVersions.get(entry.name) if (bundledVersion !== undefined) { - // @ref LLP 0024#bundled-first-party-plugins [implements] — version checked strictly, artifact hash not checked for bundled plugins + // @ref LLP 0025#bundled-first-party-plugins [implements] — version checked strictly, artifact hash not checked for bundled plugins if (entry.version !== undefined && entry.version !== bundledVersion) { return { ok: false, diff --git a/src/core/config/schema.js b/src/core/config/schema.js index 9e6af81..c63d5b7 100644 --- a/src/core/config/schema.js +++ b/src/core/config/schema.js @@ -343,7 +343,7 @@ function parsePluginEntry(entry, pointer, errors) { if (obj.config !== undefined && !isPlainObject(obj.config)) { errors.push({ pointer: `${pointer}/config`, message: 'config must be an object when present' }) } - // Pin fields set by centrally-served configs (LLP 0024). Optional in + // Pin fields set by centrally-served configs (LLP 0025). Optional in // hand-written configs; the apply engine enforces them when present. for (const key of /** @type {const} */ (['version', 'artifact_hash', 'source'])) { if (obj[key] !== undefined && !isNonEmptyString(obj[key])) { diff --git a/src/core/config/types.d.ts b/src/core/config/types.d.ts index 0fc5ccf..95ac807 100644 --- a/src/core/config/types.d.ts +++ b/src/core/config/types.d.ts @@ -94,7 +94,7 @@ export interface ValidateResult { } // ============================================================================= -// Config apply engine (LLP 0024) +// Config apply engine (LLP 0025) // ============================================================================= /** Structured rollback reason recorded by the apply engine. 
*/ diff --git a/src/core/daemon/runtime.js b/src/core/daemon/runtime.js index 5638bd5..d2dc82f 100644 --- a/src/core/daemon/runtime.js +++ b/src/core/daemon/runtime.js @@ -150,7 +150,7 @@ export async function runDaemon(opts = {}) { writeStatusFile(stateRoot, status) fileLog.info('daemon.starting', { config_path: opts.configPath ?? null }) - // ----- Config apply engine (LLP 0024) ----- + // ----- Config apply engine (LLP 0025) ----- // Created before bootKernel so probation expiry is evaluated before // any plugin activates: a kernel-killing-but-valid config that // crashloops under the service manager may never live long enough diff --git a/src/core/daemon/status.js b/src/core/daemon/status.js index 29f689d..b9a8267 100644 --- a/src/core/daemon/status.js +++ b/src/core/daemon/status.js @@ -405,7 +405,7 @@ export async function collectHypAwareStatus(opts = {}) { const cacheRoot = opts.runtime?.storage?.cacheRoot ?? path.join(stateRoot, 'cache') const cache = await measureCacheStats(cacheRoot) - // ----- remote config apply state (LLP 0024) ----- + // ----- remote config apply state (LLP 0025) ----- /** @type {ConfigControlStatus | null} */ let remoteConfig = null try { diff --git a/src/core/daemon/types.d.ts b/src/core/daemon/types.d.ts index a095f6b..77237cd 100644 --- a/src/core/daemon/types.d.ts +++ b/src/core/daemon/types.d.ts @@ -141,7 +141,7 @@ export interface HypAwareStatusReport { diagnostics: StatusDiagnostic[] overall: 'healthy' | 'degraded' /** - * Remote-config apply state (LLP 0024): probation, last rollback + + * Remote-config apply state (LLP 0025): probation, last rollback + * structured reason, remembered bad etag, and the running config's * etag. Null only when the probe itself failed; a gateway that has * never applied a remote config reports all-null fields. 
diff --git a/src/core/runtime/activation.js b/src/core/runtime/activation.js index 8793799..5379e62 100644 --- a/src/core/runtime/activation.js +++ b/src/core/runtime/activation.js @@ -123,7 +123,7 @@ export function createActivationContext({ runtime, plugin, paths, config, env }) initPresets: runtime.initPresets, backfills: runtime.backfills, backfillMaterializers: runtime.backfillMaterializers, - // @ref LLP 0024#apply-engine-is-kernel-surface [implements] — plugins reach the apply engine only through this narrow facade; absent outside the daemon + // @ref LLP 0025#apply-engine-is-kernel-surface [implements] — plugins reach the apply engine only through this narrow facade; absent outside the daemon ...(runtime.configControl ? { configControl: runtime.configControl } : {}), /** * @template T diff --git a/test/core/config-apply-deps.test.js b/test/core/config-apply-deps.test.js index 1ca98eb..ef22f54 100644 --- a/test/core/config-apply-deps.test.js +++ b/test/core/config-apply-deps.test.js @@ -17,7 +17,7 @@ import { getEntry, writeLock } from '../../src/core/plugin_install/lock.js' /** * Pin enforcement is the apply path's core security property — nothing - * may substitute code after the config was authored (LLP 0024 + * may substitute code after the config was authored (LLP 0025 * install-on-config). The apply-engine tests mock these deps away, so * the real decisions are exercised here against real fixtures: a * fixture bundled workspace, a lock-backed installed plugin, and a @@ -82,7 +82,7 @@ test('bundled plugin: matching version pin is satisfied without an install; hash const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) // The artifact_hash refers to a git release artifact that // legitimately differs from the npm-bundled tree — a garbage hash - // must not fail a bundled pin (LLP 0024 bundled-first-party). + // must not fail a bundled pin (LLP 0025 bundled-first-party). 
const result = await deps.installPinnedPlugins([ { name: '@hypaware/otel', version: '9.9.9', artifact_hash: 'f'.repeat(64) }, ]) diff --git a/test/core/config-apply.test.js b/test/core/config-apply.test.js index 8927bcb..1c6db5f 100644 --- a/test/core/config-apply.test.js +++ b/test/core/config-apply.test.js @@ -189,7 +189,7 @@ test('validation failure remembers the bad etag and leaves the config untouched' test('pinned plugins install before full validation, so a config can name a not-yet-installed plugin', async () => { // Catalog-backed validation only knows a plugin once it is installed; - // install-on-config breaks if validation runs first (LLP 0024 + // install-on-config breaks if validation runs first (LLP 0025 // install-on-config). The shape gate runs before install instead. const { stateRoot, configPath } = await makeFixture() const { control } = makeControl({ stateRoot, configPath })