diff --git a/collectivus-plugin-kernel-types.d.ts b/collectivus-plugin-kernel-types.d.ts index fbba741..02a2c77 100644 --- a/collectivus-plugin-kernel-types.d.ts +++ b/collectivus-plugin-kernel-types.d.ts @@ -323,10 +323,60 @@ export interface PluginActivationContext { * before appending to the cache. */ backfillMaterializers: BackfillMaterializerRegistry + /** + * Narrow facade over the kernel config apply engine (LLP 0023). Only + * present when the host process runs an apply engine (the daemon); + * absent in plain CLI boots, so transport plugins must treat it as + * optional and skip their pull loops when it is missing. The facade + * is the only channel a plugin has into config application — the + * kernel owns validation, install, persistence, restart, probation, + * and rollback. + */ + configControl?: ConfigControlFacade requireCapability<T>(name: CapabilityName, range?: SemverRange): T provideCapability<T>(name: CapabilityName, version: SemverVersion, value: T): void } +/** + * Plugin-facing surface of the kernel config apply engine. Handed to + * transport plugins (e.g. `@hypaware/central`) so they can deliver a + * downloaded config document and report poll liveness. Deliberately + * narrow: plugins never see probation state, slot paths, or rollback + * bookkeeping. + */ +export interface ConfigControlFacade { + /** + * Deliver a downloaded config document (parsed JSON) plus the ETag it + * was served under. The kernel validates, installs pinned plugins, + * persists, swaps, and requests a staged restart. Resolves before the + * restart happens; callers should treat `{ ok: true }` as "apply + * committed, restart pending". + */ + stage(document: unknown, etag: string): Promise<ConfigStageResult> + /** + * Report a successful authenticated config poll (200 or 304). Clears + * the post-apply probation window when one is active; a no-op + * otherwise. + */ + confirmPoll(): void + /** ETag of the *running* config, for `If-None-Match`. 
Undefined when the operative config was never applied from the server (e.g. seed). */ + runningEtag(): string | undefined +} + +export type ConfigStageResult = + | { ok: true, action: 'applied' | 'noop_same_etag' | 'skipped_bad_etag' } + | { ok: false, errorKind: ConfigApplyErrorKind, message: string } + +export type ConfigApplyErrorKind = + | 'config_invalid' + | 'plugin_install_failed' + | 'artifact_hash_mismatch' + | 'bundled_version_mismatch' + | 'document_too_large' + | 'apply_engine_not_ready' + | 'restart_pending' + | 'apply_io_error' + export interface PluginDeactivationContext { plugin: ActivePlugin log: PluginLogger @@ -405,6 +455,26 @@ export interface PluginConfigInstance { name: PluginName enabled?: boolean config?: JsonObject + /** + * Pinned plugin version. Set by centrally-served configs (LLP 0023): + * the apply engine refuses a config whose pins it cannot satisfy. + * For bundled first-party plugins the pin is checked strictly against + * the bundled version; for fetched plugins it selects the artifact. + */ + version?: SemverVersion + /** + * Pinned artifact content hash for fetched plugins. The apply engine + * verifies the fetched artifact against this before committing the + * install; a mismatch is an apply failure. Ignored (not checked) for + * plugins bundled with the running kernel. + */ + artifact_hash?: string + /** + * Optional explicit install source (raw source string accepted by the + * plugin installer). Defaults to the plugin name, which the resolver + * maps to its canonical git source. 
+ */ + source?: string } /** diff --git a/hypaware-core/plugins-workspace/central/index.js b/hypaware-core/plugins-workspace/central/index.js index 1200a7d..ba0916f 100644 --- a/hypaware-core/plugins-workspace/central/index.js +++ b/hypaware-core/plugins-workspace/central/index.js @@ -3,6 +3,7 @@ import path from 'node:path' import { validateCentralConfig } from './src/config.js' +import { createConfigPullLoop } from './src/config_client.js' import { IdentityClient } from './src/identity_client.js' import { createForwardSink } from './src/sink.js' @@ -26,6 +27,10 @@ import { createForwardSink } from './src/sink.js' export async function activate(ctx) { const query = ctx.query const storage = ctx.storage + // Present only in daemon mode. Without an apply engine there is no + // one to hand a pulled document to, so the pull loop stays off (CLI + // boots must not fire config polls as a side effect of `hyp status`). + const configControl = ctx.configControl ctx.sinks.register({ name: 'forward', @@ -55,13 +60,35 @@ export async function activate(ctx) { hyp_identity_source: source, }) - return createForwardSink({ + const sink = createForwardSink({ config, identityClient, query, storage, log: sinkCtx.log, }) + + if (!configControl) return sink + + // @ref LLP 0025#config-pull-loop [implements] — pull immediately on bootstrap success, then on the steady timer + const pullLoop = createConfigPullLoop({ + centralUrl: config.url, + identityClient, + configControl, + ...(config.poll_interval_seconds !== undefined + ? 
{ pollIntervalSeconds: config.poll_interval_seconds } + : {}), + log: sinkCtx.log, + }) + pullLoop.start() + + return { + ...sink, + async close() { + await pullLoop.stop() + await sink.close() + }, + } }, }) } diff --git a/hypaware-core/plugins-workspace/central/proto.md b/hypaware-core/plugins-workspace/central/proto.md index 1ab9395..d714a5a 100644 --- a/hypaware-core/plugins-workspace/central/proto.md +++ b/hypaware-core/plugins-workspace/central/proto.md @@ -30,8 +30,9 @@ atomic tmp+rename). ### POST `/v1/identity/bootstrap` Exchange an operator-issued bootstrap token for a long-lived JWT. -Bootstrap tokens are single-use; a successful bootstrap response also -invalidates the bootstrap token server-side. +Bootstrap tokens are **policy tokens** (server LLP 0008): multi-use, so +one token can be deployed fleet-wide via MDM, and every token references +a config at mint (see "Config pull" below). Request: @@ -78,6 +79,12 @@ Headers (request): - `Authorization: Bearer ` - `If-None-Match: ` (optional) +`If-None-Match` reflects the **running** config, never a +downloaded-but-not-yet-applied one. The server reads this header to +track fleet convergence, so a gateway mid-install/mid-apply keeps +presenting its old etag until the new config has taken effect +(LLP 0025). + Response 200: ```json @@ -89,15 +96,23 @@ Response 200: } ``` +The body is a full HypAware v2 config and replaces the gateway's +operative config wholesale. Plugin entries are pinned by **version + +artifact content hash**; the gateway verifies the artifact hash on +install and treats a mismatch as an apply failure (LLP 0025). + `ETag: ` accompanies every 200 response. Clients persist the etag -in a sidecar (`/config-etag.json`) so a restart -short-circuits to 304 instead of re-pulling and re-validating. 
+of the *running* config in kernel-managed state (it transitions +atomically with the operative config on apply and rollback — LLP 0025) +so a restart short-circuits to 304 instead of re-pulling and +re-validating. Response 304: no body. The gateway keeps its current config. -Response 404: the operator has not registered a config for this -gateway. Gateways back off to 5 minutes and log once until the state -clears. +Response 404: legacy-only branch — every token now references a config +at mint (server LLP 0009), so gateways enrolled under that flow always +resolve. Kept for conformance against older servers: back off to +5 minutes and log once until the state clears. Response 401: see "Refresh window" above. diff --git a/hypaware-core/plugins-workspace/central/src/config.js b/hypaware-core/plugins-workspace/central/src/config.js index e39043e..a9ab971 100644 --- a/hypaware-core/plugins-workspace/central/src/config.js +++ b/hypaware-core/plugins-workspace/central/src/config.js @@ -58,10 +58,6 @@ export function validateCentralConfig(value) { } } - if (cfg.config_etag_path !== undefined && typeof cfg.config_etag_path !== 'string') { - return invalid('central.config_etag_path must be a string when set') - } - return { ok: true, config: /** @type {CentralSinkConfig} */ (/** @type {unknown} */ (cfg)) } } diff --git a/hypaware-core/plugins-workspace/central/src/config_client.js b/hypaware-core/plugins-workspace/central/src/config_client.js new file mode 100644 index 0000000..88869e8 --- /dev/null +++ b/hypaware-core/plugins-workspace/central/src/config_client.js @@ -0,0 +1,432 @@ +// @ts-check + +/** + * @import { ConfigControlFacade, PluginLogger } from '../../../../collectivus-plugin-kernel-types.d.ts' + * @import { IdentityClient } from './identity_client.js' + */ + +/** + * Default pull cadence when `poll_interval_seconds` is not configured. 
+ * Mirrors the kernel apply engine's `DEFAULT_POLL_INTERVAL_SECONDS` + * (it sizes the probation window from the same number). + */ +export const DEFAULT_POLL_INTERVAL_SECONDS = 300 + +/** + * Transport-level cap on a pulled config body. Mirrors the kernel's + * `MAX_CONFIG_DOCUMENT_BYTES` — the apply engine enforces it again, + * but an oversized body is dropped before it is buffered whole: an + * oversized `Content-Length` is rejected without reading, and a + * chunked body is read through a byte counter that cancels the stream + * the moment it crosses the cap. + */ +export const MAX_CONFIG_DOCUMENT_BYTES = 1024 * 1024 + +/** + * Hard deadline (seconds) on a single poll, covering the request and + * the body read. Bounds how long `stop()` can wait on an in-flight + * poll even when the caller's `fetchFn` ignores abort signals. + */ +export const DEFAULT_REQUEST_TIMEOUT_SECONDS = 30 + +/** + * How long `stop()` lets an in-flight poll drain before aborting it + * (seconds). A healthy poll finishes in this window — a mid-flight + * apply should commit rather than be cancelled — while a stalled + * request is cut off so shutdown stays prompt. + */ +export const DEFAULT_STOP_GRACE_SECONDS = 1 + +/** Escalating backoff ladder (seconds) for 429/503/transport failures, per proto.md: indexed by consecutive failures, capped at the last rung. */ +const RETRY_BACKOFF_SECONDS = [30, 60, 120, 300] + +/** Polite backoff (seconds) for the legacy 404 branch, per proto.md. */ +const LEGACY_404_BACKOFF_SECONDS = 300 + +/** + * The config pull loop: poll `GET /v1/config` with `If-None-Match` set + * to the *running* config's etag, confirm successful polls to the + * kernel (clearing post-apply probation), and hand 200 bodies to the + * apply facade. Transport only — validation, persistence, restart, + * probation, and rollback are all kernel-owned behind `configControl`. 
+ * + * The loop is a self-rescheduling timeout rather than an interval so + * backoff (404 / 429 / 503 / transport errors) can stretch a single + * gap without skewing the steady cadence. Identity refresh needs no + * timer of its own: every poll goes through `getCurrentJwt()`, which + * eagerly refreshes inside the 24h window, and the poll cadence is + * capped at one hour. + * + * Timers are deliberately *not* unref'd: in seed-config mode (central + * sink only, no sources) this loop is the daemon's only live handle, + * and that polling idle state is a legitimate steady state, not an + * exit condition. + * + * Every poll runs under its own `AbortController` with a hard + * deadline: a stalled config GET must not be able to wedge `stop()` — + * and through it daemon shutdown or a staged restart — so a poll that + * outlives the deadline is aborted, and `stop()` aborts an in-flight + * poll after a short drain grace. + * + * @param {{ + * centralUrl: string, + * identityClient: IdentityClient, + * configControl: ConfigControlFacade, + * pollIntervalSeconds?: number, + * requestTimeoutSeconds?: number, + * stopGraceSeconds?: number, + * log: PluginLogger, + * fetchFn?: typeof fetch, + * }} args + * @ref LLP 0025#config-pull-loop [implements] — immediate pull on bootstrap success, then a steady plugin-internal timer + */ +export function createConfigPullLoop(args) { + const { centralUrl, identityClient, configControl, log } = args + const fetchFn = args.fetchFn ?? fetch + const pollIntervalSeconds = args.pollIntervalSeconds ?? DEFAULT_POLL_INTERVAL_SECONDS + const requestTimeoutSeconds = args.requestTimeoutSeconds ?? DEFAULT_REQUEST_TIMEOUT_SECONDS + const stopGraceSeconds = args.stopGraceSeconds ?? 
DEFAULT_STOP_GRACE_SECONDS + + /** @type {NodeJS.Timeout | null} */ + let timer = null + let stopped = false + let consecutiveFailures = 0 + /** @type {Promise<void> | null} */ + let inFlight = null + /** @type {AbortController | null} */ + let activeController = null + + /** + * Arm the next poll. The delay is clamped to the largest value + * `setTimeout` honours (2^31 - 1 ms, ~24.8 days): Node coerces anything + * larger to 1 ms, so an oversized server `Retry-After` would otherwise + * turn into an immediate re-poll instead of a long wait. + * + * @param {number} delaySeconds + */ + function schedule(delaySeconds) { + if (stopped) return + timer = setTimeout(() => { + timer = null + inFlight = pollOnce().finally(() => { inFlight = null }) + }, Math.min(delaySeconds * 1000, 2 ** 31 - 1)) + } + + /** @returns {Promise<void>} */ + async function pollOnce() { + const controller = new AbortController() + activeController = controller + // Not unref'd (matching the loop's no-unref policy): the deadline + // must be able to fire while a wedged poll is the only live + // handle, and it is cleared as soon as the poll settles. + const deadline = setTimeout( + () => controller.abort(new Error(`config poll exceeded ${requestTimeoutSeconds}s`)), + requestTimeoutSeconds * 1000 + ) + let nextDelay = pollIntervalSeconds + try { + const outcome = await pull(controller.signal) + if (outcome === 'retry_backoff') { + nextDelay = RETRY_BACKOFF_SECONDS[ + Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 + ] ?? RETRY_BACKOFF_SECONDS[RETRY_BACKOFF_SECONDS.length - 1] + } else if (outcome === 'legacy_404') { + nextDelay = Math.max(LEGACY_404_BACKOFF_SECONDS, pollIntervalSeconds) + } else if (typeof outcome === 'number') { + // Server-provided Retry-After. Floor it at one second: a literal + // `0` (or an HTTP-date already in the past) must not collapse the + // loop into back-to-back polls against a throttling server. + nextDelay = Math.max(outcome, 1) + } + } catch (err) { + // An abort from stop() is the shutdown path, not a poll failure. + if (!(stopped && controller.signal.aborted)) { + consecutiveFailures += 1 + const message = err instanceof Error ? 
err.message : String(err) + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_error', + consecutive_failures: consecutiveFailures, + message, + }) + nextDelay = RETRY_BACKOFF_SECONDS[ + Math.min(consecutiveFailures, RETRY_BACKOFF_SECONDS.length) - 1 + ] + } + } finally { + clearTimeout(deadline) + activeController = null + } + schedule(nextDelay) + } + + /** + * One poll. Returns `'ok'`, `'retry_backoff'`, `'legacy_404'`, or an + * explicit next-delay in seconds (server-provided `Retry-After`). + * + * @param {AbortSignal} signal + * @returns {Promise<'ok' | 'retry_backoff' | 'legacy_404' | number>} + */ + async function pull(signal) { + const url = joinUrl(centralUrl, '/v1/config') + const runningEtag = configControl.runningEtag() + + let response = await doFetch(url, runningEtag, signal) + if (response.status === 401) { + // One-shot refresh + retry; a second 401 escalates as an auth + // failure (proto.md "Refresh window"). The refresh is raced + // against `signal` so a stalled identity endpoint is bounded by + // the poll deadline and by stop(), exactly like the config GET. + await abortable(Promise.resolve(identityClient.refresh()), signal) + response = await doFetch(url, runningEtag, signal) + if (response.status === 401) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_poll_auth_failed', + http_status: 401, + }) + return 'retry_backoff' + } + } + + if (response.status === 304) { + consecutiveFailures = 0 + configControl.confirmPoll() + log.info('central.config.poll', { + hyp_operation: 'config.pull', + http_status: 304, + status: 'ok', + }) + return 'ok' + } + + if (response.status === 200) { + const etag = response.headers.get('etag') + const read = await readBodyCapped(response, MAX_CONFIG_DOCUMENT_BYTES, signal) + if (!read.ok) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_document_too_large', + http_status: 200, + body_bytes: read.bytesRead, + }) + return 'retry_backoff' + } + const body = read.body + if (!etag) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_missing_etag', + 
http_status: 200, + }) + return 'retry_backoff' + } + /** @type {unknown} */ + let document + try { + document = JSON.parse(body) + } catch (err) { + consecutiveFailures += 1 + log.error('central.config.poll_failed', { + error_kind: 'config_invalid_json', + http_status: 200, + message: err instanceof Error ? err.message : String(err), + }) + return 'retry_backoff' + } + consecutiveFailures = 0 + // The 200 itself is a successful authenticated poll: it clears + // any active probation before the new revision stages its own. + // A probation-clearing poll returning a newer revision chains + // into the next apply by design. + configControl.confirmPoll() + const staged = await configControl.stage(document, etag) + log.info('central.config.poll', { + hyp_operation: 'config.pull', + http_status: 200, + config_etag: etag, + apply_action: staged.ok ? staged.action : 'failed', + ...(staged.ok ? {} : { error_kind: staged.errorKind }), + status: staged.ok ? 'ok' : 'failed', + }) + return 'ok' + } + + if (response.status === 404) { + // Legacy-only branch: servers that mint tokens without a config. + if (consecutiveFailures === 0) { + log.warn('central.config.poll', { + hyp_operation: 'config.pull', + http_status: 404, + status: 'skipped', + hyp_reason: 'no_config_registered_legacy', + }) + } + consecutiveFailures += 1 + return 'legacy_404' + } + + if (response.status === 429 || response.status === 503) { + consecutiveFailures += 1 + const retryAfter = parseRetryAfter(response.headers.get('retry-after')) + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_throttled', + http_status: response.status, + ...(retryAfter !== undefined ? { retry_after_seconds: retryAfter } : {}), + }) + return retryAfter !== undefined ? 
retryAfter : 'retry_backoff' + } + + consecutiveFailures += 1 + log.warn('central.config.poll_failed', { + error_kind: 'config_poll_http_error', + http_status: response.status, + }) + return 'retry_backoff' + } + + /** + * Issue one authenticated `GET`. Both the JWT lookup (which can + * trigger an identity refresh) and the request itself are raced + * against `signal`, so neither can outlive the poll deadline or + * stop(). + * + * @param {string} url + * @param {string | undefined} runningEtag + * @param {AbortSignal} signal + */ + async function doFetch(url, runningEtag, signal) { + const jwt = await abortable(Promise.resolve(identityClient.getCurrentJwt()), signal) + return abortable( + fetchFn(url, { + method: 'GET', + signal, + headers: { + authorization: `Bearer ${jwt}`, + // If-None-Match always reflects the *running* config — the + // server reads it as the fleet-convergence signal, so a + // gateway mid-apply keeps presenting its old etag. + ...(runningEtag ? { 'if-none-match': runningEtag } : {}), + }, + }), + signal + ) + } + + return { + /** Pull immediately, then settle into the steady cadence. */ + start() { + if (stopped || timer || inFlight) return + inFlight = pollOnce().finally(() => { inFlight = null }) + }, + /** + * Stop polling. Lets an in-flight poll drain for a short grace + * (a mid-flight apply should commit, not be cancelled), then + * aborts it — so the wait is bounded even against a stalled + * server or a `fetchFn` that ignores abort signals. + */ + async stop() { + stopped = true + if (timer) { + clearTimeout(timer) + timer = null + } + if (inFlight) { + // Not unref'd: against a fetch wedged on the last live handle, + // an unref'd grace timer would let the event loop drain before + // it ever fired, leaving stop() hanging on the poll forever. 
+ const grace = setTimeout(() => { + activeController?.abort(new Error('config pull loop stopped')) + }, stopGraceSeconds * 1000) + try { + await inFlight + } finally { + clearTimeout(grace) + } + } + }, + } +} + +/** + * Read a response body under a hard byte cap without ever buffering + * past it: an oversized `Content-Length` is rejected before any read, + * and a chunked body is streamed through a byte counter that cancels + * the moment it crosses the cap. Responses without a readable stream + * (e.g. test doubles) fall back to `text()` with a post-hoc check. + * + * @param {Response} response + * @param {number} maxBytes + * @param {AbortSignal} signal + * @returns {Promise<{ ok: true, body: string } | { ok: false, bytesRead: number }>} + */ +async function readBodyCapped(response, maxBytes, signal) { + const contentLength = Number(response.headers.get('content-length')) + if (Number.isFinite(contentLength) && contentLength > maxBytes) { + if (response.body) await response.body.cancel().catch(() => {}) + return { ok: false, bytesRead: contentLength } + } + if (!response.body) { + const text = await abortable(response.text(), signal) + const bytes = Buffer.byteLength(text, 'utf8') + return bytes > maxBytes ? { ok: false, bytesRead: bytes } : { ok: true, body: text } + } + const reader = response.body.getReader() + /** @type {Uint8Array[]} */ + const chunks = [] + let total = 0 + for (;;) { + const { done, value } = await abortable(reader.read(), signal) + if (done) break + total += value.byteLength + if (total > maxBytes) { + reader.cancel().catch(() => {}) + return { ok: false, bytesRead: total } + } + chunks.push(value) + } + return { ok: true, body: Buffer.concat(chunks).toString('utf8') } +} + +/** + * Await `promise`, but reject as soon as `signal` aborts — even when + * the underlying promise never settles. A misbehaving `fetchFn` (or a + * server that stalls mid-body) must not be able to wedge `stop()`, + * and through it daemon shutdown. 
+ * + * The input promise is always observed, including when `signal` is + * already aborted on entry: dropping it unobserved would turn its + * eventual rejection (e.g. a `fetchFn` rejecting on the aborted + * signal) into a process-level unhandled rejection. + * + * @template T + * @param {Promise<T>} promise + * @param {AbortSignal} signal + * @returns {Promise<T>} + */ +function abortable(promise, signal) { + // Already aborted: reject with the abort reason right away, but race + // against `promise` so a handler is attached to it as well. + if (signal.aborted) return Promise.race([Promise.reject(abortReason(signal)), promise]) + return new Promise((resolve, reject) => { + const onAbort = () => reject(abortReason(signal)) + signal.addEventListener('abort', onAbort, { once: true }) + promise.then( + (value) => { + signal.removeEventListener('abort', onAbort) + resolve(value) + }, + (err) => { + signal.removeEventListener('abort', onAbort) + reject(err) + } + ) + }) +} + +/** @param {AbortSignal} signal */ +function abortReason(signal) { + return signal.reason instanceof Error ? signal.reason : new Error(String(signal.reason ?? 'aborted')) +} + +/** + * Parse a `Retry-After` header into whole seconds: delta-seconds or an + * HTTP-date, anything unparseable → `undefined` (callers fall back to + * the backoff ladder — a garbage header must not produce a zero-delay + * poll loop). Exported for direct unit tests. + * + * @param {string | null} value + * @returns {number | undefined} + */ +export function parseRetryAfter(value) { + if (!value) return undefined + const seconds = Number.parseInt(value, 10) + if (Number.isInteger(seconds) && seconds >= 0) return seconds + const date = Date.parse(value) + if (!Number.isNaN(date)) return Math.max(0, Math.round((date - Date.now()) / 1000)) + return undefined +} + +/** + * @param {string} base + * @param {string} suffix + */ +function joinUrl(base, suffix) { + const baseWithSlash = base.endsWith('/') ? 
base : `${base}/` + return new URL(suffix.replace(/^\//, ''), baseWithSlash).toString() +} diff --git a/hypaware-core/plugins-workspace/central/src/sink.js b/hypaware-core/plugins-workspace/central/src/sink.js index 1d0b560..451acad 100644 --- a/hypaware-core/plugins-workspace/central/src/sink.js +++ b/hypaware-core/plugins-workspace/central/src/sink.js @@ -99,8 +99,9 @@ export function createForwardSink(args) { }, async close() { - // No background loops to stop in the V1 forward sink; identity - // refresh and config pull live on their own timers when wired in. + // No background loops to stop here: the config pull loop wraps + // this sink's close() in index.js, and identity refresh is lazy + // (every authenticated call refreshes inside the 24h window). }, } } diff --git a/hypaware-core/plugins-workspace/central/src/types.d.ts b/hypaware-core/plugins-workspace/central/src/types.d.ts index 482e507..42706b7 100644 --- a/hypaware-core/plugins-workspace/central/src/types.d.ts +++ b/hypaware-core/plugins-workspace/central/src/types.d.ts @@ -47,17 +47,12 @@ export interface CentralSinkConfig { persisted_path?: string } /** - * Override the etag sidecar path used by the config-pull loop. Defaults - * to `/config-etag.json`. The loop itself is opt-in. + * Poll cadence (seconds) for the config-pull loop. Default 300s + * (5 minutes) — 304s are cheap, and propagation latency equals this + * cadence (no push channel in V1). The running config's etag is + * kernel-managed (LLP 0025); the plugin reads it through the + * `configControl` facade, so there is no plugin-side sidecar path. */ - config_etag_path?: string - /** Poll cadence (seconds) for the config-pull loop. Default 30s. */ poll_interval_seconds?: number } -/** Payload of the `config-changed` event emitted by `ConfigClient`. 
*/ -export interface ConfigChangedEvent { - newConfig: unknown - etag: string - fetchedAt: string -} diff --git a/hypaware-core/smoke/flows/join_flow_remote_config.js b/hypaware-core/smoke/flows/join_flow_remote_config.js new file mode 100644 index 0000000..a2195f3 --- /dev/null +++ b/hypaware-core/smoke/flows/join_flow_remote_config.js @@ -0,0 +1,409 @@ +// @ts-check + +import fs from 'node:fs/promises' +import http from 'node:http' +import path from 'node:path' +import process from 'node:process' + +import { installObservability } from '../../../src/core/observability/index.js' +import { defaultConfigPath } from '../../../src/core/config/schema.js' +import { readConfigControlStatus } from '../../../src/core/config/apply.js' +import { DAEMON_RESTART_EXIT_CODE, runDaemon } from '../../../src/core/daemon/runtime.js' +import { dispatch } from '../../../src/core/cli/dispatch.js' + +/** + * @import { AddressInfo } from 'node:net' + */ + +/** + * Join-flow smoke (LLP 0025): drives the full remote-config lifecycle + * against a stub central server — + * + * join (seed write) → seed boot → identity bootstrap → config pull + * (200) → kernel apply → staged restart → relaunch on the served + * config → probation cleared by the first successful poll (304). + * + * The daemon runs in-process and the smoke plays the role of the + * foreground invoker: it relaunches `runDaemon` when `handle.done` + * resolves with the restart exit code, exactly as a dev shell or the + * service manager would. + * + * Asserted signals (Log-Driven Development): + * - external: operative config replaced wholesale (token retired), + * seed preserved as the rollback slot, otlp source running on the + * served config, `If-None-Match` convergence transitions on the + * stub server. + * - internal: `config.apply` span (status=ok), `config.applied` and + * `config.probation_cleared` log rows, `join.run` span. 
+ * + * @param {{ harness: any, expect: any }} args + * @ref LLP 0025#the-join-sequence [tests] — seed → bootstrap → pull → apply → restart → operational, end to end against a stub server + */ +export async function run({ harness, expect }) { + const obs = installObservability() + if (!obs.tracer.provider) { + throw new Error( + 'join_flow_remote_config: tracer provider not installed — expected HYP_DEV_TELEMETRY=1' + ) + } + + process.env.HYP_HOME = harness.hypHome + delete process.env.HYP_CONFIG + const configPath = defaultConfigPath(harness.hypHome) + const stateRoot = path.join(harness.hypHome, 'hypaware') + + // ----- smoke_step: stub_server_up ----- + const server = await startStubCentralServer() + try { + // The served revision: a full v2 config. The otel pin exercises + // the bundled-plugin strict version check on the apply path. + // The manifest is addressed as a URL relative to this module and + // handed to fs as-is: `URL#pathname` stays percent-encoded (and is + // malformed on Windows), so it breaks for checkouts whose path + // contains spaces or other escaped characters. + const otelManifest = JSON.parse(await fs.readFile( + new URL('../../plugins-workspace/otel/hypaware.plugin.json', import.meta.url), + 'utf8' + )) + server.setConfig({ + version: 2, + plugins: [ + { name: '@hypaware/central' }, + { name: '@hypaware/otel', version: otelManifest.version, config: { listen_host: '127.0.0.1', listen_port: 0 } }, + ], + sinks: { + central: { + plugin: '@hypaware/central', + config: { + url: server.baseUrl, + identity: {}, + schedule: '0 * * * *', + poll_interval_seconds: 5, + }, + }, + }, + query: { cache: { retention: { default_days: 30 } } }, + }, 'rev-1') + + // ----- smoke_step: join (write seed + skip daemon install) ----- + const joinOut = makeBuf() + const joinErr = makeBuf() + const joinCode = await dispatch( + ['join', server.baseUrl, 'policy-token-smoke', '--no-daemon'], + { + stdout: joinOut, + stderr: joinErr, + env: { ...process.env, HYP_HOME: harness.hypHome }, + } + ) + expect.that( + `join: exits 0 (stderr: ${joinErr.text()})`, + joinCode, + (v) => v === 0 + ) + const seed = JSON.parse(await fs.readFile(configPath, 'utf8')) + 
expect.that( + 'join: seed config carries the policy token', + seed.sinks?.central?.config?.identity?.bootstrap_token, + (v) => v === 'policy-token-smoke' + ) + + // ----- smoke_step: seed_boot (bootstrap → pull → apply → restart) ----- + const first = await runDaemon({ + hypHome: harness.hypHome, + env: process.env, + runId: harness.devRunId, + tickIntervalMs: 0, + installSignalHandlers: false, + }) + const firstExit = await withTimeout( + first.done, + 30_000, + 'seed boot did not request a staged restart within 30s' + ) + expect.that( + `seed boot: daemon exited with the restart code (got ${firstExit})`, + firstExit, + (v) => v === DAEMON_RESTART_EXIT_CODE + ) + + // The apply replaced the operative config wholesale and preserved + // the seed as the rollback slot. + const operative = JSON.parse(await fs.readFile(configPath, 'utf8')) + expect.that( + 'apply: operative config no longer carries the policy token', + operative.sinks?.central?.config?.identity?.bootstrap_token, + (v) => v === undefined + ) + expect.that( + 'apply: operative config names the otel plugin from the served revision', + operative.plugins?.some((/** @type {any} */ p) => p.name === '@hypaware/otel'), + (v) => v === true + ) + const slotA = JSON.parse( + await fs.readFile(path.join(stateRoot, 'config-control', 'config.a.json'), 'utf8') + ) + expect.that( + 'apply: the seed survives in the rollback slot', + slotA.sinks?.central?.config?.identity?.bootstrap_token, + (v) => v === 'policy-token-smoke' + ) + const midStatus = readConfigControlStatus({ stateRoot, configPath }) + expect.that( + 'apply: probation marker armed for the served revision', + midStatus.probation?.etag, + (v) => v === 'rev-1' + ) + + // ----- smoke_step: relaunch (service-manager role) ----- + const second = await runDaemon({ + hypHome: harness.hypHome, + env: process.env, + runId: harness.devRunId, + tickIntervalMs: 0, + installSignalHandlers: false, + }) + try { + // Probation clears on the first successful poll (304 
here). + await waitFor( + () => readConfigControlStatus({ stateRoot, configPath }).probation === null, + 10_000, + 'probation did not clear within 10s of relaunch' + ) + const cleared = readConfigControlStatus({ stateRoot, configPath }) + expect.that( + 'probation: cleared with the served revision running', + cleared.runningEtag, + (v) => v === 'rev-1' + ) + expect.that( + 'probation: no rollback was recorded', + cleared.lastRollback, + (v) => v === null + ) + + const snapshot = second.snapshot() + expect.that( + `relaunch: daemon state is healthy (got ${snapshot.state})`, + snapshot.state, + (v) => v === 'healthy' + ) + expect.that( + 'relaunch: otlp source from the served config is started', + snapshot.sources.find((/** @type {any} */ s) => s.name === 'otlp')?.state, + (v) => v === 'started' + ) + + // Convergence semantics on the wire: the first GET presented no + // etag (seed has none), the post-apply GET presented rev-1 and + // was answered 304. + const configGets = server.requests.filter((r) => r.path === '/v1/config') + expect.that( + `stub server: at least two config pulls observed (got ${configGets.length})`, + configGets.length, + (v) => typeof v === 'number' && v >= 2 + ) + expect.that( + 'stub server: the seed-boot pull presented no If-None-Match', + configGets[0]?.ifNoneMatch, + (v) => v === undefined + ) + expect.that( + 'stub server: a post-apply pull presented the running etag and converged', + configGets.some((r) => r.ifNoneMatch === 'rev-1' && r.responseStatus === 304), + (v) => v === true + ) + expect.that( + 'stub server: exactly one bootstrap happened (policy token not re-spent)', + server.requests.filter((r) => r.path === '/v1/identity/bootstrap').length, + (v) => v === 1 + ) + } finally { + await second.stop() + await second.done + } + } finally { + await server.close() + } + + await obs.shutdown() + + // ----- smoke_step: telemetry ----- + const traces = await expect.traces() + const applySpans = traces.filter((/** @type {any} */ t) => 
t.name === 'config.apply') + expect.that( + 'traces: a config.apply span was emitted with status=ok and apply_action=applied', + applySpans.some((/** @type {any} */ s) => + s.attributes?.status === 'ok' && s.attributes?.apply_action === 'applied' + ), + (v) => v === true + ) + expect.that( + 'traces: a join.run span was emitted', + traces.some((/** @type {any} */ t) => t.name === 'join.run'), + (v) => v === true + ) + + const logs = await expect.logs() + expect.that( + 'logs: config.applied recorded for rev-1', + logs.some((/** @type {any} */ l) => + l.body === 'config.applied' && l.attributes?.config_etag === 'rev-1' + ), + (v) => v === true + ) + expect.that( + 'logs: config.probation_cleared recorded for rev-1', + logs.some((/** @type {any} */ l) => + l.body === 'config.probation_cleared' && l.attributes?.config_etag === 'rev-1' + ), + (v) => v === true + ) + expect.that( + 'logs: central.config.poll observed both a 200 and a 304', + [200, 304].every((status) => + logs.some((/** @type {any} */ l) => + l.body === 'central.config.poll' && l.attributes?.http_status === status + ) + ), + (v) => v === true + ) +} + +/* ---------- stub central server ---------- */ + +/** + * Minimal `@hypaware/server` stand-in: identity bootstrap/refresh, + * etag-aware config serving, and an ingest acceptor. Every request is + * recorded for convergence assertions. + * + * Listens on port 0 of 127.0.0.1 and resolves, once listening, to + * `{ baseUrl, requests, setConfig, close }`. `GET /v1/config` answers 404 + * while no config document is set, 304 when `If-None-Match` equals the + * current etag, and 200 with the document otherwise. + */ +async function startStubCentralServer() { + /** @type {Array<{ method: string, path: string, ifNoneMatch?: string, responseStatus: number }>} */ + const requests = [] + /** @type {unknown} */ + let configDoc = null + /** @type {string} */ + let configEtag = '' + + const jwt = buildFakeJwt('gateway-smoke-1') + // Expiry reported alongside the JWT: 30 days from now, in epoch seconds. + const expiresAt = Math.floor(Date.now() / 1000) + 30 * 24 * 60 * 60 + + const server = http.createServer((req, res) => { + const url = new URL(req.url ??
'/', 'http://localhost') + /** + * Records the request (with the status being returned) in `requests`, + * then writes the response. + * @param {number} status + * @param {Record<string, string>} headers + * @param {string} [body] + */ + function reply(status, headers, body) { + requests.push({ + method: req.method ?? '', + path: url.pathname, + ...(req.headers['if-none-match'] + ? { ifNoneMatch: String(req.headers['if-none-match']) } + : {}), + responseStatus: status, + }) + res.writeHead(status, headers) + res.end(body ?? '') + } + + if (req.method === 'POST' && (url.pathname === '/v1/identity/bootstrap' || url.pathname === '/v1/identity/refresh')) { + reply(200, { 'content-type': 'application/json' }, JSON.stringify({ jwt, expires_at: expiresAt })) + return + } + if (req.method === 'GET' && url.pathname === '/v1/config') { + if (!configDoc) { + reply(404, { 'content-type': 'application/json' }, JSON.stringify({ error: 'no_config' })) + return + } + if (req.headers['if-none-match'] === configEtag) { + reply(304, { etag: configEtag }) + return + } + reply( + 200, + { 'content-type': 'application/json', etag: configEtag }, + JSON.stringify(configDoc) + ) + return + } + // Ingest posts are acknowledged; this handler never reads or stores the body. + if (req.method === 'POST' && url.pathname.startsWith('/v1/ingest/')) { + reply(202, {}) + return + } + reply(404, { 'content-type': 'application/json' }, JSON.stringify({ error: 'not_found' })) + }) + + await new Promise((resolve) => server.listen(0, '127.0.0.1', () => resolve(undefined))) + const address = /** @type {AddressInfo} */ (server.address()) + + return { + baseUrl: `http://127.0.0.1:${address.port}`, + requests, + /** @param {unknown} doc @param {string} etag */ + setConfig(doc, etag) { + configDoc = doc + configEtag = etag + }, + close() { + return new Promise((resolve) => server.close(() => resolve(undefined))) + }, + } +} + +/** + * Unsigned JWT with the `sub` claim the identity client decodes. The + * gateway never verifies signatures (it trusts TLS), so a fake + * signature is wire-faithful enough for the smoke.
+ * + * @param {string} sub + * @returns {string} base64url JSON header, base64url `{ sub }` payload and the literal signature `smoke`, joined by dots + */ +function buildFakeJwt(sub) { + /** @param {object} obj */ + const b64 = (obj) => Buffer.from(JSON.stringify(obj)).toString('base64url') + return `${b64({ alg: 'none', typ: 'JWT' })}.${b64({ sub })}.smoke` +} + +/* ---------- helpers ---------- */ + +/** + * In-memory string sink: `write` appends the stringified chunk and returns + * true; `text` returns everything written so far. + */ +function makeBuf() { + let value = '' + return { + /** @param {string} chunk */ + write(chunk) { + value += String(chunk) + return true + }, + text() { + return value + }, + } +} + +/** + * Races `promise` against an `ms`-millisecond timer. If the timer fires + * first the result rejects with `join_flow_remote_config: ${message}`; the + * timer is cleared once `promise` settles. + * + * @template T + * @param {Promise<T>} promise + * @param {number} ms + * @param {string} message + * @returns {Promise<T>} + */ +function withTimeout(promise, ms, message) { + /** @type {NodeJS.Timeout} */ + // Assigned synchronously by the executor below, before the `finally` callback can run. + let timer + return Promise.race([ + promise.finally(() => clearTimeout(timer)), + new Promise((_resolve, reject) => { + timer = setTimeout(() => reject(new Error(`join_flow_remote_config: ${message}`)), ms) + }), + ]) +} + +/** + * Polls `predicate` every 50 ms until it returns truthy, throwing + * `join_flow_remote_config: ${message}` once `ms` milliseconds have elapsed. + * NOTE(review): the predicate is not evaluated again after the final sleep, + * so a condition that turns true during that last interval (or any call with + * `ms` <= 0) still throws. + * + * @param {() => boolean} predicate + * @param {number} ms + * @param {string} message + */ +async function waitFor(predicate, ms, message) { + const deadline = Date.now() + ms + while (Date.now() < deadline) { + if (predicate()) return + await new Promise((resolve) => setTimeout(resolve, 50)) + } + throw new Error(`join_flow_remote_config: ${message}`) +} diff --git a/llp/0000-hypaware.explainer.md b/llp/0000-hypaware.explainer.md index daf0138..a249b69 100644 --- a/llp/0000-hypaware.explainer.md +++ b/llp/0000-hypaware.explainer.md @@ -72,6 +72,7 @@ plugin that registers a dataset gets query and formatting for free.
| Observability & self-instrumentation | [0021](./0021-observability.spec.md) | Spec | | Iceberg export partitioning | [0022](./0022-iceberg-export-partitioning.spec.md) | Spec | | Context-graph T0 projection | [0023](./0023-context-graph-projection.decision.md) | Decision | +| Remote config & join flow | [0025](./0025-remote-config-join-flow.spec.md) | Spec | ## Where to start diff --git a/llp/0003-core-vs-plugin-surface.spec.md b/llp/0003-core-vs-plugin-surface.spec.md index fbbcc10..8d8a5cf 100644 --- a/llp/0003-core-vs-plugin-surface.spec.md +++ b/llp/0003-core-vs-plugin-surface.spec.md @@ -28,6 +28,11 @@ copy-pasted into every plugin, it belongs in core. - the Iceberg-backed cache/storage implementation and freshness checks - result formatting (table / json / jsonl / markdown) - managed state directories, lock files, permission prompts +- the **config apply engine** — staging a replacement config: validate, + install pinned plugins, persist last-known-good, swap, staged restart, + rollback bookkeeping. Exposed to plugins as a narrow context facade; the + document's *transport* (e.g. `@hypaware/central`'s pull loop) is plugin + domain. See [LLP 0025](./0025-remote-config-join-flow.spec.md#apply-engine-is-kernel-surface). ## Intrinsic, not plugin-provided @@ -50,6 +55,12 @@ They are therefore promoted to a neutral core home re-exported from `src/core/index.js`, not buried in the cache — the cache is one consumer, not the owner. +"Query is intrinsic" means the **SQL/dataset surface** specifically: the +dataset registry, SQL execution, cursors, freshness, and formatting. Other +query modalities (e.g. vector similarity search) are **plugin capabilities** +that build on the intrinsic surface, not kernel surface — decided 2026-06-12 +when scoping `@hypaware/vector-search`. 
+ ## Plugins own Domain behavior only, expressed through what they `require`, `provide`, and diff --git a/llp/0011-setup-and-onboarding.decision.md b/llp/0011-setup-and-onboarding.decision.md index bea7263..6142a39 100644 --- a/llp/0011-setup-and-onboarding.decision.md +++ b/llp/0011-setup-and-onboarding.decision.md @@ -55,3 +55,9 @@ For scripted installs (CI, fleet provisioning), `hypaware init ` accepts named presets contributed by plugins, and `hypaware init --from-file ./team.json` provisions a fleet of identical installs. Presets are named after what they are *for*, never after an architectural role. + +For centrally-managed gateways, `hypaware join ` writes a seed +config (central plugin only) and performs the non-interactive daemon install; +the full config arrives from the server at join. It is sugar over "write the +config file + install the daemon", not a separate path. See +[LLP 0025](./0025-remote-config-join-flow.spec.md#seed-config-mode). diff --git a/llp/0017-daemon-runtime.decision.md b/llp/0017-daemon-runtime.decision.md index ebb431e..ab5fb35 100644 --- a/llp/0017-daemon-runtime.decision.md +++ b/llp/0017-daemon-runtime.decision.md @@ -5,7 +5,7 @@ **Systems:** Daemon **Author:** Phil / Claude **Date:** 2026-06-01 -**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014 +**Related:** LLP 0002, LLP 0011, LLP 0012, LLP 0014, LLP 0025 > The primary daemon and how it is installed. Decomposed from the V1 finishing > plan (`finish-v1` Phases 3–4, now tombstoned) and `hypaware-design.md`. 
@@ -20,12 +20,41 @@ V1 introduces a primary daemon that boots the kernel and runs the steady state: - run the **sink export loop** — tick each configured sink on its cron schedule ([LLP 0014](./0014-sinks.spec.md)) - watch config and reload sources in place on change (same-shape reload, see - [LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) + [LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) — this + path covers **same-shape** changes only; config *replacement* takes the + [staged restart](#staged-restart-for-config-replacement) below - report health for `hypaware status` ([LLP 0009](./0009-cli-registry.spec.md#core-rendered-status)) The source registry and sink driver exist independently; the daemon is the long-lived host that drives them together. +## Staged restart for config replacement + +When the operative config is **replaced wholesale** — remote config apply +([LLP 0025](./0025-remote-config-join-flow.spec.md#apply-semantics-staged-restart)), +or any change to the plugin set or installed plugin code — the daemon does +**not** reload in place. It persists the new config and **exits; the service +manager relaunches it** onto the new config. + +Process restart is the only correct model here, not a simplification target: +install-on-config can upgrade a plugin that is already loaded, and Node's ESM +module cache cannot be invalidated — an in-process re-activate would run stale +code against the new config, defeating the artifact hash verification that +just passed. Restarting the process guarantees executed code = pinned artifact. + +Consequences: + +- The launchd / systemd user units **must be configured to relaunch on exit** + (`KeepAlive` / `Restart=always`). This is now a requirement of the + installers, not a nicety. 
+- A foreground (non-service) daemon cannot relaunch itself: it exits with a + distinct restart exit code — **75** (`EX_TEMPFAIL`, + `DAEMON_RESTART_EXIT_CODE`) — and the invoker (smoke harness, dev shell) + loops on that code. +- Same-shape reload ([LLP 0004](./0004-activation-and-paths.spec.md#same-shape-reload)) + remains the path for in-place source config changes; there are exactly two + paths, distinguished by whether the plugin set / plugin code changed. + ## Install: global package, then service manager When daemon install is requested from `npx hypaware`, **install a persistent diff --git a/llp/0025-remote-config-join-flow.spec.md b/llp/0025-remote-config-join-flow.spec.md new file mode 100644 index 0000000..8c0ff12 --- /dev/null +++ b/llp/0025-remote-config-join-flow.spec.md @@ -0,0 +1,339 @@ +# LLP 0025: Remote Config and Join Flow + +**Type:** Spec +**Status:** Active +**Systems:** Config, Sinks, Plugins +**Author:** Phil / Claude +**Date:** 2026-06-12 +**Related:** LLP 0007, LLP 0008, LLP 0010, LLP 0014, LLP 0017; hypaware-server LLP 0009 (out of tree, design authority) + +> Client-side spec for centrally-managed gateway configuration. Derived from +> the hypaware-server LLP 0009 handoff +> (`~/workspace/hypaware-server/llp/0009-remote-config.spec.md` is the design +> authority for the feature as a whole; this document owns the client half). + +## Summary + +A gateway can be configured entirely from the central server. MDM deploys a +**seed** — server URL + policy token, nothing else — and the gateway joins the +fleet, pulls its full config, installs any plugins that config names, and +becomes operational without the user ever touching a config file. Later edits +to the central config reconfigure the fleet on the poll cadence. This document +specifies the join sequence, the config pull loop, seed-config mode, apply +semantics, install-on-config, and last-known-good rollback. 
+ +This is **post-V1 work**: `@hypaware/central` is explicitly out of V1 scope +([LLP 0002](./0002-v1-scope.decision.md#out-of-v1-scope)). + +## Motivation + +The client user never touches a config file or knows one exists. Everything +the gateway does (plugins, sinks, query) is authored centrally and delivered +at join. The existing `@hypaware/central` plugin already has identity +bootstrap/refresh and the ingest path; what is missing is the config pull loop +and the apply machinery around it. + +## The join sequence + +1. Seed boots the kernel with the central plugin only. +2. `POST /v1/identity/bootstrap` exchanges the policy token for a JWT. +3. `GET /v1/config` pulls the operator-authored config. +4. Apply (persist + staged restart) → fully operational. + +## Config pull loop + +The central plugin is configured as a **sink instance** +([LLP 0014](./0014-sinks.spec.md#config-two-shapes)); the pull cadence lives +in its sink config block as `poll_interval_seconds` (already validated by +`central/src/config.js`, 5–3600s), separate from the cron `schedule` that +drives ingest exports. The pull and identity-refresh timers are +plugin-internal: started at activation, stopped at `close()` — no change to +the LLP 0014 sink contract. + +`@hypaware/central`'s `src/sink.js` notes that refresh and config pull "live +on their own timers when wired in" — this spec wires the config pull: + +- Pull **immediately on bootstrap success**, then on a steady timer + (`poll_interval_seconds`, default **300 s** — 304s are cheap; the server + ETag is a content hash of the served revision). +- The `proto.md` ETag/304/404/429 semantics are unchanged. The running + config's etag persists across restarts so a relaunch short-circuits to + 304; it is kernel-managed state read through the facade (below). +- A pulled 200 body above **1 MiB** is dropped — enforced at both the + transport and the apply engine. 
The transport check is a genuine memory + bound, not a post-hoc one: an oversized `Content-Length` is rejected + without reading, and a chunked body is streamed through a byte counter + that cancels the moment it crosses the cap. Wholesale-replace means an + authenticated 200 goes straight into memory and onto disk; the stated cap + is one line of defense-in-depth. +- Every poll runs under its own abort controller with a **hard request + deadline (30 s)** covering the request and the body read, and the loop's + `stop()` aborts an in-flight poll after a short drain grace (1 s) — a + stalled config GET must not be able to wedge daemon shutdown or a staged + restart behind it. +- **`If-None-Match` must reflect the *running* config, never a + downloaded-but-not-yet-applied one.** The server reads this header to track + fleet convergence (it lands in the queryable `gateways` dataset), so a + gateway mid-install/mid-apply keeps presenting its old etag until the new + config has actually taken effect. + +## Seed-config mode + +The seed is an **ordinary v2 config file** — `~/.hyp/hypaware-config.json` +containing exactly the central plugin (server URL + policy token), nothing +else. There is no seed-specific file format and the kernel has no "seed" +state: seed-config mode is just this particular config booted, consistent +with [LLP 0010](./0010-config-model.spec.md#no-mode-field) (no mode flag; a +host is what its config says). + +Such a config must boot cleanly: no sources, no other sinks, collecting +nothing, polling for config. This is a legitimate steady state for the +seconds between enrollment and first 200 — not an error. + +The policy token lives in the seed config itself (the config file is mode +0600). Policy tokens are multi-use (server LLP 0008), so it is not consumed +on bootstrap; the first successful apply replaces the seed config wholesale, +which naturally retires the token from disk. From then on `identity.json` +carries the JWT. 
+ +`hypaware join <server-url> <policy-token>` is convenience sugar for MDM install scripts: +it writes the seed config and performs the non-interactive daemon install, +and is specified as **exactly equivalent** to doing those two steps by hand — +a wrapper, not a second code path. It joins `init <preset>` and +`init --from-file` as a non-interactive entry point +([LLP 0011](./0011-setup-and-onboarding.decision.md#non-interactive-entry)). +Because a policy token is a multi-use, fleet-wide credential, `join` also +accepts `--token-file <path>` and stdin, and MDM scripts should prefer those +forms — a bare argv token lands in shell history and process listings. + +## Apply semantics: staged restart + +A pulled 200 body is a **full HypAware v2 config and replaces the operative +config wholesale** — no merging, no client-owned sections. Persist the +document, then restart. Never live-mutate. + +Staged restart is a **process-level restart**: the daemon persists the new +config and exits; the service manager relaunches it +([LLP 0017](./0017-daemon-runtime.decision.md#staged-restart-for-config-replacement) +records the decision and why in-process re-activation is unsound — Node's ESM +module cache would run stale plugin code past the artifact hash check). The +in-place [same-shape reload](./0004-activation-and-paths.spec.md#same-shape-reload) +path is never used for remote apply. + +Recommended persistence idiom: **A/B slots** — write each config to its own +path and flip an atomic pointer (symlink or one-line file) as the last step +before exit. Same semantics as "file swap," but a crash between persist and +restart can never leave an ambiguous operative config, and last-known-good +is crash-safe by construction.
As implemented: slot files live under +`<state-root>/config-control/`, the operative config path becomes a relative +symlink to the active slot (replaced atomically via tmp + rename), and each +slot carries its served etag in a per-slot sidecar written before the flip — +so the document and its etag commit on the same rename, in both directions. + +### Apply engine is kernel surface + +The central plugin is **transport only**: pull, ETag bookkeeping, auth. It +hands a downloaded document to a narrow kernel facade — +`ctx.configControl.stage(document, etag)`, plus `confirmPoll()` (poll +liveness) and `runningEtag()` (for `If-None-Match`); the **kernel** owns +shape-check → install pinned plugins → validate → persist last-known-good → +swap → restart, and the rollback bookkeeping. The facade exists only where an +apply engine runs (the daemon); plain CLI boots leave `ctx.configControl` +undefined and the plugin keeps its pull loop off — `hyp status` must not +fire config polls as a side effect. Recorded in +[LLP 0003](./0003-core-vs-plugin-surface.spec.md#core-owns). + +Why kernel-side: rollback state must survive the restart and pairs with the +kernel-owned config file; the apply engine is testable without HTTP (rollback +is exactly the code that must not be discovered broken in production); and a +future second management channel reuses it. Consequently +**last-known-good config and the remembered bad etag live in kernel-managed +state** ([LLP 0004](./0004-activation-and-paths.spec.md#state-directories)), +not the central plugin's state dir. + +The etag sidecar must transition **atomically with the operative config, in +both directions**: it carries the etag of the *running* config, so apply +moves it forward and rollback reverts it (otherwise a rolled-back gateway +would present a converged etag while running last-known-good).
Since every +sidecar change coincides with an apply or rollback, the facade takes the +etag alongside the document and the **apply engine stages the sidecar with +the swap** (realized as the per-slot etag files above — flipping the +pointer flips the etag); the central plugin only reads it, through +`configControl.runningEtag()`, to populate `If-None-Match`. + +Identity state (`identity.json`, JWT, gateway id) is **not config** and is +never touched by config application. + +## Install-on-config (hash-pinned) + +A pulled config may name plugins not installed on the machine. The client +installs them through the **existing +[LLP 0007](./0007-plugin-install-and-locking.decision.md) install path** +(prebuilt git artifact, never `npm install` — +[LLP 0008](./0008-plugin-runtime-dependencies.decision.md) — recorded in the +plugin lock file). Served configs always pin **version + artifact content +hash** (the server's save pipeline guarantees this); the client must verify +the artifact hash and treat a mismatch as an apply failure (→ rollback, +below). The config names exactly one artifact; nothing may substitute code +after authoring. + +Install runs **before full validation**: catalog-backed validation can only +know a plugin once it is installed, so validating first would reject the +very config that names a not-yet-installed plugin. The apply engine instead +shape-checks the document (including the pin fields' types), installs the +pinned plugins it names, and only then runs full validation against the +freshly rebuilt catalog. Acting on a not-yet-fully-validated document is +bounded by the shape gate and the hash pin — an install can only bring in +the exact artifact the config authored — and plugin trees installed for a +config that then fails validation stay on disk by the same rule as rollback +(the lock records what is installed, not what is active). 
+ +### Bundled first-party plugins + +First-party plugins ship bundled in the kernel package +([LLP 0002](./0002-v1-scope.decision.md#plugin-packaging-divergence)) and are +never fetched at apply time. For a pinned plugin that is bundled with the +running kernel: + +- The bundled copy satisfies the pin; the **artifact hash is not checked**. + Bundled code is inside the existing trust boundary — it ships in the same + npm package as the kernel performing the verification, and the server's + hash refers to a git release artifact that legitimately differs from the + npm-bundled tree. +- The pinned **version is checked strictly**: a mismatch between the pinned + version and the bundled version is an apply failure (→ rollback, below). + +Version-strictness means a fleet with mixed kernel versions (e.g. mid +rolling upgrade) can only converge on a config whose first-party pins match +every gateway's bundled versions — see open questions. + +## Last-known-good rollback + +If an applied config fails validation, a pinned install fails its hash check, +or the post-apply probation window (below) expires unsatisfied, revert to the +previous operative config (file swap + staged restart — cheap by +construction). Remember the failed revision's etag and **back off re-apply +attempts for that etag until the etag changes** — re-polling is fine, an +apply-crash loop is not. One remembered bad-etag value, no persistent +denylist. The client records a **structured rollback reason** (validation +failure / hash mismatch / probation expiry, plus the offending etag) from day +one — the server only sees non-convergence via `If-None-Match` and cannot +distinguish "rolled back" from "never applied," so if a rollback column is +ever added to the `gateways` dataset, the data must already exist +client-side. 
For V1 it surfaces in client logs and in `hypaware status` +([LLP 0009](./0009-cli-registry.spec.md#core-rendered-status)): probation +state, last rollback + reason, and the remembered bad etag — an operator at +the machine must not need log spelunking to learn the gateway rejected a +config. + +Rollback restores the config, **not the install root**: plugin trees and +lock-file entries installed for the failed config stay on disk. The lock +file records what is installed, not what is active — the operative config +defines the active set — and keeping the artifacts makes re-apply after a +fixed revision cheaper. + +### Post-apply probation + +Because apply is a process restart, the apply engine writes a **probation +marker to kernel-managed state before restarting** ("revision X applied at T, +probation until T+W"); the relaunched daemon reads it at boot. Probation is +cleared by the **first successful authenticated config poll** (200 or 304 on +`GET /v1/config`) after the restart — that one request proves identity +survived, the server is reachable, and the new config's central sink runs, +and its `If-None-Match` is simultaneously the server-side convergence signal, +so client probation and fleet convergence clear on the same packet. An ingest +POST is deliberately *not* the signal: an idle gateway with nothing to export +must still be able to clear probation. If the window expires unsatisfied, the +kernel rolls back: staged restart onto last-known-good, bad etag remembered. + +The **kernel owns the probation timer and the rollback decision, +independently of the central plugin functioning** — a wedged or +wrongly-pointed central sink is precisely a case probation must catch. The +plugin reports a successful poll through the apply facade (a confirmation +call); **it never touches probation state directly**. 
Probation expiry is +also evaluated **at boot, before plugin activation**: a +kernel-killing-but-valid config that crashloops under the service manager's +relaunch policy may never live long enough for a running timer to fire, so +each relaunch checks the marker first and rolls back from boot if the window +has passed. + +A probation-clearing poll may itself return 200 with a newer revision; that +triggers an immediate next apply, with its own probation. This chaining is +correct — do not serialize or suppress it. + +W must comfortably exceed one poll interval plus retry backoff: +`W = max(3 × poll_interval_seconds, 120 s)` — a formula, not a fixed +constant, so a slow operator-chosen poll cadence cannot make every apply +roll back, and the 120 s floor leaves room for relaunch + identity refresh ++ one retry even at the fastest cadence. The interval is taken from the +*staged* document's central sink block (that is the sink that will, or +won't, confirm the poll); the kernel falls back to the 300 s default when +the block doesn't set one. + +Rollback from the **first** applied config lands back on the seed config — +fine by construction: seed-config mode is a legitimate polling steady state, +and the bad-etag backoff prevents a re-apply loop. + +## Wire contract amendments (`proto.md`) + +`hypaware-core/plugins-workspace/central/proto.md` is the authoritative wire +reference and is amended by this spec: + +- Served configs pin plugins by version + artifact content hash. +- `If-None-Match` reflects the running config (convergence semantics). +- 404 ("operator has not registered a config") is demoted to a legacy-only + branch: every token now references a config at mint, so gateways enrolled + under server LLP 0009 always resolve. Keep the polite backoff for + conformance. +- The "bootstrap tokens are single-use" sentence is replaced by the + policy-token amendment (server LLP 0008); both changes fold in together. 
+ +## Server-side guarantees the client relies on + +- Every gateway enrolled through a policy token resolves to a config — + join-time 404 is structurally impossible for new enrollments. +- The served document passed the server's save pipeline: schema-valid, + plugins hash-pinned, and **always contains a central sink targeting the + server's own external URL** (so a config that would disconnect the fleet + can't be authored). The rollback backstop covers the residue + (wrong-but-present URL, kernel-killing-but-valid configs). +- ETag changes exactly when the served bytes change (revision content hash). + No push channel in V1: propagation latency = the poll cadence. + +## Sequencing + +Server lands first (registry, revisions, admin authoring endpoints, +mint-requires-config, serving, convergence columns) and ships dark. +`GET /v1/config` has existed since V1, so no capability handshake is needed. +Nothing server-side is blocked on the client; nothing client-side is blocked +on the server except end-to-end testing. + +## Settled at implementation (2026-06-12) + +Four knobs the draft left open were fixed when the client landed: + +- **Poll cadence default: 300 s** (5 minutes). Validated range stays + 5–3600 s. +- **Maximum config document size: 1 MiB**, enforced at both the transport + and the apply engine. +- **Probation floor: 120 s** (`W = max(3 × poll_interval_seconds, 120 s)`). +- **Poll request deadline: 30 s** per poll (request + body read), with a + **1 s drain grace** before `stop()` aborts an in-flight poll. + +## Open questions + +- **Strict version pins for bundled plugins vs rolling kernel upgrades.** + The strict check (above) means a kernel upgrade that bumps bundled plugin + versions de-converges the fleet until the central config's pins are + updated, and a mixed-version fleet cannot fully converge on one config.
+ Considered alternative: treat the pin as enforced only for fetched + artifacts and let config *validation* gate apply for bundled plugins, + reporting the bundled version upward. Deliberately deferred — strict now, + relax if upgrade thrash shows up in practice. + +## References + +- hypaware-server LLP 0009 (`0009-remote-config.spec.md`) — design authority +- hypaware-server LLP 0008 — policy tokens +- [`proto.md`](../hypaware-core/plugins-workspace/central/proto.md) — wire reference +- [LLP 0007](./0007-plugin-install-and-locking.decision.md), [LLP 0008](./0008-plugin-runtime-dependencies.decision.md), [LLP 0010](./0010-config-model.spec.md), [LLP 0014](./0014-sinks.spec.md) diff --git a/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md b/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md new file mode 100644 index 0000000..37083af --- /dev/null +++ b/notes-archive/llp-reviews/remote-config-join-flow.spec.round-1.claude.md @@ -0,0 +1,159 @@ +# Review of LLP 0025: Remote Config and Join Flow (reviewed as draft LLP 0023) + +**Reviewer:** Claude (Fable 5) +**Date:** 2026-06-12 +**Round:** 1 +**LLP Status at review time:** Draft + +## Overall assessment + +This is a good design and a notably complete one for a Draft: the join flow +is coherent end-to-end, the hard decisions (process-level restart, kernel +apply engine, bundled-plugin pin semantics, probation signal) are made +explicitly with rationale rather than left to the implementer, and the spec +is honest about what it defers. The strongest property is that several +mechanisms collapse into single primitives — convergence reporting, probation +clearing, and rollback visibility are all the same `GET /v1/config` request; +rollback is the same staged restart as apply. That economy is what makes the +"one remembered bad etag, no denylist" simplicity credible.
+ +The main weakness is at the seams the restart creates: the spec decides *who +owns* probation state (kernel) but not *who watches the clock* or how the +plugin's successful poll reaches the kernel, and it doesn't say what happens +when the applied config crashes the daemon faster than probation can be +evaluated. These are not flaws in the design — they are consequences of the +(correct) restart decision that the spec hasn't finished chasing down. + +## Strengths + +- **The probation signal choice is genuinely elegant.** Clearing on the first + authenticated poll makes client-side health and server-side convergence the + same observable event, and the explicit rejection of ingest-POST as the + signal (idle gateways must clear probation) shows the edge case was + actually considered. +- **The ESM-module-cache argument for process restart** is the right kind of + rationale: it converts a style preference ("restarts are cleaner") into a + correctness requirement (in-process re-activation would execute code other + than the hash-verified artifact). A future agent cannot "optimize" this + away without confronting a stated invariant. +- **Seed-as-ordinary-config** keeps faith with LLP 0010#no-mode-field — no + seed file format, no kernel seed state — and gets crash-safety for free: + rollback from the first apply lands on the seed, which the spec correctly + identifies as a legitimate steady state rather than a special case. +- **The bundled-plugin trust-boundary argument** (hash-checking code that + ships in the same npm package as the verifier buys nothing) is correct and + prevents the always-mismatching-hash failure mode that a naive uniform rule + would have shipped. +- **The kernel/plugin split for apply** is justified on the right grounds: + rollback state must survive the restart, and rollback is exactly the code + path that must be testable without HTTP. 
+- The open questions section records *why* the bundled-pin strictness was + deferred and what the considered alternative was — that's the difference + between a deferral and a hole. + +## Concerns + +1. **[Definite, trivial] The join sequence still says "kernel reload."** + Step 4 reads "Apply (persist + kernel reload)" — stale wording from before + the staged-restart decision; the apply section below contradicts it. Fix: + "Apply (persist + staged restart)". + +2. **[Definite] The probation watchdog's owner and evaluation point are + unspecified.** The marker is kernel state and "the kernel rolls back," but + the clearing event is observed by the *central plugin* (its poll), and the + window can expire while the plugin is wedged — wrong-but-present URL is + exactly the residue case the server guarantees don't cover. Two things + must be stated: (a) the kernel owns the probation timer and rollback + decision *independently of the central plugin functioning*; (b) probation + expiry is also evaluated **at boot, before plugin activation** — otherwise + a kernel-killing-but-valid config that crashloops under the service + manager's relaunch policy may never stay alive long enough for a running + timer to fire, and the gateway never rolls back. Boot-time evaluation + closes the crashloop case. Resolve by adding both sentences to + #post-apply-probation. + +3. **[Possible] The plugin→kernel "poll succeeded" signal path is + unspecified.** The facade shape is deliberately TBD, but this particular + signal is load-bearing for rollback correctness, and its absence invites + an implementer to have the plugin clear the marker file directly — + violating the state-ownership rule the spec itself establishes. One line + ("the facade includes a confirmation call; the plugin never touches + probation state") would pin it without designing the API. + +4. 
**[Possible] The etag sidecar's update timing now crosses the ownership + boundary.** The sidecar is plugin-owned wire bookkeeping, but apply is + kernel-owned, and the sidecar must come to reflect the new etag exactly + when the new config becomes the running one (during probation the new + config *is* running, so presenting the new etag mid-probation is correct — + and rollback must revert the sidecar too, or the gateway will present a + converged etag while running last-known-good). Who writes it, and when, in + both the apply and rollback directions? This is the one place where + "kernel owns apply" and "plugin owns the sidecar" genuinely collide; + resolve by specifying the handoff (simplest: the facade passes the etag + with the document, the kernel stages both, and the plugin re-reads the + sidecar at boot). + +5. **[Minor] Rollback leaves orphaned installs.** A failed config may have + hash-verified and installed plugins before validation or probation failed; + rollback restores the config but the spec says nothing about the installed + trees or lock-file entries. Probably correct to leave them (the lock file + records installs, not the active set, and re-apply after a fixed config + becomes cheaper) — but say so, or the lock file's meaning quietly shifts. + +6. **[Minor] `hypaware join <url> <token>` puts a credential in argv** — + shell history and process listings on the very MDM-scripted machines this + targets. Policy tokens are multi-use and fleet-wide, which raises the + blast radius. Suggest the command also accept `--token-file` / stdin and + the spec recommend that form for MDM scripts. + +## Suggestions + +Prioritized: + +1. Fix concern 1 (one line) and add the two probation sentences from + concern 2 — these are the only changes I'd block on. +2. Pin the confirmation-signal ownership (concern 3) and the sidecar handoff + (concern 4) in a sentence each. +3. 
**Add an operator-visibility line:** probation state, last rollback, and + the remembered bad etag should surface in `hypaware status` (LLP 0009 + core-rendered status). "Rollback diagnosis stays in client logs for V1" is + fine for the server side, but the operator standing at the machine + shouldn't need log spelunking to learn the gateway rejected a config — + and this spec's own log-driven-development culture argues for it. +4. **Consider A/B config slots as the implementation idiom** for + "file swap": write configs to content-addressed or alternating paths and + flip an atomic pointer (symlink or one-line file). Same semantics the spec + already requires, but it makes "persist last-known-good" crash-safe by + construction — there is no moment where a crash between persist and + restart leaves an ambiguous operative config. Non-standard for config + files, standard for OTA updates, and this *is* an OTA update scheme. +5. The "chained apply" case is worth one sentence: a probation-clearing poll + may itself return 200 with a newer revision, triggering an immediate + second apply. This is correct behavior (each apply gets its own + probation), but stating it prevents an implementer from "helpfully" + serializing or suppressing it. + +## Open questions + +Beyond the three the spec already records (all appropriately deferred): + +- Where does probation rollback report *to*? The server sees non-convergence + via etag, but cannot distinguish "rolled back" from "never applied." If the + `gateways` dataset later wants a rollback column, the client needs to have + been recording the reason from day one — cheap now, annoying to retrofit. +- Does the lock file distinguish "installed and in the active config" from + "installed, orphaned by rollback"? (Falls out of concern 5.) +- Is there a maximum config document size the client will accept? 
A + wholesale-replace model means a malformed-but-authenticated 200 of + arbitrary size goes straight into memory and onto disk; a stated cap is + one line of defense-in-depth. + +## Recommended next step + +Stay `Draft` for one more pass: address concerns 1–2 (small, mechanical) and +decide on 3–4 (a sentence each). After that this is ready to move to +`Review` — the design itself is sound, the decisions are well-argued, and +nothing here is wrongheaded. Note that a single AI review is not sufficient +for acceptance; this round came from a reviewer who participated in the +grilling session that shaped the document, so an independent model's review +(and human judgment) should follow once the Draft revisions land. diff --git a/src/core/cli/core_commands.js b/src/core/cli/core_commands.js index 55a2eea..23a5779 100644 --- a/src/core/cli/core_commands.js +++ b/src/core/cli/core_commands.js @@ -216,6 +216,12 @@ function buildCoreCommands() { usage: 'hyp init [preset]', run: runInit, }, + { + name: 'join', + summary: 'Join a centrally-managed fleet (write seed config + install daemon)', + usage: 'hyp join [token] [--token-file ] [--bin ] [--no-daemon]', + run: runJoin, + }, { name: 'attach', summary: 'Attach an AI client to the local gateway', @@ -505,6 +511,22 @@ function renderStatusJson({ report, clientNames, datasets, cacheRoot }) { oldest_partition_date: report.cache.oldestDate, }, recent_error_count: report.recentErrorCount, + // Remote-config apply state (LLP 0025). All-null until the gateway + // applies its first centrally-served config. + remote_config: report.remoteConfig + ? { + running_etag: report.remoteConfig.runningEtag, + probation: report.remoteConfig.probation + ? 
{ + etag: report.remoteConfig.probation.etag, + applied_at: report.remoteConfig.probation.applied_at, + until: report.remoteConfig.probation.until, + } + : null, + last_rollback: report.remoteConfig.lastRollback, + bad_etag: report.remoteConfig.badEtag, + } + : null, diagnostics: report.diagnostics.map((d) => ({ severity: d.severity, kind: d.kind, @@ -597,6 +619,23 @@ function renderStatusText({ report, clientNames, datasets, cacheRoot, stdout }) stdout.write(` datasets: ${datasets.length}\n`) stdout.write(` recent errors: ${report.recentErrorCount}\n`) + // Remote-config section appears only once the gateway has state to + // show — a never-joined install keeps the V1 status surface. + const rc = report.remoteConfig + if (rc && (rc.runningEtag || rc.probation || rc.lastRollback || rc.badEtag)) { + stdout.write(' remote config:\n') + if (rc.runningEtag) stdout.write(` running etag: ${rc.runningEtag}\n`) + if (rc.probation) { + stdout.write(` probation: ${rc.probation.etag} until ${rc.probation.until}\n`) + } + if (rc.lastRollback) { + stdout.write(` last rollback: ${rc.lastRollback.etag} at ${rc.lastRollback.at} (${rc.lastRollback.reason})\n`) + } + if (rc.badEtag) { + stdout.write(` bad etag: ${rc.badEtag.etag} (${rc.badEtag.reason})\n`) + } + } + if (report.diagnostics.length > 0) { stdout.write(' diagnostics:\n') for (const d of report.diagnostics) { @@ -2796,6 +2835,193 @@ async function runInitFromFile(flags, ctx) { return 0 } +/** + * `hyp join [token]` — join a centrally-managed fleet. Pure + * sugar over two existing steps: write the seed config (an ordinary v2 + * config containing exactly the central plugin) and run the + * non-interactive daemon install. Doing those two steps by hand is + * specified to be exactly equivalent. 
+ * + * Because a policy token is a multi-use fleet-wide credential, the + * token can (and for MDM scripts, should) arrive via `--token-file` + * or stdin instead of argv — a bare argv token lands in shell history + * and process listings. The seed config is written mode 0600. + * + * @param {string[]} argv + * @param {CommandRunContext} ctx + * @ref LLP 0025#seed-config-mode [implements] — join = write-seed-config + daemon install; a wrapper, not a second code path + */ +async function runJoin(argv, ctx) { + const parsed = parseJoinArgs(argv) + if (parsed.help) { + ctx.stdout.write('usage: hyp join [token] [--token-file ] [--bin ] [--no-daemon]\n') + ctx.stdout.write(' token sources (pick one): positional argument, --token-file, or stdin\n') + return 0 + } + if (parsed.error) { + ctx.stderr.write(`hyp join: ${parsed.error}\n`) + return 2 + } + + try { + const url = new URL(/** @type {string} */ (parsed.url)) + if (url.protocol !== 'http:' && url.protocol !== 'https:') { + ctx.stderr.write(`hyp join: url must be http(s); got ${url.protocol}\n`) + return 2 + } + } catch { + ctx.stderr.write(`hyp join: not a valid URL: ${parsed.url}\n`) + return 2 + } + + /** @type {string | undefined} */ + let token = parsed.token + if (token === undefined && parsed.tokenFile !== undefined) { + try { + token = (await fs.readFile(parsed.tokenFile, 'utf8')).trim() + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err) + ctx.stderr.write(`hyp join: --token-file: ${message}\n`) + return 1 + } + } + if (token === undefined) { + if (isTty(ctx.stdin)) { + ctx.stderr.write('hyp join: no token given — pass it as an argument, via --token-file, or on stdin\n') + return 2 + } + token = (await readAllStdin(ctx.stdin)).trim() + } + if (token.length === 0) { + ctx.stderr.write('hyp join: token is empty\n') + return 2 + } + + /** @type {HypAwareV2Config} */ + const seed = { + version: 2, + plugins: [{ name: '@hypaware/central' }], + sinks: { + central: { + plugin: '@hypaware/central', + config: { + url: /** @type {string} */ (parsed.url), + identity: { bootstrap_token: token }, + }, + }, + }, + } + + const catalogCtx = await buildKnownPluginsForCtx(ctx) + const validation = await validateConfig(seed, { + knownPlugins: catalogCtx.knownPlugins, + knownDatasets: catalogCtx.knownDatasets, + }) + if (!validation.ok) { + for (const err of validation.errors) { + ctx.stderr.write(`hyp join: [${err.errorKind}] ${err.pointer || ''}: ${err.message}\n`) + } + return 1 + } + + const obsEnv = readObservabilityEnv(ctx.env) + const targetPath = ctx.env.HYP_CONFIG + ? path.resolve(ctx.env.HYP_CONFIG) + : defaultConfigPath(obsEnv.hypHome) + + return withSpan( + 'join.run', + { + [Attr.COMPONENT]: 'join', + [Attr.OPERATION]: 'join.run', + config_path: targetPath, + install_daemon: !parsed.noDaemon, + status: 'ok', + }, + async (span) => { + // The token is the only credential on disk until the first + // bootstrap, so the seed write is atomic and mode 0600. 
+ await fs.mkdir(path.dirname(targetPath), { recursive: true }) + const tmp = `${targetPath}.tmp.${process.pid}.${Date.now()}` + await fs.writeFile(tmp, JSON.stringify(seed, null, 2) + '\n', { mode: 0o600 }) + await fs.rename(tmp, targetPath) + ctx.stdout.write(`✓ Wrote seed config ${targetPath}\n`) + + if (parsed.noDaemon) { + ctx.stdout.write(' daemon install skipped (--no-daemon); run `hyp daemon install` to finish joining\n') + return 0 + } + + const installArgv = parsed.binPath !== undefined ? ['--bin', parsed.binPath] : [] + const code = await runDaemonInstall(installArgv, ctx) + if (code !== 0) { + span.setAttribute('status', 'failed') + span.setAttribute('error_kind', 'daemon_install_failed') + return code + } + ctx.stdout.write('✓ Joined — the daemon will pull its configuration from the server\n') + return 0 + }, + { component: 'join' } + ) +} + +/** + * @param {string[]} argv + * @returns {{ help?: boolean, error?: string, url?: string, token?: string, tokenFile?: string, binPath?: string, noDaemon?: boolean }} + */ +function parseJoinArgs(argv) { + /** @type {{ help?: boolean, error?: string, url?: string, token?: string, tokenFile?: string, binPath?: string, noDaemon?: boolean }} */ + const r = {} + /** @type {string[]} */ + const positional = [] + for (let i = 0; i < argv.length; i += 1) { + const token = argv[i] + if (token === '--help' || token === '-h') { r.help = true; return r } + if (token === '--no-daemon') { r.noDaemon = true; continue } + if (token === '--token-file' || token.startsWith('--token-file=')) { + const value = token === '--token-file' ? argv[++i] : token.slice('--token-file='.length) + if (!value) return { error: '--token-file: requires a path' } + r.tokenFile = value + continue + } + if (token === '--bin' || token.startsWith('--bin=')) { + const value = token === '--bin' ? 
argv[++i] : token.slice('--bin='.length) + if (!value) return { error: '--bin: requires a path' } + r.binPath = value + continue + } + if (token.startsWith('-') && token !== '-') { + return { error: `unknown argument: ${token}` } + } + positional.push(token) + } + if (positional.length === 0) return { error: 'missing (see hyp join --help)' } + if (positional.length > 2) return { error: `unexpected argument: ${positional[2]}` } + r.url = positional[0] + // '-' as the token positional means "read from stdin", same as + // omitting it on a piped invocation. + if (positional.length === 2 && positional[1] !== '-') r.token = positional[1] + if (r.token !== undefined && r.tokenFile !== undefined) { + return { error: 'pass the token either as an argument or via --token-file, not both' } + } + return r +} + +/** + * @param {unknown} stdin + * @returns {Promise} + */ +async function readAllStdin(stdin) { + const stream = /** @type {AsyncIterable | undefined} */ (stdin) + if (!stream || typeof (/** @type {any} */ (stream))[Symbol.asyncIterator] !== 'function') return '' + let out = '' + for await (const chunk of stream) { + out += typeof chunk === 'string' ? 
chunk : chunk.toString('utf8') + } + return out +} + /** @param {unknown} stream */ function isTty(stream) { return !!stream && typeof stream === 'object' && /** @type {{ isTTY?: boolean }} */ (stream).isTTY === true diff --git a/src/core/config/apply.js b/src/core/config/apply.js new file mode 100644 index 0000000..4cbb5bb --- /dev/null +++ b/src/core/config/apply.js @@ -0,0 +1,569 @@ +// @ts-check + +import fs from 'node:fs' +import path from 'node:path' + +import { Attr, getLogger, withSpan } from '../observability/index.js' +import { parseConfigShape } from './schema.js' + +/** + * @import { ConfigApplyErrorKind, ConfigControlFacade, ConfigStageResult, HypAwareV2Config, PluginConfigInstance } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { + * ConfigApplyDeps, + * ConfigControl, + * ConfigControlState, + * ConfigControlStatus, + * ConfigRollbackReason, + * ConfigSlot, + * CreateConfigControlOptions, + * ProbationMarker, + * } from './types.d.ts' + */ + +/** + * Maximum accepted config document size in bytes. A pulled 200 body is + * parsed and persisted wholesale, so a stated cap bounds memory and + * disk regardless of what an authenticated server sends. 1 MiB is + * orders of magnitude above any real config. + * @ref LLP 0025#config-pull-loop [implements] — max accepted config document size, settled at 1 MiB + */ +export const MAX_CONFIG_DOCUMENT_BYTES = 1024 * 1024 + +/** + * Default config pull cadence (seconds) when the staged document's + * central sink does not set `poll_interval_seconds`. Mirrors the + * central plugin's own default — the kernel needs the value to size + * the probation window without asking the plugin. + */ +export const DEFAULT_POLL_INTERVAL_SECONDS = 300 + +/** + * Probation window floor (seconds). The window is + * `max(3 × poll_interval_seconds, floor)` so a fast poll cadence still + * leaves room for daemon relaunch + identity refresh + one retry. 
+ * @ref LLP 0025#post-apply-probation [implements] — window formula with the floor settled at 120s + */ +export const PROBATION_FLOOR_SECONDS = 120 + +const CONTROL_DIRNAME = 'config-control' +const STATE_BASENAME = 'state.json' + +/** + * Build the kernel config apply engine: shape-check → install pinned + * plugins → validate against the post-install catalog → persist to an + * A/B slot → flip the operative pointer → staged restart, plus + * probation and last-known-good rollback. + * + * Persistence idiom: each applied config is written to its own slot + * file under `/config-control/`, with the served ETag in a + * per-slot sidecar written *before* the flip. The operative config + * path becomes a symlink to the active slot, replaced atomically via + * tmp+rename — so the config document and its etag transition together + * in both directions (apply and rollback), and last-known-good is + * crash-safe by construction (the previous slot is never modified). + * + * @param {CreateConfigControlOptions} opts + * @returns {ConfigControl} + * @ref LLP 0025#apply-engine-is-kernel-surface [implements] — the engine is kernel-owned; plugins only see the narrow facade + */ +export function createConfigControl(opts) { + const { stateRoot, configPath, requestRestart } = opts + const now = opts.now ?? 
Date.now + const log = getLogger('config-control') + const controlDir = path.join(stateRoot, CONTROL_DIRNAME) + const statePath = path.join(controlDir, STATE_BASENAME) + + /** @type {ConfigApplyDeps | null} */ + let applyDeps = null + /** @type {NodeJS.Timeout | null} */ + let watchdog = null + let restartPending = false + + /** @returns {ConfigControlState} */ + function readState() { + return readControlState(statePath) + } + + /** @param {ConfigControlState} state */ + function writeState(state) { + fs.mkdirSync(controlDir, { recursive: true, mode: 0o700 }) + const tmp = `${statePath}.tmp.${process.pid}.${now()}` + fs.writeFileSync(tmp, JSON.stringify(state, null, 2) + '\n', { mode: 0o600 }) + fs.renameSync(tmp, statePath) + } + + /** @param {ConfigSlot} slot */ + function slotPath(slot) { + return path.join(controlDir, `config.${slot}.json`) + } + + /** @param {ConfigSlot} slot */ + function slotEtagPath(slot) { + return path.join(controlDir, `config.${slot}.etag`) + } + + /** @returns {ConfigSlot | null} */ + function activeSlot() { + return readActiveSlot(controlDir, configPath) + } + + /** + * Atomically point the operative config path at `slot`. A relative + * symlink is created at a tmp path and renamed over the config path, + * so a crash leaves either the old or the new pointer — never + * neither. + * + * @param {ConfigSlot} slot + */ + function flipPointer(slot) { + const target = path.relative(path.dirname(configPath), slotPath(slot)) + const tmp = `${configPath}.tmp.${process.pid}.${now()}` + fs.symlinkSync(target, tmp) + fs.renameSync(tmp, configPath) + } + + /** @returns {string | undefined} */ + function runningEtag() { + return readRunningEtag(controlDir, configPath) + } + + /** + * Revert to the previous operative config: flip the pointer back + * (the per-slot etag sidecar reverts with it), clear probation, + * remember the bad etag, and record the structured rollback reason. 
+ * + * @param {ProbationMarker} marker + * @param {ConfigRollbackReason} reason + * @param {string} [detail] + * @ref LLP 0025#last-known-good-rollback [implements] — flip back + remembered bad etag + structured reason, recorded client-side from day one + */ + function rollback(marker, reason, detail) { + if (marker.previous_slot) { + flipPointer(marker.previous_slot) + } + const at = new Date(now()).toISOString() + const state = readState() + delete state.probation + state.bad_etag = { etag: marker.etag, reason, recorded_at: at } + state.last_rollback = { + etag: marker.etag, + reason, + at, + ...(detail ? { detail } : {}), + } + writeState(state) + log.warn('config.rollback', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.rollback', + [Attr.ERROR_KIND]: reason, + config_etag: marker.etag, + rolled_back_to_slot: marker.previous_slot ?? 'none', + ...(detail ? { detail } : {}), + status: 'ok', + }) + } + + function disarmProbationWatchdog() { + if (watchdog) { + clearTimeout(watchdog) + watchdog = null + } + } + + /** + * Arm the in-process probation timer for the active marker, if any. + * Expiry rolls back and requests a staged restart onto + * last-known-good. The kernel owns this timer — a wedged central + * sink is exactly the failure probation must catch. 
+ * @ref LLP 0025#post-apply-probation [implements] — kernel-owned watchdog, independent of the central plugin functioning + */ + function armProbationWatchdog() { + disarmProbationWatchdog() + const state = readState() + const marker = state.probation + if (!marker) return + const remainingMs = Math.max(0, Date.parse(marker.until) - now()) + watchdog = setTimeout(() => { + watchdog = null + const current = readState().probation + if (!current || current.etag !== marker.etag) return + log.error('config.probation_expired', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.probation_expired', + config_etag: marker.etag, + status: 'failed', + }) + rollback(current, 'probation_expired') + restartPending = true + requestRestart('probation_expired') + }, remainingMs) + if (typeof watchdog.unref === 'function') watchdog.unref() + } + + /** + * Boot-time probation evaluation, run before plugin activation: a + * kernel-killing-but-valid config can crashloop under the service + * manager faster than any in-process timer fires, so each relaunch + * checks the marker first. + * @ref LLP 0025#post-apply-probation [implements] — probation expiry is evaluated at boot, before plugin activation + */ + async function evaluateAtBoot() { + const state = readState() + const marker = state.probation + if (!marker) return { action: /** @type {const} */ ('none') } + + // A marker whose slot is not the operative pointer means the apply + // crashed between persisting the marker and flipping — the new + // config never took effect, so there is nothing to probe. 
+ if (activeSlot() !== marker.slot) { + delete state.probation + writeState(state) + log.warn('config.probation_orphaned', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.probation_orphaned', + config_etag: marker.etag, + status: 'ok', + }) + return { action: /** @type {const} */ ('cleared_orphan') } + } + + if (Date.parse(marker.until) <= now()) { + rollback(marker, 'probation_expired') + return { action: /** @type {const} */ ('rolled_back') } + } + return { action: /** @type {const} */ ('none') } + } + + function confirmPoll() { + const state = readState() + if (!state.probation) return + const etag = state.probation.etag + delete state.probation + writeState(state) + disarmProbationWatchdog() + log.info('config.probation_cleared', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.probation_cleared', + config_etag: etag, + status: 'ok', + }) + } + + /** + * @param {unknown} document + * @param {string} etag + * @returns {Promise} + */ + async function stage(document, etag) { + return withSpan( + 'config.apply', + { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.apply', + config_etag: etag, + status: 'ok', + }, + async (span) => { + /** @param {ConfigApplyErrorKind} errorKind @param {string} message */ + function fail(errorKind, message) { + span.setAttribute('status', 'failed') + span.setAttribute('error_kind', errorKind) + log.error('config.apply_failed', { + [Attr.COMPONENT]: 'config-control', + [Attr.ERROR_KIND]: errorKind, + config_etag: etag, + message, + }) + return /** @type {ConfigStageResult} */ ({ ok: false, errorKind, message }) + } + + if (restartPending) { + return fail('restart_pending', 'a staged restart is already pending') + } + if (!applyDeps) { + return fail('apply_engine_not_ready', 'apply engine has no validator/installer attached') + } + if (typeof etag !== 'string' || etag.length === 0) { + return fail('config_invalid', 'stage() requires the served etag') + } + if (etag === 
runningEtag()) { + span.setAttribute('apply_action', 'noop_same_etag') + return { ok: true, action: 'noop_same_etag' } + } + + const state = readState() + // Re-apply backoff: one remembered bad etag, skipped until the + // server serves a different revision. Re-polling is fine; an + // apply-crash loop is not. + if (state.bad_etag && state.bad_etag.etag === etag) { + span.setAttribute('apply_action', 'skipped_bad_etag') + log.warn('config.apply_skipped', { + [Attr.COMPONENT]: 'config-control', + config_etag: etag, + hyp_reason: 'bad_etag_backoff', + }) + return { ok: true, action: 'skipped_bad_etag' } + } + + const serialized = JSON.stringify(document, null, 2) + '\n' + if (Buffer.byteLength(serialized, 'utf8') > MAX_CONFIG_DOCUMENT_BYTES) { + return fail('document_too_large', `config document exceeds ${MAX_CONFIG_DOCUMENT_BYTES} bytes`) + } + + // Shape-gate, then install, then full validation. Catalog-backed + // validation can only know a plugin once it is installed, so a + // served config naming a not-yet-installed plugin must install + // first — but install must not act on an arbitrary document, so + // the shape (including the pin fields' types) is checked before + // anything is fetched, and the hash pin bounds what an install + // can bring in. + // @ref LLP 0025#install-on-config-hash-pinned [implements] — shape-gate → install pinned plugins → validate against the post-install catalog + const shape = parseConfigShape(document) + if (!shape.ok) { + const first = shape.errors[0] + rememberBadEtag(etag, 'validation_failed') + return fail( + 'config_invalid', + first ? `${first.pointer || ''}: ${first.message}` : 'config shape invalid' + ) + } + const config = shape.config + + const install = await applyDeps.installPinnedPlugins(config.plugins ?? []) + if (!install.ok) { + rememberBadEtag( + etag, + install.errorKind === 'artifact_hash_mismatch' + ? 'artifact_hash_mismatch' + : install.errorKind === 'bundled_version_mismatch' + ? 
'bundled_version_mismatch' + : 'plugin_install_failed' + ) + return fail(install.errorKind, install.message) + } + + const validation = await applyDeps.validateDocument(document) + if (!validation.ok) { + const first = validation.errors[0] + rememberBadEtag(etag, 'validation_failed') + return fail( + 'config_invalid', + first ? `${first.pointer || ''}: ${first.message}` : 'config validation failed' + ) + } + + try { + commit(config, serialized, etag) + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + return fail('apply_io_error', message) + } + + span.setAttribute('apply_action', 'applied') + log.info('config.applied', { + [Attr.COMPONENT]: 'config-control', + [Attr.OPERATION]: 'config.apply', + config_etag: etag, + status: 'ok', + }) + restartPending = true + requestRestart('config_applied') + return /** @type {ConfigStageResult} */ ({ ok: true, action: 'applied' }) + }, + { component: 'config-control' } + ) + } + + /** + * Remember a rejected revision so re-polls don't become an + * apply-fail loop. Pre-flip failures (validation, install) record + * only the bad etag + reason; `last_rollback` is reserved for actual + * reverts of an applied config. + * + * @param {string} etag + * @param {ConfigRollbackReason} reason + */ + function rememberBadEtag(etag, reason) { + const state = readState() + state.bad_etag = { etag, reason, recorded_at: new Date(now()).toISOString() } + writeState(state) + } + + /** + * Persist `serialized` to the inactive slot, write its etag sidecar + * and the probation marker, then flip the pointer as the last step. + * Ordering is the crash-safety argument: everything before the flip + * is invisible to boot; the flip itself is atomic; the marker's + * `slot` field lets `evaluateAtBoot` discard a marker whose flip + * never happened. 
+ * + * @param {HypAwareV2Config} config + * @param {string} serialized + * @param {string} etag + * @ref LLP 0025#apply-semantics-staged-restart [implements] — A/B slots with an atomic pointer; never live-mutate; restart does the activation + */ + function commit(config, serialized, etag) { + fs.mkdirSync(controlDir, { recursive: true, mode: 0o700 }) + const current = activeSlot() + + /** @type {ConfigSlot | null} */ + let previousSlot = current + if (current === null) { + // First apply over a regular file (the seed, or a hand-written + // config): preserve its bytes in slot 'a' so rollback lands back + // on it. Seed-config mode is a legitimate steady state, so this + // is a safe rollback target by construction. + let seedRaw = null + try { + seedRaw = fs.readFileSync(configPath, 'utf8') + } catch (err) { + if (/** @type {NodeJS.ErrnoException} */ (err).code !== 'ENOENT') throw err + } + if (seedRaw !== null) { + fs.writeFileSync(slotPath('a'), seedRaw, { mode: 0o600 }) + fs.rmSync(slotEtagPath('a'), { force: true }) + previousSlot = 'a' + } + } + + /** @type {ConfigSlot} */ + const target = previousSlot === 'b' ? 
'a' : 'b' + fs.writeFileSync(slotPath(target), serialized, { mode: 0o600 }) + fs.writeFileSync(slotEtagPath(target), etag + '\n', { mode: 0o600 }) + + const pollSeconds = pollIntervalFromConfig(config) + const windowSeconds = Math.max(3 * pollSeconds, PROBATION_FLOOR_SECONDS) + const state = readState() + state.probation = { + etag, + applied_at: new Date(now()).toISOString(), + until: new Date(now() + windowSeconds * 1000).toISOString(), + slot: target, + previous_slot: previousSlot, + } + writeState(state) + + flipPointer(target) + } + + /** @returns {Promise} */ + async function status() { + return readConfigControlStatus({ stateRoot, configPath }) + } + + return { + stage, + confirmPoll, + runningEtag, + evaluateAtBoot, + attachApplyDeps(deps) { applyDeps = deps }, + armProbationWatchdog, + disarmProbationWatchdog, + status, + } +} + +/* ---------- shared read-only helpers ---------- */ + +/** + * @param {string} statePath + * @returns {ConfigControlState} + */ +function readControlState(statePath) { + let raw + try { + raw = fs.readFileSync(statePath, 'utf8') + } catch (err) { + if (/** @type {NodeJS.ErrnoException} */ (err).code === 'ENOENT') return {} + throw err + } + const parsed = JSON.parse(raw) + return parsed && typeof parsed === 'object' ? parsed : {} +} + +/** + * Which slot the operative config symlink points at, or null when it + * is a regular file (seed / hand-written config) or missing. 
+ * + * @param {string} controlDir + * @param {string} configPath + * @returns {ConfigSlot | null} + */ +function readActiveSlot(controlDir, configPath) { + let target + try { + target = fs.readlinkSync(configPath) + } catch { + return null + } + const resolved = path.resolve(path.dirname(configPath), target) + if (resolved === path.join(controlDir, 'config.a.json')) return 'a' + if (resolved === path.join(controlDir, 'config.b.json')) return 'b' + return null +} + +/** + * @param {string} controlDir + * @param {string} configPath + * @returns {string | undefined} + */ +function readRunningEtag(controlDir, configPath) { + const slot = readActiveSlot(controlDir, configPath) + if (!slot) return undefined + try { + const etag = fs.readFileSync(path.join(controlDir, `config.${slot}.etag`), 'utf8').trim() + return etag.length > 0 ? etag : undefined + } catch { + return undefined + } +} + +/** + * Read-only view of the apply engine's state for `hypaware status` — + * usable from any process (the CLI is not the daemon), so it never + * constructs the engine or takes its hooks. + * + * @param {{ stateRoot: string, configPath: string }} args + * @returns {ConfigControlStatus} + * @ref LLP 0025#last-known-good-rollback [implements] — operator-visible probation/rollback/bad-etag state without log spelunking + */ +export function readConfigControlStatus({ stateRoot, configPath }) { + const controlDir = path.join(stateRoot, CONTROL_DIRNAME) + /** @type {ConfigControlState} */ + let state = {} + try { + state = readControlState(path.join(controlDir, STATE_BASENAME)) + } catch { + // unreadable state surfaces as empty — status is best-effort + } + return { + probation: state.probation ?? null, + lastRollback: state.last_rollback ?? null, + badEtag: state.bad_etag ?? null, + runningEtag: readRunningEtag(controlDir, configPath) ?? null, + } +} + +/** + * Extract the config pull cadence from the staged document's central + * sink block to size the probation window. 
The window must track the + * *new* config's cadence — that is the sink that will (or won't) + * confirm the poll. Knowing the first-party plugin name here mirrors + * the client-descriptor precedent in `plugin_catalog.js`. + * + * @param {HypAwareV2Config} config + * @returns {number} + */ +function pollIntervalFromConfig(config) { + let min = Infinity + for (const sink of Object.values(config.sinks ?? {})) { + if (!('plugin' in sink) || sink.plugin !== '@hypaware/central') continue + const v = sink.config?.poll_interval_seconds + if (typeof v === 'number' && Number.isFinite(v) && v > 0) { + min = Math.min(min, v) + } else { + min = Math.min(min, DEFAULT_POLL_INTERVAL_SECONDS) + } + } + return Number.isFinite(min) ? min : DEFAULT_POLL_INTERVAL_SECONDS +} diff --git a/src/core/config/apply_deps.js b/src/core/config/apply_deps.js new file mode 100644 index 0000000..6b303f9 --- /dev/null +++ b/src/core/config/apply_deps.js @@ -0,0 +1,144 @@ +// @ts-check + +import { Attr, getLogger } from '../observability/index.js' +import { parseConfigShape } from './schema.js' +import { validateConfig } from './validate.js' +import { buildPluginCatalog } from '../plugin_catalog.js' +import { discoverBundledPlugins } from '../runtime/bundled.js' +import { discoverInstalledPlugins } from '../runtime/installed.js' +import { installPlugin, loadLock } from '../plugin_install/install.js' +import { getEntry } from '../plugin_install/lock.js' + +/** + * @import { PluginConfigInstance, PluginName, ValidationError } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { ConfigApplyDeps, PinnedInstallResult } from './types.d.ts' + */ + +/** + * Build the apply-time dependencies the config apply engine needs: + * full-document validation against the live plugin catalog, and + * hash-pinned plugin installation through the LLP 0007 install path. 
+ * Constructed by the daemon after kernel boot (the catalog needs the + * bundled manifest set) and attached via + * `configControl.attachApplyDeps()`. + * + * @param {{ stateRoot: string, workspaceDir?: string }} opts + * @returns {ConfigApplyDeps} + */ +export function buildConfigApplyDeps(opts) { + const { stateRoot, workspaceDir } = opts + const log = getLogger('config-control') + + /** + * Discover bundled + installed manifests fresh per apply: an apply + * may have just installed a plugin, and a stale catalog would reject + * the very config that named it. + */ + async function discover() { + const bundled = await discoverBundledPlugins( + workspaceDir !== undefined ? { workspaceDir } : {} + ) + const installed = await discoverInstalledPlugins({ stateDir: stateRoot }) + return { bundled, installed } + } + + /** @param {unknown} document */ + async function validateDocument(document) { + const shape = parseConfigShape(document) + if (!shape.ok) { + return { ok: false, errors: shape.errors } + } + const { bundled, installed } = await discover() + const catalog = buildPluginCatalog( + [...bundled.loaded, ...bundled.excluded], + installed.loaded + ) + const result = await validateConfig(shape.config, { + knownPlugins: catalog.pluginMetadata, + knownDatasets: catalog.knownDatasets, + }) + return { ok: result.ok, errors: /** @type {ValidationError[]} */ (result.errors) } + } + + /** + * Install every pinned plugin the staged config names. Bundled + * first-party plugins satisfy the pin by strict version equality and + * skip the hash check (bundled code is inside the kernel's own trust + * boundary); everything else goes through the regular fetch path, + * with the artifact hash verified before the install commits. 
+ * + * @param {PluginConfigInstance[]} entries + * @returns {Promise} + * @ref LLP 0025#install-on-config-hash-pinned [implements] — existing LLP 0007 install path; hash mismatch is an apply failure + */ + async function installPinnedPlugins(entries) { + const { bundled, installed } = await discover() + /** @type {Map} */ + const bundledVersions = new Map() + for (const m of [...bundled.loaded, ...bundled.excluded]) { + bundledVersions.set(m.manifest.name, m.manifest.version) + } + const installedNames = new Set(installed.loaded.map((m) => m.manifest.name)) + const lock = await loadLock(stateRoot) + + for (const entry of entries) { + if (entry.enabled === false) continue + + const bundledVersion = bundledVersions.get(entry.name) + if (bundledVersion !== undefined) { + // @ref LLP 0025#bundled-first-party-plugins [implements] — version checked strictly, artifact hash not checked for bundled plugins + if (entry.version !== undefined && entry.version !== bundledVersion) { + return { + ok: false, + errorKind: 'bundled_version_mismatch', + message: `plugin ${entry.name}: config pins version ${entry.version} but the bundled version is ${bundledVersion}`, + } + } + continue + } + + const locked = getEntry(lock, /** @type {PluginName} */ (entry.name)) + const satisfied = locked + && installedNames.has(entry.name) + && (entry.version === undefined || locked.version === entry.version) + && (entry.artifact_hash === undefined || locked.content_hash === entry.artifact_hash) + if (satisfied) continue + + const result = await installPlugin({ + rawSource: entry.source ?? entry.name, + stateDir: stateRoot, + ...(entry.version !== undefined ? { opts: { ref: `v${entry.version}` } } : {}), + // The hash pin is verified against the staged artifact before + // the install commits — nothing may substitute code after the + // config was authored. 
+ confirm: async (staged) => { + if (entry.artifact_hash !== undefined && staged.contentHash !== entry.artifact_hash) { + log.error('config.pin_hash_mismatch', { + [Attr.COMPONENT]: 'config-control', + [Attr.PLUGIN]: entry.name, + [Attr.ERROR_KIND]: 'artifact_hash_mismatch', + pinned_hash: entry.artifact_hash, + fetched_hash: staged.contentHash, + }) + return { proceed: false, outcome: 'rejected' } + } + if (entry.version !== undefined && staged.manifest.version !== entry.version) { + return { proceed: false, outcome: 'rejected' } + } + return { proceed: true, outcome: 'auto_yes' } + }, + }) + if (!result.ok) { + const hashRejected = result.errorKind === 'remote_install_rejected' + return { + ok: false, + errorKind: hashRejected ? 'artifact_hash_mismatch' : 'plugin_install_failed', + message: `plugin ${entry.name}: ${result.message}`, + } + } + } + return { ok: true } + } + + return { validateDocument, installPinnedPlugins } +} diff --git a/src/core/config/schema.js b/src/core/config/schema.js index 777f561..c63d5b7 100644 --- a/src/core/config/schema.js +++ b/src/core/config/schema.js @@ -343,10 +343,20 @@ function parsePluginEntry(entry, pointer, errors) { if (obj.config !== undefined && !isPlainObject(obj.config)) { errors.push({ pointer: `${pointer}/config`, message: 'config must be an object when present' }) } + // Pin fields set by centrally-served configs (LLP 0025). Optional in + // hand-written configs; the apply engine enforces them when present. 
+ for (const key of /** @type {const} */ (['version', 'artifact_hash', 'source'])) { + if (obj[key] !== undefined && !isNonEmptyString(obj[key])) { + errors.push({ pointer: `${pointer}/${key}`, message: `${key} must be a non-empty string when present` }) + } + } /** @type {PluginConfigInstance} */ const out = { name: obj.name } if (typeof obj.enabled === 'boolean') out.enabled = obj.enabled if (isPlainObject(obj.config)) out.config = /** @type {JsonObject} */ (obj.config) + if (isNonEmptyString(obj.version)) out.version = obj.version + if (isNonEmptyString(obj.artifact_hash)) out.artifact_hash = obj.artifact_hash + if (isNonEmptyString(obj.source)) out.source = obj.source return out } diff --git a/src/core/config/types.d.ts b/src/core/config/types.d.ts index 8f968c8..95ac807 100644 --- a/src/core/config/types.d.ts +++ b/src/core/config/types.d.ts @@ -1,5 +1,9 @@ import type { + ConfigApplyErrorKind, + ConfigControlFacade, + ConfigStageResult, HypAwareV2Config, + PluginConfigInstance, PluginName, CapabilityName, ConfigRegistry, @@ -88,3 +92,116 @@ export interface ValidateResult { pluginCount: number sinkCount: number } + +// ============================================================================= +// Config apply engine (LLP 0025) +// ============================================================================= + +/** Structured rollback reason recorded by the apply engine. */ +export type ConfigRollbackReason = + | 'validation_failed' + | 'plugin_install_failed' + | 'artifact_hash_mismatch' + | 'bundled_version_mismatch' + | 'probation_expired' + +/** A/B slot identifier for persisted config documents. */ +export type ConfigSlot = 'a' | 'b' + +/** + * Probation marker persisted before the staged restart and read back at + * the next boot. `slot` is the slot the apply flipped to; rollback + * flips to `previousSlot` (or back to the pre-apply regular file + * content preserved in that slot). 
/**
 * Probation marker persisted before the staged restart and read back at
 * the next boot. `slot` is the slot the apply flipped to; rollback
 * flips to `previous_slot` (or back to the pre-apply regular file
 * content preserved in that slot).
 */
export interface ProbationMarker {
  /** ETag of the applied revision under probation. */
  etag: string
  /** ISO timestamp of the apply. */
  applied_at: string
  /** ISO time after which an unconfirmed apply rolls back. */
  until: string
  /** Slot the apply wrote and flipped the operative config to. */
  slot: ConfigSlot
  /** Slot that was active before the apply, or `null` when none was. */
  previous_slot: ConfigSlot | null
}

/** Most recent rollback, kept for operator-facing status output. */
export interface ConfigRollbackRecord {
  /** ETag of the revision that was rolled back. */
  etag: string
  reason: ConfigRollbackReason
  /** Timestamp of the rollback. */
  at: string
  /** Optional free-form detail accompanying `reason`. */
  detail?: string
}

/** A served revision remembered as bad, with the reason it was rejected. */
export interface RememberedBadEtag {
  etag: string
  reason: ConfigRollbackReason
  recorded_at: string
}

/**
 * Kernel-managed apply bookkeeping, persisted atomically as one file
 * (`config-control/state.json` under the state root). Every field is
 * optional: an empty object is a valid state.
 */
export interface ConfigControlState {
  probation?: ProbationMarker
  bad_etag?: RememberedBadEtag
  last_rollback?: ConfigRollbackRecord
}

/**
 * Outcome of one `installPinnedPlugins` pass over a config's plugin
 * entries: success, or the first failure with its structured kind.
 */
export type PinnedInstallResult =
  | { ok: true }
  | { ok: false, errorKind: ConfigApplyErrorKind, message: string }

/**
 * Apply-time dependencies the daemon attaches once the kernel has
 * booted (the validator needs the plugin catalog; the installer needs
 * the bundled manifest set). Both are injectable so the engine state
 * machine is testable without HTTP, git, or a real kernel boot.
 */
export interface ConfigApplyDeps {
  /** Full document validation: shape + cross-plugin. */
  validateDocument(document: unknown): Promise<{ ok: boolean, errors: ValidationError[] }>
  /** Install every pinned plugin the config names; verify pins. */
  installPinnedPlugins(entries: PluginConfigInstance[]): Promise<PinnedInstallResult>
}

/**
 * Public status surface for `hypaware status`. Absent values are
 * reported as `null` rather than omitted.
 */
export interface ConfigControlStatus {
  probation: ProbationMarker | null
  lastRollback: ConfigRollbackRecord | null
  badEtag: RememberedBadEtag | null
  runningEtag: string | null
}
+ */ +export interface ConfigControl extends ConfigControlFacade { + /** + * Evaluate probation state before plugin activation: discard + * orphaned markers (apply never committed), roll back expired ones + * (flips the operative config in place; no restart needed since the + * kernel has not loaded it yet). + */ + evaluateAtBoot(): Promise<{ action: 'none' | 'cleared_orphan' | 'rolled_back' }> + /** Attach post-boot apply dependencies; `stage()` fails before this. */ + attachApplyDeps(deps: ConfigApplyDeps): void + /** Arm the in-process probation watchdog timer when a marker is active. */ + armProbationWatchdog(): void + /** Cancel the watchdog timer (daemon shutdown). */ + disarmProbationWatchdog(): void + status(): Promise +} + +export interface CreateConfigControlOptions { + /** Kernel state root (`/hypaware`). */ + stateRoot: string + /** Operative config path the daemon booted with. */ + configPath: string + /** Staged restart hook; the daemon exits with the restart code. */ + requestRestart(reason: string): void + now?: () => number +} + +export type { ConfigStageResult, ConfigApplyErrorKind } diff --git a/src/core/daemon/runtime.js b/src/core/daemon/runtime.js index cee9936..d2dc82f 100644 --- a/src/core/daemon/runtime.js +++ b/src/core/daemon/runtime.js @@ -12,7 +12,9 @@ import { } from '../observability/index.js' import { readObservabilityEnv } from '../observability/env.js' import { loadConfigFile } from '../config/schema.js' -import { bootKernel } from '../runtime/boot.js' +import { createConfigControl } from '../config/apply.js' +import { buildConfigApplyDeps } from '../config/apply_deps.js' +import { bootKernel, resolveConfigPath } from '../runtime/boot.js' import { createSinkDriver } from '../sinks/driver.js' import { materializeSinks } from '../sinks/materialize.js' import { @@ -45,6 +47,16 @@ import { statusFilePath, writeStatusFile } from './status.js' const DEFAULT_TICK_INTERVAL_MS = 60_000 const MIN_TICK_INTERVAL_MS = 25 +/** + * Exit code a 
foreground daemon uses to request its own relaunch after + * a staged config apply or rollback (EX_TEMPFAIL — "try again"). The + * service managers relaunch on any exit (`KeepAlive` / + * `Restart=always`); foreground invokers (smoke harness, dev shells) + * loop on this specific code. + * @ref LLP 0017#staged-restart-for-config-replacement [implements] — a foreground daemon cannot relaunch itself; the invoker loops on this code + */ +export const DAEMON_RESTART_EXIT_CODE = 75 + /** * Boot the kernel, start every configured source, and run sink ticks * on a fixed cadence. Returns a `DaemonHandle` the caller can use to @@ -114,7 +126,7 @@ export async function runDaemon(opts = {}) { const sinkSnapshots = new Map() /** @type {NodeJS.Timeout | null} */ let tickHandle = null - /** @type {((reason: 'signal'|'manual') => Promise) | null} */ + /** @type {((reason: 'signal'|'manual'|'restart') => Promise) | null} */ let triggerShutdown = null let shutdownInFlight = false /** @type {((value: number) => void) | null} */ @@ -138,6 +150,37 @@ export async function runDaemon(opts = {}) { writeStatusFile(stateRoot, status) fileLog.info('daemon.starting', { config_path: opts.configPath ?? null }) + // ----- Config apply engine (LLP 0025) ----- + // Created before bootKernel so probation expiry is evaluated before + // any plugin activates: a kernel-killing-but-valid config that + // crashloops under the service manager may never live long enough + // for an in-process timer to fire. + const operativeConfigPath = resolveConfigPath({ + ...(opts.configPath !== undefined ? { explicit: opts.configPath } : {}), + env, + hypHome, + }) + // An apply can land while the daemon is still wiring up (the pull + // loop's immediate pull races the tail of runDaemon), so a restart + // request before triggerShutdown exists is parked, not dropped. 
+ let pendingRestart = false + const configControl = createConfigControl({ + stateRoot, + configPath: operativeConfigPath, + requestRestart: (reason) => { + fileLog.info('daemon.restart_requested', { hyp_reason: reason }) + if (triggerShutdown) { + void triggerShutdown('restart') + } else { + pendingRestart = true + } + }, + }) + const bootEval = await configControl.evaluateAtBoot() + if (bootEval.action !== 'none') { + fileLog.warn('daemon.config_probation_boot_action', { action: bootEval.action }) + } + /** * Persist the status snapshot to disk and update the gauge. * @param {Partial} [patch] @@ -173,6 +216,7 @@ export async function runDaemon(opts = {}) { mode: 'daemon', runId, env, + configControl, }) /** @type {Map} */ const sourcePluginByName = new Map() @@ -213,6 +257,13 @@ export async function runDaemon(opts = {}) { status.healthyAt = new Date(healthyAtMs).toISOString() } + // Attach apply-time deps before any sink materializes: the central + // sink's pull loop may deliver a document immediately after its + // bootstrap, and `stage()` refuses to run without a validator. The + // watchdog re-arms here on every relaunch that boots mid-probation. 
+ configControl.attachApplyDeps(buildConfigApplyDeps({ stateRoot })) + configControl.armProbationWatchdog() + // ----- Materialize config-backed sinks ----- const sinkResult = await materializeSinks(boot.runtime, boot.config, { stateRoot, @@ -328,10 +379,11 @@ export async function runDaemon(opts = {}) { } // ----- Shutdown ----- - /** @param {'signal'|'manual'} reason */ + /** @param {'signal'|'manual'|'restart'} reason */ async function shutdown(reason) { if (shutdownInFlight) return done shutdownInFlight = true + configControl.disarmProbationWatchdog() if (tickHandle) { clearInterval(tickHandle) tickHandle = null @@ -366,6 +418,7 @@ export async function runDaemon(opts = {}) { for (const snap of status.sources) { snap.state = 'stopped' } + await closeAllSinks({ runtime: boot.runtime, fileLog }) }, { component: 'daemon' } ).catch((err) => { @@ -382,7 +435,8 @@ export async function runDaemon(opts = {}) { if (installSignals) { removeSignalHandlers() } - resolveDone?.(0) + // @ref LLP 0017#staged-restart-for-config-replacement [implements] — the daemon exits and the service manager (or looping invoker) relaunches it + resolveDone?.(reason === 'restart' ? DAEMON_RESTART_EXIT_CODE : 0) return done } triggerShutdown = shutdown @@ -458,6 +512,10 @@ export async function runDaemon(opts = {}) { process.on('SIGHUP', sigHupHandler) } + if (pendingRestart) { + void shutdown('restart') + } + return { done, stop: () => shutdown('manual'), @@ -549,6 +607,23 @@ async function startConfiguredSources({ runtime, log, fileLog, sourcePluginByNam return snapshots } +/** + * Close every materialized sink instance. The central plugin's config + * pull loop stops in its `close()` (identity refresh is lazy and has + * no timer), so shutdown must reach it even though sinks have no + * started/stopped lifecycle of their own. 
+ * + * @param {{ runtime: KernelRuntime, fileLog: ReturnType }} args + */ +async function closeAllSinks({ runtime, fileLog }) { + try { + await runtime.sinks.closeAll() + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + fileLog.error('daemon.sink_close_failed', { message }) + } +} + /** * Stop every started source. Returns the list of names that failed * so the daemon can surface them as warnings on the final status diff --git a/src/core/daemon/status.js b/src/core/daemon/status.js index 8da3a40..b9a8267 100644 --- a/src/core/daemon/status.js +++ b/src/core/daemon/status.js @@ -6,6 +6,7 @@ import path from 'node:path' import process from 'node:process' import { defaultConfigPath, loadConfigFile } from '../config/schema.js' +import { readConfigControlStatus } from '../config/apply.js' import { devTelemetryDir, readObservabilityEnv } from '../observability/env.js' import { diagnoseV1Config, validateConfig } from '../config/validate.js' import { discoverInstalledPlugins } from '../runtime/installed.js' @@ -32,7 +33,7 @@ import { /** * @import { HypAwareV2Config } from '../../../collectivus-plugin-kernel-types.d.ts' - * @import { ConfigValidationError, V1Diagnostic } from '../config/types.d.ts' + * @import { ConfigControlStatus, ConfigValidationError, V1Diagnostic } from '../config/types.d.ts' * @import { ClientAttachReport, CollectStatusOptions, DaemonState, DaemonStatus, HypAwareStatusReport, ServiceState, SinkSnapshot, SourceSnapshot, StatusDiagnostic, StatusDiagnosticKind } from './types.d.ts' * @import { Dirent } from 'node:fs' * @import { PluginCatalog, ClientDescriptor } from '../plugin_catalog.js' @@ -404,6 +405,21 @@ export async function collectHypAwareStatus(opts = {}) { const cacheRoot = opts.runtime?.storage?.cacheRoot ?? 
path.join(stateRoot, 'cache') const cache = await measureCacheStats(cacheRoot) + // ----- remote config apply state (LLP 0025) ----- + /** @type {ConfigControlStatus | null} */ + let remoteConfig = null + try { + remoteConfig = readConfigControlStatus({ stateRoot, configPath }) + } catch { /* best-effort probe */ } + if (remoteConfig?.lastRollback) { + diagnostics.push({ + severity: 'warning', + kind: 'remote_config_rolled_back', + message: `remote config ${remoteConfig.lastRollback.etag} rolled back at ${remoteConfig.lastRollback.at} (${remoteConfig.lastRollback.reason})`, + repair: ['fix the central config revision; the gateway re-applies when the served etag changes'], + }) + } + // ----- recent errors ----- const recentErrorCount = await countRecentErrors(devTelemetryDir(stateRoot)) if (recentErrorCount > 0) { @@ -441,6 +457,7 @@ export async function collectHypAwareStatus(opts = {}) { recentErrorCount, diagnostics, overall, + remoteConfig, } } diff --git a/src/core/daemon/types.d.ts b/src/core/daemon/types.d.ts index 8320084..77237cd 100644 --- a/src/core/daemon/types.d.ts +++ b/src/core/daemon/types.d.ts @@ -3,7 +3,7 @@ import type { CapabilityRegistry, QueryRegistry, } from '../../../collectivus-plugin-kernel-types.d.ts' -import type { V1Diagnostic, ConfigValidationError } from '../config/types.d.ts' +import type { ConfigControlStatus, V1Diagnostic, ConfigValidationError } from '../config/types.d.ts' import type { ExtendedSourceRegistry } from '../registry/sources.js' import type { ExtendedSinkRegistry } from '../registry/sinks.js' import type { KernelRuntime } from '../runtime/activation.js' @@ -71,6 +71,7 @@ export type StatusDiagnosticKind = | 'daemon_loaded_no_pid' | 'client_attach_missing' | 'recent_errors' + | 'remote_config_rolled_back' /** * Diagnostic surfaced by `hyp status`. 
Carries a severity, the @@ -139,6 +140,13 @@ export interface HypAwareStatusReport { recentErrorCount: number diagnostics: StatusDiagnostic[] overall: 'healthy' | 'degraded' + /** + * Remote-config apply state (LLP 0025): probation, last rollback + + * structured reason, remembered bad etag, and the running config's + * etag. Null only when the probe itself failed; a gateway that has + * never applied a remote config reports all-null fields. + */ + remoteConfig: ConfigControlStatus | null } export interface CollectStatusOptions { diff --git a/src/core/runtime/activation.d.ts b/src/core/runtime/activation.d.ts index 9575670..c5ffea6 100644 --- a/src/core/runtime/activation.d.ts +++ b/src/core/runtime/activation.d.ts @@ -4,6 +4,7 @@ import type { BackfillMaterializerRegistry, BackfillRegistry, CommandRegistry, + ConfigControlFacade, ConfigRegistry, InitPresetRegistry, JsonObject, @@ -43,6 +44,12 @@ export interface KernelRuntime { backfills: BackfillRegistry backfillMaterializers: BackfillMaterializerRegistry activationContexts: Map + /** + * Plugin-facing facade of the daemon's config apply engine. Set only + * when the host process runs one (daemon mode); CLI boots leave it + * undefined so transport plugins skip their pull loops. 
+ */ + configControl?: ConfigControlFacade } export interface CreateKernelRuntimeArgs { @@ -55,6 +62,7 @@ export interface CreateKernelRuntimeArgs { backfillMaterializerRegistry?: BackfillMaterializerRegistry storage?: ExtendedQueryStorageService cacheRoot?: string + configControl?: ConfigControlFacade } export interface CreateActivationContextArgs { diff --git a/src/core/runtime/activation.js b/src/core/runtime/activation.js index 4d3ae16..5379e62 100644 --- a/src/core/runtime/activation.js +++ b/src/core/runtime/activation.js @@ -15,7 +15,7 @@ import { createQueryStorageService } from '../cache/storage.js' import { isSafeContributionName } from './contribution_names.js' /** - * @import { ActivePlugin, AgentContribution, AgentRegistry, BackfillMaterializerRegistry, BackfillRegistry, CapabilityName, CapabilityRegistry, CommandRegistry, ConfigRegistry, InitPresetContribution, InitPresetRegistry, JsonObject, PermissionContext, PluginActivationContext, PluginLogger, PluginManifest, PluginName, PluginPaths, PluginPermission, QueryRegistry, SemverRange, SemverVersion, SinkRegistry, SkillContribution, SkillRegistry, SourceRegistry } from '../../../collectivus-plugin-kernel-types.d.ts' + * @import { ActivePlugin, AgentContribution, AgentRegistry, BackfillMaterializerRegistry, BackfillRegistry, CapabilityName, CapabilityRegistry, CommandRegistry, ConfigControlFacade, ConfigRegistry, InitPresetContribution, InitPresetRegistry, JsonObject, PermissionContext, PluginActivationContext, PluginLogger, PluginManifest, PluginName, PluginPaths, PluginPermission, QueryRegistry, SemverRange, SemverVersion, SinkRegistry, SkillContribution, SkillRegistry, SourceRegistry } from '../../../collectivus-plugin-kernel-types.d.ts' * @import { ExtendedQueryStorageService } from '../cache/types.d.ts' * @import { KernelRuntime } from './activation.d.ts' */ @@ -41,6 +41,7 @@ import { isSafeContributionName } from './contribution_names.js' * backfillMaterializerRegistry?: 
BackfillMaterializerRegistry, * storage?: ExtendedQueryStorageService, * cacheRoot?: string, + * configControl?: ConfigControlFacade, * }} [opts] * @returns {KernelRuntime} * @ref LLP 0003#intrinsic-not-plugin-provided [implements] — query + storage are wired in as intrinsic services, not plugin contributions @@ -53,6 +54,7 @@ export function createKernelRuntime(opts = {}) { getDeclaration: (dataset) => query.getDataset(dataset)?.cachePartitioning, }) return { + ...(opts.configControl ? { configControl: opts.configControl } : {}), capabilities: opts.capabilityRegistry ?? createCapabilityRegistry(), commands: opts.commandRegistry ?? createCommandRegistry(), configRegistry: createConfigRegistry(), @@ -121,6 +123,8 @@ export function createActivationContext({ runtime, plugin, paths, config, env }) initPresets: runtime.initPresets, backfills: runtime.backfills, backfillMaterializers: runtime.backfillMaterializers, + // @ref LLP 0025#apply-engine-is-kernel-surface [implements] — plugins reach the apply engine only through this narrow facade; absent outside the daemon + ...(runtime.configControl ? { configControl: runtime.configControl } : {}), /** * @template T * @param {CapabilityName} name diff --git a/src/core/runtime/boot.js b/src/core/runtime/boot.js index e23d580..aee6524 100644 --- a/src/core/runtime/boot.js +++ b/src/core/runtime/boot.js @@ -77,7 +77,11 @@ export async function bootKernel(opts = {}) { }, async (span) => { const commandRegistry = opts.commandRegistry ?? createCommandRegistry() - const runtime = createKernelRuntime({ commandRegistry, cacheRoot }) + const runtime = createKernelRuntime({ + commandRegistry, + cacheRoot, + ...(opts.configControl ? { configControl: opts.configControl } : {}), + }) const discovered = await discoverBundledPlugins({ workspaceDir: opts.workspaceDir }) span.setAttribute('bundled_available', discovered.loaded.length) @@ -250,10 +254,13 @@ export async function bootKernel(opts = {}) { * 2. `env.HYP_CONFIG` * 3. 
`/hypaware-config.json` * + * Exported so the daemon can resolve the same operative path for the + * config apply engine before `bootKernel` runs. + * * @param {{ explicit?: string, env: NodeJS.ProcessEnv, hypHome: string }} args * @returns {string} */ -function resolveConfigPath({ explicit, env, hypHome }) { +export function resolveConfigPath({ explicit, env, hypHome }) { if (explicit) return path.resolve(explicit) if (env.HYP_CONFIG) return path.resolve(env.HYP_CONFIG) return defaultConfigPath(hypHome) diff --git a/src/core/runtime/types.d.ts b/src/core/runtime/types.d.ts index 3936e42..01f960a 100644 --- a/src/core/runtime/types.d.ts +++ b/src/core/runtime/types.d.ts @@ -1,5 +1,6 @@ import type { ActivePlugin, + ConfigControlFacade, HypAwareV2Config, PluginLockEntry, PluginName, @@ -47,6 +48,8 @@ export interface BootKernelOptions { env?: NodeJS.ProcessEnv /** Override OS temp root (tests). */ tmpRoot?: string + /** Apply-engine facade to expose on activation contexts (daemon only). */ + configControl?: ConfigControlFacade } export interface BootKernelResult { diff --git a/test/core/config-apply-deps.test.js b/test/core/config-apply-deps.test.js new file mode 100644 index 0000000..ef22f54 --- /dev/null +++ b/test/core/config-apply-deps.test.js @@ -0,0 +1,293 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import { spawn } from 'node:child_process' +import fs from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' + +import { buildConfigApplyDeps } from '../../src/core/config/apply_deps.js' +import { loadLock } from '../../src/core/plugin_install/install.js' +import { getEntry, writeLock } from '../../src/core/plugin_install/lock.js' + +/** + * @import { PluginName } from '../../collectivus-plugin-kernel-types.d.ts' + */ + +/** + * Pin enforcement is the apply path's core security property — nothing + * may substitute code after the config was authored (LLP 0025 + * install-on-config). 
The apply-engine tests mock these deps away, so + * the real decisions are exercised here against real fixtures: a + * fixture bundled workspace, a lock-backed installed plugin, and a + * local git repo standing in for a served artifact. + */ + +const HASH_A = 'a'.repeat(64) + +/** @param {string} dir @param {string} name @param {string} version */ +async function writePluginDir(dir, name, version) { + await fs.mkdir(dir, { recursive: true }) + await fs.writeFile( + path.join(dir, 'hypaware.plugin.json'), + JSON.stringify({ + schema_version: 1, + name, + version, + hypaware_api: '^1.0.0', + runtime: 'node', + entrypoint: './index.js', + }) + ) + await fs.writeFile(path.join(dir, 'index.js'), 'export async function activate(){}\n') +} + +/** + * A temp HYP state root plus a fixture bundled workspace holding a + * fake `@hypaware/otel` at a controlled version, so the bundled-pin + * checks don't depend on the real workspace's version numbers. + */ +async function makeFixture() { + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'hyp-apply-deps-')) + const stateRoot = path.join(tmpRoot, 'state') + const workspaceDir = path.join(tmpRoot, 'workspace') + await writePluginDir(path.join(workspaceDir, 'otel'), '@hypaware/otel', '9.9.9') + return { + tmpRoot, + stateRoot, + workspaceDir, + cleanup: () => fs.rm(tmpRoot, { recursive: true, force: true }), + } +} + +test('bundled plugin: pinned version mismatch is a bundled_version_mismatch failure', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const result = await deps.installPinnedPlugins([ + { name: '@hypaware/otel', version: '1.0.0' }, + ]) + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'bundled_version_mismatch') + assert.ok(!result.ok && /1\.0\.0/.test(result.message) && /9\.9\.9/.test(result.message)) + } finally { + await fx.cleanup() + } +}) + +test('bundled plugin: 
matching version pin is satisfied without an install; hash is not checked', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + // The artifact_hash refers to a git release artifact that + // legitimately differs from the npm-bundled tree — a garbage hash + // must not fail a bundled pin (LLP 0025 bundled-first-party). + const result = await deps.installPinnedPlugins([ + { name: '@hypaware/otel', version: '9.9.9', artifact_hash: 'f'.repeat(64) }, + ]) + assert.deepEqual(result, { ok: true }) + const lock = await loadLock(fx.stateRoot) + assert.equal(getEntry(lock, /** @type {PluginName} */ ('@hypaware/otel')), undefined) + } finally { + await fx.cleanup() + } +}) + +test('bundled plugin: an unpinned entry is satisfied by any bundled version', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const result = await deps.installPinnedPlugins([{ name: '@hypaware/otel' }]) + assert.deepEqual(result, { ok: true }) + } finally { + await fx.cleanup() + } +}) + +test('disabled entries are skipped entirely', async () => { + const fx = await makeFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + // The unreachable source proves no install was attempted. 
+ const result = await deps.installPinnedPlugins([ + { name: '@third-party/off', enabled: false, source: `file://${fx.tmpRoot}/nonexistent` }, + ]) + assert.deepEqual(result, { ok: true }) + } finally { + await fx.cleanup() + } +}) + +test('an installed lock entry matching version + hash is satisfied without re-install', async () => { + const fx = await makeFixture() + try { + const installDir = path.join(fx.tmpRoot, 'installed-fixture') + await writePluginDir(installDir, '@third-party/installed-fixture', '1.0.0') + await writeLock(fx.stateRoot, { + schema_version: 1, + plugins: { + '@third-party/installed-fixture': { + name: '@third-party/installed-fixture', + version: '1.0.0', + source: { kind: 'local-dir', raw: installDir, path: installDir }, + install_dir: installDir, + content_hash: HASH_A, + manifest_hash: 'b'.repeat(64), + installed_at: '2026-06-12T00:00:00.000Z', + }, + }, + }) + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + // The unreachable source proves the satisfied entry never hits the + // install path. + const result = await deps.installPinnedPlugins([ + { + name: '@third-party/installed-fixture', + version: '1.0.0', + artifact_hash: HASH_A, + source: `file://${fx.tmpRoot}/nonexistent`, + }, + ]) + assert.deepEqual(result, { ok: true }) + + // A different pinned hash is NOT satisfied: the install path runs + // (and fails here on the unreachable source). 
+ const mismatched = await deps.installPinnedPlugins([ + { + name: '@third-party/installed-fixture', + version: '1.0.0', + artifact_hash: 'c'.repeat(64), + source: `file://${fx.tmpRoot}/nonexistent`, + }, + ]) + assert.equal(mismatched.ok, false) + assert.equal(!mismatched.ok && mismatched.errorKind, 'plugin_install_failed') + } finally { + await fx.cleanup() + } +}) + +test('a fetched artifact failing its hash pin is an artifact_hash_mismatch and nothing is installed', async () => { + const fx = await makeFixture() + const git = await buildGitPluginFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const result = await deps.installPinnedPlugins([ + { + name: '@third-party/pin-fixture', + source: git.sourceUrl, + version: '0.1.0', + artifact_hash: 'f'.repeat(64), + }, + ]) + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'artifact_hash_mismatch') + const lock = await loadLock(fx.stateRoot) + assert.equal(getEntry(lock, /** @type {PluginName} */ ('@third-party/pin-fixture')), undefined) + } finally { + await git.cleanup() + await fx.cleanup() + } +}) + +test('a correct hash pin installs, and validation then sees the plugin it could not know before', async () => { + // The install-before-validate ordering only works because a fresh + // catalog is discovered per call — this is the integration check for + // a served config naming a not-yet-installed plugin. 
+ const fx = await makeFixture() + const git = await buildGitPluginFixture() + try { + const deps = buildConfigApplyDeps({ stateRoot: fx.stateRoot, workspaceDir: fx.workspaceDir }) + const document = { version: 2, plugins: [{ name: '@third-party/pin-fixture' }] } + + const before = await deps.validateDocument(document) + assert.equal(before.ok, false) + assert.ok(before.errors.some((e) => /pin-fixture/.test(e.message))) + + // Learn the artifact hash by installing unpinned once, then prove a + // config pinning that exact hash is accepted from a clean state. + const unpinned = await deps.installPinnedPlugins([ + { name: '@third-party/pin-fixture', source: git.sourceUrl, version: '0.1.0' }, + ]) + assert.deepEqual(unpinned, { ok: true }) + const lock = await loadLock(fx.stateRoot) + const entry = getEntry(lock, /** @type {PluginName} */ ('@third-party/pin-fixture')) + assert.ok(entry) + + const fresh = await makeFixture() + try { + const freshDeps = buildConfigApplyDeps({ + stateRoot: fresh.stateRoot, + workspaceDir: fresh.workspaceDir, + }) + const pinned = await freshDeps.installPinnedPlugins([ + { + name: '@third-party/pin-fixture', + source: git.sourceUrl, + version: '0.1.0', + artifact_hash: entry?.content_hash, + }, + ]) + assert.deepEqual(pinned, { ok: true }) + + const after = await freshDeps.validateDocument(document) + assert.equal(after.ok, true, JSON.stringify(after.errors)) + } finally { + await fresh.cleanup() + } + } finally { + await git.cleanup() + await fx.cleanup() + } +}) + +/* ---------- git fixture ---------- */ + +/** + * A bare local git repo serving a plugin tagged `v0.1.0`, standing in + * for a served config's pinned artifact source. 
+ */ +async function buildGitPluginFixture() { + const tmpRoot = await fs.mkdtemp(path.join(os.tmpdir(), 'hyp-apply-deps-git-')) + const workdir = path.join(tmpRoot, 'work') + await writePluginDir(workdir, '@third-party/pin-fixture', '0.1.0') + await runGit(['init', '-q', '-b', 'main'], { cwd: workdir }) + await runGit(['config', 'user.email', 'unit@hypaware.test'], { cwd: workdir }) + await runGit(['config', 'user.name', 'HypAware Test'], { cwd: workdir }) + await runGit(['add', '.'], { cwd: workdir }) + await runGit(['commit', '--quiet', '--no-gpg-sign', '-m', 'initial'], { cwd: workdir }) + await runGit(['tag', 'v0.1.0'], { cwd: workdir }) + + const bareRepoDir = path.join(tmpRoot, 'bare.git') + await runGit(['init', '--bare', '-q', '-b', 'main', bareRepoDir]) + await runGit(['remote', 'add', 'origin', bareRepoDir], { cwd: workdir }) + await runGit(['push', '--quiet', 'origin', 'main', '--tags'], { cwd: workdir }) + + return { + sourceUrl: `file://${bareRepoDir}`, + cleanup: () => fs.rm(tmpRoot, { recursive: true, force: true }), + } +} + +/** + * @param {string[]} args + * @param {{ cwd?: string }} [opts] + * @returns {Promise} + */ +function runGit(args, opts = {}) { + return new Promise((resolve, reject) => { + const child = spawn('git', args, { + cwd: opts.cwd, + env: { ...process.env, GIT_TERMINAL_PROMPT: '0' }, + stdio: ['ignore', 'ignore', 'pipe'], + }) + /** @type {Buffer[]} */ + const stderrChunks = [] + child.stderr.on('data', (chunk) => stderrChunks.push(chunk)) + child.on('close', (code) => { + if (code === 0) resolve() + else reject(new Error(`git ${args.join(' ')} exited ${code}: ${Buffer.concat(stderrChunks)}`)) + }) + }) +} diff --git a/test/core/config-apply.test.js b/test/core/config-apply.test.js new file mode 100644 index 0000000..1c6db5f --- /dev/null +++ b/test/core/config-apply.test.js @@ -0,0 +1,454 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs' +import fsp from 
'node:fs/promises' +import os from 'node:os' +import path from 'node:path' + +import { + DEFAULT_POLL_INTERVAL_SECONDS, + MAX_CONFIG_DOCUMENT_BYTES, + PROBATION_FLOOR_SECONDS, + createConfigControl, + readConfigControlStatus, +} from '../../src/core/config/apply.js' +import { parseConfigShape } from '../../src/core/config/schema.js' + +/** + * @import { PluginConfigInstance } from '../../collectivus-plugin-kernel-types.d.ts' + * @import { ConfigApplyDeps, PinnedInstallResult } from '../../src/core/config/types.d.ts' + */ + +const SEED_CONFIG = { + version: 2, + plugins: [{ name: '@hypaware/central' }], + sinks: { + central: { + plugin: '@hypaware/central', + config: { url: 'https://central.example', identity: { bootstrap_token: 'tok' } }, + }, + }, +} + +const REMOTE_CONFIG = { + version: 2, + plugins: [{ name: '@hypaware/central' }, { name: '@hypaware/otel' }], + sinks: { + central: { + plugin: '@hypaware/central', + config: { url: 'https://central.example', identity: {} }, + }, + }, +} + +async function makeFixture() { + const tmp = await fsp.mkdtemp(path.join(os.tmpdir(), 'hyp-config-apply-')) + const stateRoot = path.join(tmp, 'hypaware') + await fsp.mkdir(stateRoot, { recursive: true }) + const configPath = path.join(tmp, 'hypaware-config.json') + await fsp.writeFile(configPath, JSON.stringify(SEED_CONFIG, null, 2) + '\n') + return { tmp, stateRoot, configPath } +} + +/** + * @param {{ validateOk?: boolean, installResult?: PinnedInstallResult }} [opts] + * @returns {ConfigApplyDeps & { validateCalls: number, installCalls: number, calls: string[] }} + */ +function makeDeps(opts = {}) { + const deps = { + validateCalls: 0, + installCalls: 0, + /** @type {string[]} */ + calls: [], + /** @param {unknown} _document */ + async validateDocument(_document) { + deps.validateCalls += 1 + deps.calls.push('validate') + return opts.validateOk === false + ? 
{ ok: false, errors: [{ pointer: '/plugins/0', message: 'nope' }] } + : { ok: true, errors: [] } + }, + /** @param {PluginConfigInstance[]} _entries */ + async installPinnedPlugins(_entries) { + deps.installCalls += 1 + deps.calls.push('install') + return opts.installResult ?? { ok: true } + }, + } + return deps +} + +/** + * @param {{ stateRoot: string, configPath: string, now?: () => number }} args + */ +function makeControl({ stateRoot, configPath, now }) { + /** @type {string[]} */ + const restarts = [] + const control = createConfigControl({ + stateRoot, + configPath, + requestRestart: (reason) => { restarts.push(reason) }, + ...(now ? { now } : {}), + }) + return { control, restarts } +} + +test('stage applies a document: slot persisted, pointer flipped, etag staged, probation armed, restart requested', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + + const result = await control.stage(REMOTE_CONFIG, 'etag-1') + assert.deepEqual(result, { ok: true, action: 'applied' }) + assert.deepEqual(restarts, ['config_applied']) + + // Operative config is now a symlink whose content is the new doc. + const stat = await fsp.lstat(configPath) + assert.ok(stat.isSymbolicLink()) + const operative = JSON.parse(await fsp.readFile(configPath, 'utf8')) + assert.deepEqual(operative.plugins, REMOTE_CONFIG.plugins) + + // The seed was preserved as the rollback target. 
+ const slotA = JSON.parse( + await fsp.readFile(path.join(stateRoot, 'config-control', 'config.a.json'), 'utf8') + ) + assert.deepEqual(slotA, SEED_CONFIG) + + assert.equal(control.runningEtag(), 'etag-1') + const status = await control.status() + assert.equal(status.probation?.etag, 'etag-1') + assert.equal(status.probation?.slot, 'b') + assert.equal(status.probation?.previous_slot, 'a') +}) + +test('probation window is max(3 × poll interval, floor) from the staged document', async () => { + const { stateRoot, configPath } = await makeFixture() + const t0 = Date.parse('2026-06-12T00:00:00.000Z') + const { control } = makeControl({ stateRoot, configPath, now: () => t0 }) + control.attachApplyDeps(makeDeps()) + + // No poll_interval_seconds in the doc → default cadence. + await control.stage(REMOTE_CONFIG, 'etag-1') + let status = await control.status() + assert.equal( + Date.parse(/** @type {string} */ (status.probation?.until)) - t0, + 3 * DEFAULT_POLL_INTERVAL_SECONDS * 1000 + ) + + // A fast cadence is floored. Fresh engine: the first stage left a + // restart pending in the old one. 
+ const relaunch = makeControl({ stateRoot, configPath, now: () => t0 }) + relaunch.control.attachApplyDeps(makeDeps()) + relaunch.control.confirmPoll() + const fastDoc = { + ...REMOTE_CONFIG, + sinks: { + central: { + plugin: '@hypaware/central', + config: { url: 'https://central.example', identity: {}, poll_interval_seconds: 5 }, + }, + }, + } + await relaunch.control.stage(fastDoc, 'etag-2') + status = await relaunch.control.status() + assert.equal( + Date.parse(/** @type {string} */ (status.probation?.until)) - t0, + PROBATION_FLOOR_SECONDS * 1000 + ) +}) + +test('stage before attachApplyDeps fails closed', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + const result = await control.stage(REMOTE_CONFIG, 'etag-1') + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'apply_engine_not_ready') + assert.deepEqual(restarts, []) +}) + +test('validation failure remembers the bad etag and leaves the config untouched', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps({ validateOk: false })) + + const result = await control.stage(REMOTE_CONFIG, 'etag-bad') + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'config_invalid') + assert.deepEqual(restarts, []) + + // Still the seed, still a regular file. 
+ const stat = await fsp.lstat(configPath) + assert.ok(!stat.isSymbolicLink()) + const status = await control.status() + assert.equal(status.badEtag?.etag, 'etag-bad') + assert.equal(status.badEtag?.reason, 'validation_failed') + assert.equal(status.runningEtag, null) +}) + +test('pinned plugins install before full validation, so a config can name a not-yet-installed plugin', async () => { + // Catalog-backed validation only knows a plugin once it is installed; + // install-on-config breaks if validation runs first (LLP 0025 + // install-on-config). The shape gate runs before install instead. + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps() + control.attachApplyDeps(deps) + + const result = await control.stage(REMOTE_CONFIG, 'etag-order') + assert.equal(result.ok, true) + assert.deepEqual(deps.calls, ['install', 'validate']) +}) + +test('a shape-invalid document is rejected before any install runs', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps() + control.attachApplyDeps(deps) + + const result = await control.stage({ version: 1 }, 'etag-shape') + assert.equal(result.ok, false) + assert.equal(!result.ok && result.errorKind, 'config_invalid') + assert.equal(deps.installCalls, 0) + const status = await control.status() + assert.equal(status.badEtag?.reason, 'validation_failed') +}) + +test('a remembered bad etag backs off re-apply until the etag changes', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps({ validateOk: false }) + control.attachApplyDeps(deps) + + await control.stage(REMOTE_CONFIG, 'etag-bad') + assert.equal(deps.validateCalls, 1) + + // Same etag again: skipped without re-validating. 
+ const skipped = await control.stage(REMOTE_CONFIG, 'etag-bad') + assert.deepEqual(skipped, { ok: true, action: 'skipped_bad_etag' }) + assert.equal(deps.validateCalls, 1) + + // A different etag proceeds (and fails validation again here). + const retried = await control.stage(REMOTE_CONFIG, 'etag-fixed') + assert.equal(retried.ok, false) + assert.equal(deps.validateCalls, 2) +}) + +test('pinned install hash mismatch is an apply failure with a structured reason', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps({ + installResult: { ok: false, errorKind: 'artifact_hash_mismatch', message: 'hash differs' }, + })) + + const result = await control.stage(REMOTE_CONFIG, 'etag-hash') + assert.equal(!result.ok && result.errorKind, 'artifact_hash_mismatch') + assert.deepEqual(restarts, []) + const status = await control.status() + assert.equal(status.badEtag?.reason, 'artifact_hash_mismatch') +}) + +test('oversized documents are rejected before validation', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + const deps = makeDeps() + control.attachApplyDeps(deps) + + const huge = { ...REMOTE_CONFIG, padding: 'x'.repeat(MAX_CONFIG_DOCUMENT_BYTES) } + const result = await control.stage(huge, 'etag-huge') + assert.equal(!result.ok && result.errorKind, 'document_too_large') + assert.equal(deps.validateCalls, 0) +}) + +test('staging the running etag is a no-op', async () => { + const { stateRoot, configPath } = await makeFixture() + const first = makeControl({ stateRoot, configPath }) + first.control.attachApplyDeps(makeDeps()) + await first.control.stage(REMOTE_CONFIG, 'etag-1') + + // Relaunch: a fresh engine over the same state. 
+ const second = makeControl({ stateRoot, configPath }) + second.control.attachApplyDeps(makeDeps()) + const result = await second.control.stage(REMOTE_CONFIG, 'etag-1') + assert.deepEqual(result, { ok: true, action: 'noop_same_etag' }) +}) + +test('a second stage in the same process is refused while a restart is pending', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + const result = await control.stage(REMOTE_CONFIG, 'etag-2') + assert.equal(!result.ok && result.errorKind, 'restart_pending') +}) + +test('confirmPoll clears probation', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + control.confirmPoll() + const status = await control.status() + assert.equal(status.probation, null) + assert.equal(status.runningEtag, 'etag-1') + // Idempotent. + control.confirmPoll() +}) + +test('chained applies alternate slots and roll back one revision', async () => { + const { stateRoot, configPath } = await makeFixture() + + const first = makeControl({ stateRoot, configPath }) + first.control.attachApplyDeps(makeDeps()) + await first.control.stage(REMOTE_CONFIG, 'etag-1') + + // Relaunch, probation clears, a newer revision arrives. 
+ const second = makeControl({ stateRoot, configPath }) + second.control.attachApplyDeps(makeDeps()) + second.control.confirmPoll() + const doc2 = { ...REMOTE_CONFIG, plugins: [{ name: '@hypaware/central' }] } + await second.control.stage(doc2, 'etag-2') + assert.equal(second.control.runningEtag(), 'etag-2') + const status = await second.control.status() + assert.equal(status.probation?.slot, 'a') + assert.equal(status.probation?.previous_slot, 'b') + + // Expired probation at the next boot rolls back to etag-1, not the seed. + const future = Date.now() + 10 * 24 * 60 * 60 * 1000 + const third = makeControl({ stateRoot, configPath, now: () => future }) + const evaluated = await third.control.evaluateAtBoot() + assert.equal(evaluated.action, 'rolled_back') + assert.equal(third.control.runningEtag(), 'etag-1') + const rolled = await third.control.status() + assert.equal(rolled.lastRollback?.etag, 'etag-2') + assert.equal(rolled.lastRollback?.reason, 'probation_expired') + assert.equal(rolled.badEtag?.etag, 'etag-2') +}) + +test('evaluateAtBoot rolls an expired first apply back onto the seed', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const future = Date.now() + 10 * 24 * 60 * 60 * 1000 + const relaunch = makeControl({ stateRoot, configPath, now: () => future }) + const evaluated = await relaunch.control.evaluateAtBoot() + assert.equal(evaluated.action, 'rolled_back') + + const operative = JSON.parse(await fsp.readFile(configPath, 'utf8')) + assert.deepEqual(operative, SEED_CONFIG) + assert.equal(relaunch.control.runningEtag(), undefined) +}) + +test('evaluateAtBoot keeps an unexpired probation marker', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await 
control.stage(REMOTE_CONFIG, 'etag-1') + + const relaunch = makeControl({ stateRoot, configPath }) + const evaluated = await relaunch.control.evaluateAtBoot() + assert.equal(evaluated.action, 'none') + const status = await relaunch.control.status() + assert.equal(status.probation?.etag, 'etag-1') +}) + +test('evaluateAtBoot discards a probation marker whose flip never committed', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + // Simulate a crash between the marker write and the pointer flip by + // pointing the marker at the slot that is NOT active. + const statePath = path.join(stateRoot, 'config-control', 'state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf8')) + state.probation.slot = 'a' + fs.writeFileSync(statePath, JSON.stringify(state)) + + const relaunch = makeControl({ stateRoot, configPath }) + const evaluated = await relaunch.control.evaluateAtBoot() + assert.equal(evaluated.action, 'cleared_orphan') + const status = await relaunch.control.status() + assert.equal(status.probation, null) + // The operative config is untouched by orphan cleanup. + assert.equal(relaunch.control.runningEtag(), 'etag-1') +}) + +test('the probation watchdog rolls back and requests a restart on expiry', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + assert.deepEqual(restarts, ['config_applied']) + + // Shrink the live marker's window so the real timer fires fast. 
+ const statePath = path.join(stateRoot, 'config-control', 'state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf8')) + state.probation.until = new Date(Date.now() + 20).toISOString() + fs.writeFileSync(statePath, JSON.stringify(state)) + + control.armProbationWatchdog() + await new Promise((resolve) => setTimeout(resolve, 100)) + + assert.deepEqual(restarts, ['config_applied', 'probation_expired']) + const status = await control.status() + assert.equal(status.lastRollback?.reason, 'probation_expired') + assert.equal(status.runningEtag, null) + const operative = JSON.parse(await fsp.readFile(configPath, 'utf8')) + assert.deepEqual(operative, SEED_CONFIG) +}) + +test('a confirmed poll disarms the watchdog before it fires', async () => { + const { stateRoot, configPath } = await makeFixture() + const { control, restarts } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const statePath = path.join(stateRoot, 'config-control', 'state.json') + const state = JSON.parse(fs.readFileSync(statePath, 'utf8')) + state.probation.until = new Date(Date.now() + 30).toISOString() + fs.writeFileSync(statePath, JSON.stringify(state)) + + control.armProbationWatchdog() + control.confirmPoll() + await new Promise((resolve) => setTimeout(resolve, 100)) + + assert.deepEqual(restarts, ['config_applied']) + assert.equal(control.runningEtag(), 'etag-1') +}) + +test('readConfigControlStatus reads without an engine and tolerates a fresh install', async () => { + const { stateRoot, configPath } = await makeFixture() + const empty = readConfigControlStatus({ stateRoot, configPath }) + assert.deepEqual(empty, { probation: null, lastRollback: null, badEtag: null, runningEtag: null }) + + const { control } = makeControl({ stateRoot, configPath }) + control.attachApplyDeps(makeDeps()) + await control.stage(REMOTE_CONFIG, 'etag-1') + + const status = readConfigControlStatus({ stateRoot, configPath }) 
+ assert.equal(status.runningEtag, 'etag-1') + assert.equal(status.probation?.etag, 'etag-1') +}) + +test('parseConfigShape accepts and validates plugin pin fields', () => { + const ok = parseConfigShape({ + version: 2, + plugins: [{ name: '@x/y', version: '1.2.3', artifact_hash: 'abc123', source: 'github:x/y' }], + }) + assert.ok(ok.ok) + assert.equal(ok.ok && ok.config.plugins?.[0].version, '1.2.3') + assert.equal(ok.ok && ok.config.plugins?.[0].artifact_hash, 'abc123') + assert.equal(ok.ok && ok.config.plugins?.[0].source, 'github:x/y') + + const bad = parseConfigShape({ + version: 2, + plugins: [{ name: '@x/y', version: 7 }], + }) + assert.ok(!bad.ok) + assert.ok(!bad.ok && bad.errors.some((e) => e.pointer === '/plugins/0/version')) +}) diff --git a/test/core/daemon.test.js b/test/core/daemon.test.js index 38814d9..0da5156 100644 --- a/test/core/daemon.test.js +++ b/test/core/daemon.test.js @@ -173,6 +173,34 @@ test('renderDaemonInstall renders a deterministic LaunchAgent dry-run payload', ]) }) +test('installers default to relaunch-on-exit (staged restart requirement, LLP 0017)', () => { + // Defaults — no keepAlive/restart override. The service manager MUST + // relaunch the daemon after a staged config-apply exit. 
+ const launchd = renderDaemonInstall({ + platform: 'darwin', + homeDir: '/Users/hyp', + binPath: '/opt/hypaware/bin/hypaware.js', + nodePath: '/usr/local/bin/node', + }) + assert.match(launchd.content, /KeepAlive<\/key>\n /) + + const systemd = renderDaemonInstall({ + platform: 'linux', + homeDir: '/home/hyp', + binPath: '/opt/hypaware/bin/hypaware.js', + nodePath: '/usr/local/bin/node', + }) + assert.match(systemd.content, /^Restart=always$/m) +}) + +test('the staged-restart exit code is distinct from success and error exits', async () => { + const { DAEMON_RESTART_EXIT_CODE } = await import('../../src/core/daemon/runtime.js') + assert.equal(typeof DAEMON_RESTART_EXIT_CODE, 'number') + /** @type {number} */ + const code = DAEMON_RESTART_EXIT_CODE + assert.ok(code !== 0 && code !== 1 && code !== 2) +}) + test('runDaemon reload refreshes plugin config before source.reload', async () => { const hypHome = await fs.mkdtemp(path.join(os.tmpdir(), 'hypaware-daemon-reload-config-')) let handle diff --git a/test/core/join-command.test.js b/test/core/join-command.test.js new file mode 100644 index 0000000..44bf23d --- /dev/null +++ b/test/core/join-command.test.js @@ -0,0 +1,131 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs/promises' +import os from 'node:os' +import path from 'node:path' + +import { dispatch } from '../../src/core/cli/dispatch.js' + +function makeBuf() { + let value = '' + return { + write(chunk) { + value += String(chunk) + return true + }, + text() { + return value + }, + } +} + +/** @param {string} [stdinText] */ +async function makeDispatchOpts(stdinText) { + const hypHome = await fs.mkdtemp(path.join(os.tmpdir(), 'hyp-join-test-')) + const stdout = makeBuf() + const stderr = makeBuf() + /** @type {any} */ + let stdin + if (stdinText !== undefined) { + stdin = { + isTTY: false, + async *[Symbol.asyncIterator]() { yield stdinText }, + } + } else { + stdin = { isTTY: true } + } + 
return { + hypHome, + stdout, + stderr, + opts: { stdout, stderr, stdin, env: { ...process.env, HYP_HOME: hypHome, HYP_CONFIG: '' } }, + } +} + +test('join writes a seed config (mode 0600) and skips daemon install with --no-daemon', async () => { + const { hypHome, stdout, opts } = await makeDispatchOpts() + const code = await dispatch( + ['join', 'https://central.example', 'policy-token-1', '--no-daemon'], + opts + ) + assert.equal(code, 0, stdout.text()) + + const configPath = path.join(hypHome, 'hypaware-config.json') + const stat = await fs.stat(configPath) + assert.equal(stat.mode & 0o777, 0o600) + + const seed = JSON.parse(await fs.readFile(configPath, 'utf8')) + assert.equal(seed.version, 2) + assert.deepEqual(seed.plugins, [{ name: '@hypaware/central' }]) + assert.equal(seed.sinks.central.plugin, '@hypaware/central') + assert.equal(seed.sinks.central.config.url, 'https://central.example') + assert.equal(seed.sinks.central.config.identity.bootstrap_token, 'policy-token-1') + assert.match(stdout.text(), /daemon install skipped/) +}) + +test('join reads the token from --token-file', async () => { + const { hypHome, opts } = await makeDispatchOpts() + const tokenFile = path.join(hypHome, 'token.txt') + await fs.writeFile(tokenFile, 'file-token\n') + + const code = await dispatch( + ['join', 'https://central.example', '--token-file', tokenFile, '--no-daemon'], + opts + ) + assert.equal(code, 0) + const seed = JSON.parse( + await fs.readFile(path.join(hypHome, 'hypaware-config.json'), 'utf8') + ) + assert.equal(seed.sinks.central.config.identity.bootstrap_token, 'file-token') +}) + +test('join reads the token from stdin when piped', async () => { + const { hypHome, opts } = await makeDispatchOpts('stdin-token\n') + const code = await dispatch(['join', 'https://central.example', '--no-daemon'], opts) + assert.equal(code, 0) + const seed = JSON.parse( + await fs.readFile(path.join(hypHome, 'hypaware-config.json'), 'utf8') + ) + 
assert.equal(seed.sinks.central.config.identity.bootstrap_token, 'stdin-token') +}) + +test('join rejects missing url, bad url, missing token, and conflicting token sources', async () => { + { + const { stderr, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join'], opts), 2) + assert.match(stderr.text(), /missing /) + } + { + const { stderr, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join', 'ftp://x', 'tok', '--no-daemon'], opts), 2) + assert.match(stderr.text(), /http\(s\)/) + } + { + // TTY stdin and no token anywhere. + const { stderr, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join', 'https://central.example', '--no-daemon'], opts), 2) + assert.match(stderr.text(), /no token given/) + } + { + const { hypHome, stderr, opts } = await makeDispatchOpts() + const tokenFile = path.join(hypHome, 'token.txt') + await fs.writeFile(tokenFile, 'x') + assert.equal( + await dispatch( + ['join', 'https://central.example', 'tok', '--token-file', tokenFile, '--no-daemon'], + opts + ), + 2 + ) + assert.match(stderr.text(), /not both/) + } +}) + +test('join help exits 0 and documents token sources', async () => { + const { stdout, opts } = await makeDispatchOpts() + assert.equal(await dispatch(['join', '--help'], opts), 0) + assert.match(stdout.text(), /--token-file/) + assert.match(stdout.text(), /stdin/) +}) diff --git a/test/plugins/central-config-pull.test.js b/test/plugins/central-config-pull.test.js new file mode 100644 index 0000000..d2bf3f4 --- /dev/null +++ b/test/plugins/central-config-pull.test.js @@ -0,0 +1,370 @@ +// @ts-check + +import test from 'node:test' +import assert from 'node:assert/strict' + +import { + MAX_CONFIG_DOCUMENT_BYTES, + createConfigPullLoop, + parseRetryAfter, +} from '../../hypaware-core/plugins-workspace/central/src/config_client.js' + +function makeLog() { + /** @type {Array<{ level: string, message: string, fields: Record }>} */ + const rows = [] + /** @param {string} level */ 
+ const emit = (level) => + /** @param {string} message @param {Record} [fields] */ + (message, fields) => { rows.push({ level, message, fields: fields ?? {} }) } + return { + rows, + debug: emit('debug'), + info: emit('info'), + warn: emit('warn'), + error: emit('error'), + } +} + +/** + * Recording stand-in for the object the tests pass to the loop as + * `configControl`. `stage` appends every `{ document, etag }` pair to + * `staged` and always resolves with `ok: true` and the `applied` action; + * `confirmPoll` increments the counter read through the `confirms` + * getter; `runningEtag` returns `opts.runningEtag`, which is undefined + * when the option is omitted. + * + * @param {{ runningEtag?: string }} [opts] + */ +function makeControl(opts = {}) { + /** @type {Array<{ document: unknown, etag: string }>} */ + const staged = [] + let confirms = 0 + return { + staged, + get confirms() { return confirms }, + /** @param {unknown} document @param {string} etag */ + async stage(document, etag) { + staged.push({ document, etag }) + return /** @type {const} */ ({ ok: true, action: 'applied' }) + }, + confirmPoll() { confirms += 1 }, + runningEtag() { return opts.runningEtag }, + } +} + +/** + * Real `Response` objects so the transport path under test (streamed + * body reads, abort signals) matches what `fetch` actually returns. + * + * Returns the scripted `fetchFn` together with the `requests` log it + * fills. Each call records the URL (stringified) and `init.headers` (an + * empty object when none were passed), then consumes the next entry + * from the front of `responses` - the caller-supplied array is mutated + * by `shift` - and answers with a bare 500 once the script is exhausted. + * + * @param {Array<{ status: number, headers?: Record, body?: string }>} responses + */ +function makeFetch(responses) { + /** @type {Array<{ url: string, headers: Record }>} */ + const requests = [] + /** @type {typeof fetch} */ + const fetchFn = async (url, init) => { + requests.push({ + url: String(url), + headers: /** @type {Record} */ (init?.headers ?? {}), + }) + const next = responses.shift() ?? { status: 500 } + // Response forbids a body on null-body statuses (204/304). + const body = next.status === 204 || next.status === 304 ? null : next.body ?? null + return new Response(body, { status: next.status, headers: next.headers ?? 
{} }) + } + return { fetchFn, requests } +} +/** + * Identity-client stand-in: `getCurrentJwt` always resolves the fixed + * token `jwt-1`, and `refresh` only increments the counter read through + * the `refreshes` getter, so the token is the same before and after a + * refresh. + */ +function makeIdentity() { + let refreshes = 0 + return { + get refreshes() { return refreshes }, + async getCurrentJwt() { return 'jwt-1' }, + async refresh() { refreshes += 1 }, + } +} + +/** + * Builds a loop through `createConfigPullLoop` using test defaults: a + * fixed central URL, a fresh `makeIdentity()` client, a 3600-second poll + * interval and a fresh `makeLog()` logger. `overrides` is spread last, + * so any key it carries replaces the matching default. Returns the loop + * together with the logger so a test can inspect the logged rows. + * + * @param {object} overrides + */ +function makeLoop(overrides) { + const log = makeLog() + const args = /** @type {any} */ ({ + centralUrl: 'https://central.example', + identityClient: makeIdentity(), + pollIntervalSeconds: 3600, + log, + ...overrides, + }) + return { loop: createConfigPullLoop(args), log } +} + +test('start pulls immediately; a 200 confirms the poll and stages the document with its etag', async () => { + const control = makeControl() + const { fetchFn, requests } = makeFetch([ + { status: 200, headers: { etag: 'rev-1' }, body: JSON.stringify({ version: 2 }) }, + ]) + const { loop } = makeLoop({ configControl: control, fetchFn }) + + loop.start() + await loop.stop() + + assert.equal(requests.length, 1) + assert.ok(requests[0].url.endsWith('/v1/config')) + assert.equal(requests[0].headers.authorization, 'Bearer jwt-1') + // No running config etag → no If-None-Match (first 200 must happen). 
+ assert.equal('if-none-match' in requests[0].headers, false) + assert.equal(control.confirms, 1) + assert.deepEqual(control.staged, [{ document: { version: 2 }, etag: 'rev-1' }]) +}) + +test('If-None-Match always presents the running config etag', async () => { + const control = makeControl({ runningEtag: 'rev-current' }) + const { fetchFn, requests } = makeFetch([{ status: 304 }]) + const { loop } = makeLoop({ configControl: control, fetchFn }) + + loop.start() + await loop.stop() + + assert.equal(requests[0].headers['if-none-match'], 'rev-current') + assert.equal(control.confirms, 1) + assert.deepEqual(control.staged, []) +}) + +test('401 refreshes the JWT and retries once; a second 401 escalates without staging', async () => { + const control = makeControl() + const identityClient = makeIdentity() + const ok = makeFetch([ + { status: 401 }, + { status: 304 }, + ]) + const { loop } = makeLoop({ configControl: control, identityClient, fetchFn: ok.fetchFn }) + loop.start() + await loop.stop() + assert.equal(identityClient.refreshes, 1) + assert.equal(ok.requests.length, 2) + assert.equal(control.confirms, 1) + /* The second loop reuses `control`, so `confirms` still reading 1 + * afterwards shows the double-401 run confirmed nothing. */ + const identity2 = makeIdentity() + const bad = makeFetch([{ status: 401 }, { status: 401 }]) + const second = makeLoop({ configControl: control, identityClient: identity2, fetchFn: bad.fetchFn }) + second.loop.start() + await second.loop.stop() + assert.equal(identity2.refreshes, 1) + assert.equal(control.confirms, 1) + assert.deepEqual(control.staged, []) +}) + +test('a 200 without an etag header is dropped, not staged', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([ + { status: 200, body: JSON.stringify({ version: 2 }) }, + ]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.equal(control.confirms, 0) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_missing_etag')) +}) + +test('an oversized 200 body is 
dropped before parsing', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([ + { status: 200, headers: { etag: 'rev-big' }, body: 'x'.repeat(MAX_CONFIG_DOCUMENT_BYTES + 1) }, + ]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_document_too_large')) +}) + +test('invalid JSON in a 200 body is dropped', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([ + { status: 200, headers: { etag: 'rev-1' }, body: '{nope' }, + ]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_invalid_json')) +}) + +test('404 takes the legacy backoff branch without confirming probation', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([{ status: 404 }]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(control.confirms, 0) + assert.ok(log.rows.some((r) => r.fields.hyp_reason === 'no_config_registered_legacy')) +}) + +test('the steady timer keeps polling on the configured cadence', async () => { + const control = makeControl() + const { fetchFn, requests } = makeFetch([ + { status: 304 }, { status: 304 }, { status: 304 }, { status: 304 }, + ]) + // Sub-second cadence is rejected by config validation but accepted + // by the loop itself — that's what makes this test fast. 
+ const { loop } = makeLoop({ configControl: control, fetchFn, pollIntervalSeconds: 0.02 }) + loop.start() + await new Promise((resolve) => setTimeout(resolve, 120)) + await loop.stop() + assert.ok(requests.length >= 2, `expected repeat polls, saw ${requests.length}`) + assert.ok(control.confirms >= 2, `expected repeat confirms, saw ${control.confirms}`) +}) + +test('stop prevents any further polls', async () => { + const control = makeControl() + const { fetchFn, requests } = makeFetch([{ status: 304 }, { status: 304 }]) + const { loop } = makeLoop({ configControl: control, fetchFn, pollIntervalSeconds: 0.01 }) + loop.start() + await loop.stop() + const seen = requests.length /* snapshot taken once stop() has resolved; the 60 ms wait below spans several 0.01 s poll intervals */ + await new Promise((resolve) => setTimeout(resolve, 60)) + assert.equal(requests.length, seen) +}) + +test('transport errors back off and keep the loop alive', async () => { + const control = makeControl() + let calls = 0 + /** @type {typeof fetch} */ + const fetchFn = async () => { + calls += 1 + throw new Error('connection refused') + } + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(calls, 1) + assert.ok(log.rows.some((r) => r.message === 'central.config.poll_failed')) +}) + +test('an oversized Content-Length is rejected without reading the body', async () => { + const control = makeControl() + /** @type {typeof fetch} */ + const fetchFn = async () => { + // A stream that never produces and never closes: only the + // Content-Length pre-reject can finish this poll promptly — the + // streaming counter would wait on it until the deadline. 
+ const stream = new ReadableStream({ pull() {} }) + const response = new Response(stream, { status: 200, headers: { etag: 'rev-huge' } }) + response.headers.set('content-length', String(MAX_CONFIG_DOCUMENT_BYTES + 1)) /* set after construction; declares one byte more than MAX_CONFIG_DOCUMENT_BYTES */ + return response + } + const { loop, log } = makeLoop({ configControl: control, fetchFn, requestTimeoutSeconds: 600 }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + const row = log.rows.find((r) => r.fields.error_kind === 'config_document_too_large') + assert.ok(row, 'expected the Content-Length pre-reject to fire') + // body_bytes reports the declared length — the streaming path could + // never have observed this number from an empty stream. + assert.equal(row?.fields.body_bytes, MAX_CONFIG_DOCUMENT_BYTES + 1) +}) + +test('a chunked oversized body is cancelled at the cap, not buffered whole', async () => { + const control = makeControl() + const chunk = new TextEncoder().encode('x'.repeat(64 * 1024)) + let chunksServed = 0 + /** @type {typeof fetch} */ + const fetchFn = async () => { + // Endless chunked stream with no Content-Length: only the byte + // counter can stop this one. + const stream = new ReadableStream({ + pull(controller) { + chunksServed += 1 + controller.enqueue(chunk) + }, + }) + return new Response(stream, { status: 200, headers: { etag: 'rev-endless' } }) + } + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.deepEqual(control.staged, []) + assert.ok(log.rows.some((r) => r.fields.error_kind === 'config_document_too_large')) + // Reads stop within one chunk of the cap instead of draining forever. 
+ assert.ok( + chunksServed <= MAX_CONFIG_DOCUMENT_BYTES / chunk.byteLength + 2, + `expected the read to stop at the cap, served ${chunksServed} chunks` + ) +}) + +test('stop() aborts a poll stuck on a never-resolving fetch after the drain grace', async () => { + const control = makeControl() + /** @type {typeof fetch} */ + const fetchFn = () => new Promise(() => {}) + // Long request timeout: the stop-grace abort, not the deadline, is + // what must unblock shutdown here. + const { loop } = makeLoop({ + configControl: control, + fetchFn, + requestTimeoutSeconds: 600, + stopGraceSeconds: 0.02, + }) + loop.start() + const before = Date.now() + await loop.stop() + assert.ok(Date.now() - before < 5000, 'stop() must not wait out the request timeout') + assert.deepEqual(control.staged, []) +}) + +test('the request deadline aborts a stalled poll and the loop stays alive', async () => { + const control = makeControl() + let aborted = false + /** + * Never resolves. It settles only through the abort signal: on abort + * it sets `aborted` and rejects with `init.signal.reason`, or with a + * generic Error when no reason is set. + * + * @type {typeof fetch} + */ + const fetchFn = (_url, init) => + new Promise((_resolve, reject) => { + init?.signal?.addEventListener('abort', () => { + aborted = true + reject(init.signal?.reason ?? 
new Error('aborted')) + }) + }) + const { loop, log } = makeLoop({ + configControl: control, + fetchFn, + requestTimeoutSeconds: 0.02, + }) + loop.start() + await new Promise((resolve) => setTimeout(resolve, 100)) + await loop.stop() + assert.equal(aborted, true) + const row = log.rows.find((r) => r.fields.error_kind === 'config_poll_error') + assert.ok(row, 'expected the timed-out poll to log a failure') + assert.match(String(row?.fields.message), /exceeded/) +}) + +test('429 with Retry-After schedules from the header without confirming the poll', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([{ status: 429, headers: { 'retry-after': '7' } }]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(control.confirms, 0) + assert.deepEqual(control.staged, []) + const row = log.rows.find((r) => r.fields.error_kind === 'config_poll_throttled') + assert.ok(row) + assert.equal(row?.fields.http_status, 429) + assert.equal(row?.fields.retry_after_seconds, 7) +}) + +test('503 with a garbage Retry-After falls back to the backoff ladder', async () => { + const control = makeControl() + const { fetchFn } = makeFetch([{ status: 503, headers: { 'retry-after': 'soonish' } }]) + const { loop, log } = makeLoop({ configControl: control, fetchFn }) + loop.start() + await loop.stop() + assert.equal(control.confirms, 0) + const row = log.rows.find((r) => r.fields.error_kind === 'config_poll_throttled') + assert.ok(row) + assert.equal(row?.fields.http_status, 503) + assert.equal('retry_after_seconds' in (row?.fields ?? {}), false) /* the key must be absent altogether, not merely undefined */ +}) + +test('parseRetryAfter: delta-seconds, HTTP-date, and garbage', () => { + assert.equal(parseRetryAfter('7'), 7) + assert.equal(parseRetryAfter('0'), 0) + // An HTTP-date resolves to a non-negative whole-second delay. 
+ const future = parseRetryAfter(new Date(Date.now() + 30_000).toUTCString()) + assert.ok(typeof future === 'number' && future >= 28 && future <= 31, `got ${future}`) /* toUTCString() drops milliseconds, hence a window rather than an exact 30 */ + // A past date clamps to zero rather than going negative. + assert.equal(parseRetryAfter(new Date(Date.now() - 60_000).toUTCString()), 0) + assert.equal(parseRetryAfter('soonish'), undefined) + assert.equal(parseRetryAfter(''), undefined) + assert.equal(parseRetryAfter(null), undefined) +})