From c995f4bb9ae135bf9ff4992152fdd5ad49c0079c Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Thu, 7 May 2026 20:52:56 -0600 Subject: [PATCH 1/4] peer: rotate samizdat credentials hourly (closes engineering#3437) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently peer.Client builds the libbox inbound exactly once per Start and holds the same X25519 keypair / shortID / masquerade for the entire peer process lifetime — a leaked credential (logs, telemetry, support bundles, the route_id leakage in engineering#3440) remains usable for hours or days. This adds a credRotationLoop goroutine on a 1h tick. On each tick: 1. Re-register with lantern-cloud against the same (address, port) tuple — same router-side mapping, fresh server-side row, fresh samizdat creds. 2. Patch the new options for VPN bypass. 3. Build a new libbox service. 4. Close the old box (releases the listening port). 5. Start the new box (re-binds the same port with new creds). 6. Atomic swap of c.box, c.routeID. 7. Best-effort deregister of the prior route_id so the bandit stops handing the old (now-invalid) creds to clients within ~immediately rather than waiting up-to-TTL for the row to expire. Steps 4-5 leave a brief (~hundreds of ms) window where the port is unbound; samizdat clients see TCP RST and reconnect via the bandit. That's the trade-off vs. the security cost of holding the same cred for the peer process lifetime — caps blast radius from cred leakage to ~1h regardless of how long the peer has been running. Rotation is best-effort: a single failure logs and waits for the next tick. The current box and creds remain serving in the failure case so a transient register error doesn't kill the session. Config gains CredRotationInterval (defaults to peerCredRotationInterval = 1h) so tests drive the loop without a 1h sleep — see TestClient_RotatesCredentialsAtInterval. Co-Authored-By: Claude Opus 4.7 (1M context) --- peer/peer.go | 194 ++++++++++++++++++++++++++++++++++++++++++++-- peer/peer_test.go | 95 ++++++++++++++++++++++- 2 files changed, 281 insertions(+), 8 deletions(-) diff --git a/peer/peer.go b/peer/peer.go index 500ae7a5..a41f5eec 100644 --- a/peer/peer.go +++ b/peer/peer.go @@ -59,14 +59,15 @@ type Status struct { } // Config plumbs in dependencies. Zero-valued fields fall back to production -// defaults; HeartbeatInterval and HeartbeatTimeout exist so tests can drive -// the loop without sleeping a full minute. +// defaults; HeartbeatInterval, HeartbeatTimeout, and CredRotationInterval +// exist so tests can drive the loops without sleeping a full minute / hour. type Config struct { - API *API - NewForwarder func(ctx context.Context) (portForwarder, error) - BuildBoxService boxFactory - HeartbeatInterval time.Duration - HeartbeatTimeout time.Duration + API *API + NewForwarder func(ctx context.Context) (portForwarder, error) + BuildBoxService boxFactory + HeartbeatInterval time.Duration + HeartbeatTimeout time.Duration + CredRotationInterval time.Duration } // Client orchestrates one peer-proxy session: open UPnP port → register with @@ -97,8 +98,38 @@ type Client struct { forwarder portForwarder box boxService routeID string + // externalPort / internalPort persist the port mapping picked at + // Start so the cred-rotation loop can re-register against the same + // (address, port) tuple without re-probing UPnP / re-mapping. The + // router-side mapping itself stays put across rotations; only the + // samizdat creds and route_id rotate. + externalPort uint16 + internalPort uint16 + // boxOptions is the fresh options string passed to BuildBoxService, + // kept for diagnostics and so the rotation path doesn't need to + // re-derive it from the (also-stored) box reference. + boxOptions string + // runCtx is captured here for the cred-rotation goroutine to bind + // the new libbox lifetime to the same context as the original Start. + // Stop's cancelRun() teardown still applies to the rebuilt box. + runCtx context.Context } +// peerCredRotationInterval bounds how long a leaked samizdat +// credential remains usable. At each tick the peer re-registers with +// lantern-cloud (new route_id, new keypair, new shortID), rebuilds the +// libbox service against the new options, and deregisters the prior +// route. Caps blast radius from credential leakage (logs, telemetry, +// memory dumps, the H2 leakage path in engineering#3440) to ~1h +// regardless of peer process lifetime. +// +// Cost per rotation: one API.Register + Deregister round trip, one +// libbox build + start + close cycle. Brief (~hundreds-of-ms) port- +// rebind window during the swap; samizdat clients see TCP RST and +// reconnect via the bandit. Acceptable trade-off vs. holding the same +// cred for the full peer process lifetime. +const peerCredRotationInterval = 1 * time.Hour + // peerCleanupTimeout caps how long Start's rollback path waits for // Deregister / UnmapPort. Cleanup uses a fresh Background context (not the // caller's ctx) so an already-canceled or expired Start ctx doesn't skip @@ -253,6 +284,10 @@ func (c *Client) Start(ctx context.Context) error { c.forwarder = fwd c.box = box c.routeID = regResp.RouteID + c.externalPort = mapping.ExternalPort + c.internalPort = mapping.InternalPort + c.boxOptions = options + c.runCtx = runCtx c.cancelRun = cancelRun c.runDone = runDone c.status = Status{ @@ -265,8 +300,14 @@ func (c *Client) Start(ctx context.Context) error { statusSnapshot := c.status c.mu.Unlock() + rotation := c.cfg.CredRotationInterval + if rotation == 0 { + rotation = peerCredRotationInterval + } + fwd.StartRenewal(runCtx) go c.heartbeatLoop(runCtx, heartbeat, runDone) + go c.credRotationLoop(runCtx, rotation) slog.Info("peer client started", "external_ip", externalIP, @@ -317,6 +358,10 @@ func (c *Client) Stop(ctx context.Context) error { c.forwarder = nil c.box = nil c.routeID = "" + c.externalPort = 0 + c.internalPort = 0 + c.boxOptions = "" + c.runCtx = nil c.status = Status{} c.mu.Unlock() @@ -408,6 +453,141 @@ func isNotRegistered(err error) bool { return errors.As(err, &apiErr) && apiErr.Status == 404 } +// credRotationLoop periodically rotates the peer's samizdat credentials +// (X25519 keypair, shortID, masquerade) by re-registering with +// lantern-cloud, rebuilding the libbox inbound, and deregistering the +// prior route. Caps blast radius from credential leakage to ~interval +// regardless of peer process lifetime — see peerCredRotationInterval. +// +// Closes done is the responsibility of heartbeatLoop; this loop just +// exits when ctx is cancelled. We deliberately don't add another close +// channel: heartbeatLoop's done already gates Stop, and rotation +// failures are non-fatal (log + retry next tick), so there's nothing +// the Stop path needs to wait on from this goroutine. +func (c *Client) credRotationLoop(ctx context.Context, interval time.Duration) { + t := time.NewTicker(interval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + if err := c.rotateCreds(ctx); err != nil { + // Don't kill the loop on a single failure — current + // box / route is still serving. Try again next tick. + slog.Warn("peer cred rotation failed; current creds remain in use", "err", err) + } + } + } +} + +// rotateCreds atomically swaps the peer's samizdat credentials. On +// success: a fresh route_id and keypair are in use, the libbox inbound +// has been rebuilt against the new options, the prior route is +// deregistered server-side, and the FlutterEvent stream sees no gap. +// On failure: the prior creds and box continue serving — rotation is +// best-effort. The router-side port mapping is preserved across the +// rotation; only the in-process samizdat state changes. +// +// Sequence: +// 1. Re-register with the same (externalIP, externalPort) as Start. +// 2. Patch the new server-supplied options for VPN bypass. +// 3. Build a new libbox service against the new options. +// 4. Close the old box (releases the listening port). +// 5. Start the new box (re-binds the same port, now with new creds). +// 6. Atomic swap: c.box, c.routeID, c.boxOptions point at the new box. +// 7. Best-effort deregister of the prior route_id so the bandit +// catalog stops handing the old (now-invalid) creds to clients. +// +// Steps 4-5 leave a brief (~hundreds of ms) window where the port +// isn't bound; samizdat clients see TCP RST and reconnect. Acceptable +// trade-off vs. the security cost of holding the same cred for the +// peer process lifetime. +func (c *Client) rotateCreds(ctx context.Context) error { + c.mu.Lock() + if !c.active { + c.mu.Unlock() + return errors.New("not active") + } + fwd := c.forwarder + extPort := c.externalPort + intPort := c.internalPort + oldRouteID := c.routeID + oldBox := c.box + c.mu.Unlock() + + if fwd == nil || oldBox == nil { + return errors.New("rotateCreds: client state inconsistent") + } + + externalIP, err := fwd.ExternalIP(ctx) + if err != nil { + return fmt.Errorf("get external ip: %w", err) + } + regResp, err := c.cfg.API.Register(ctx, RegisterRequest{ + ExternalIP: externalIP, + ExternalPort: extPort, + InternalPort: intPort, + }) + if err != nil { + return fmt.Errorf("re-register: %w", err) + } + options, err := ensurePeerOutboundsBypassVPN(regResp.ServerConfig) + if err != nil { + return fmt.Errorf("patch sing-box options: %w", err) + } + + c.mu.Lock() + runCtx := c.runCtx + c.mu.Unlock() + if runCtx == nil { + // Stop happened between the unlock above and here. Skip the + // build to avoid spinning up a libbox tied to a torn-down ctx. + // The new register row is harmless — server-side reaper will + // deprecate it after TTL since no heartbeat will arrive. + return errors.New("client stopped during rotation") + } + newBox, err := c.cfg.BuildBoxService(runCtx, options) + if err != nil { + return fmt.Errorf("build new sing-box: %w", err) + } + + // Close old, start new. Order matters — both want the same port. + // If newBox.Start fails after oldBox.Close, we lost the listener + // and the next heartbeat / rotation tick is the recovery point. + if closeErr := oldBox.Close(); closeErr != nil { + slog.Warn("close old box during rotation", "err", closeErr) + } + if err := newBox.Start(); err != nil { + // Catastrophic: port is now unbound. Leave c.box pointing at + // oldBox so a future Stop tries to close it (idempotent on + // already-closed); the next rotation tick will try again. + return fmt.Errorf("start new sing-box: %w", err) + } + + c.mu.Lock() + c.box = newBox + c.routeID = regResp.RouteID + c.boxOptions = options + c.status.RouteID = regResp.RouteID + c.mu.Unlock() + + // Deregister the prior route so the bandit stops handing the old + // (now-invalid) creds to clients. Best-effort: the prior row will + // expire from its TTL anyway, but explicit deregister cuts the + // stale-creds window from up-to-TTL down to ~immediately. + if err := c.cfg.API.Deregister(ctx, oldRouteID); err != nil { + slog.Warn("deregister prior route after rotation", + "err", err, "old_route_id", oldRouteID) + } + + slog.Info("peer cred rotation succeeded", + "new_route_id", regResp.RouteID, + "old_route_id", oldRouteID, + ) + return nil +} + // ensurePeerOutboundsBypassVPN guarantees the peer sing-box's outbound dials // bind to the physical interface rather than whatever the OS routing table // picks. Without this, when the user's own Lantern VPN is up its TUN holds diff --git a/peer/peer_test.go b/peer/peer_test.go index dd24c0ea..d1e635c1 100644 --- a/peer/peer_test.go +++ b/peer/peer_test.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "errors" + "fmt" "net/http" "net/http/httptest" "sync" @@ -139,6 +140,10 @@ type stubServer struct { server *httptest.Server registerStatus int registerResp RegisterResponse + // registerRespFn lets a test return a different response per + // register call (e.g. cred-rotation tests need a fresh route_id + // each time). When non-nil, takes precedence over registerResp. + registerRespFn func() RegisterResponse heartbeatStatus int deregisterStatus int registerCount atomic.Int64 @@ -174,7 +179,11 @@ func newStubServer(t *testing.T) *stubServer { http.Error(w, "register failed", s.registerStatus) return } - _ = json.NewEncoder(w).Encode(s.registerResp) + resp := s.registerResp + if s.registerRespFn != nil { + resp = s.registerRespFn() + } + _ = json.NewEncoder(w).Encode(resp) }) mux.HandleFunc("/v1/peer/heartbeat", func(w http.ResponseWriter, r *http.Request) { s.heartbeatCount.Add(1) @@ -576,6 +585,90 @@ func TestAPIError_StringFormat(t *testing.T) { assert.Contains(t, e.Error(), "could not connect") } +// TestClient_RotatesCredentialsAtInterval pins the C2 fix from +// engineering#3437: the peer client must re-register and rebuild its +// libbox inbound on a schedule so a leaked credential's blast radius is +// bounded by CredRotationInterval rather than peer process lifetime. +// +// Drives a short rotation interval (50ms) and asserts: +// 1. Multiple registers happen (start + ≥2 rotations within 250ms). +// 2. Each rotation deregisters the prior route_id. +// 3. The peer's exposed RouteID changes — clients freshly assigned +// after a rotation see the new ID; the bandit catalog stops +// handing out the old one once Deregister lands. +// 4. Multiple distinct boxes were built (the rotation actually +// rebuilt libbox; not just a no-op). +// 5. The first box was closed (the old listener released its port). +func TestClient_RotatesCredentialsAtInterval(t *testing.T) { + fwd := &fakeForwarder{externalIP: "203.0.113.42"} + srv := newStubServer(t) + + // Each rotation needs a register response with a distinct + // route_id so we can verify the swap actually changed identifiers + // rather than re-registering the same id. + var registerSeq atomic.Int64 + srv.registerRespFn = func() RegisterResponse { + n := registerSeq.Add(1) + return RegisterResponse{ + RouteID: fmt.Sprintf("00000000-0000-0000-0000-00000000000%d", n), + ServerConfig: `{"inbounds": [{"type":"samizdat","tag":"samizdat-in"}]}`, + HeartbeatIntervalSeconds: 60, + } + } + + // Each BuildBoxService call gets a fresh fakeBoxService so we can + // see how many boxes were built and which ones got closed. + var ( + boxesMu sync.Mutex + boxes []*fakeBoxService + ) + c := newTestClient(t, fwd, &fakeBoxService{}, srv, func(cfg *Config) { + cfg.CredRotationInterval = 50 * time.Millisecond + // Long heartbeat so heartbeat ticks don't compete with the + // register/deregister counters that we're asserting on. + cfg.HeartbeatInterval = time.Hour + cfg.BuildBoxService = func(_ context.Context, options string) (boxService, error) { + b := &fakeBoxService{gotConfig: options} + boxesMu.Lock() + boxes = append(boxes, b) + boxesMu.Unlock() + return b, nil + } + }) + + require.NoError(t, c.Start(context.Background())) + t.Cleanup(func() { _ = c.Stop(context.Background()) }) + + // Wait for at least 2 rotations on top of the initial register. + require.Eventually(t, func() bool { + return srv.registerCount.Load() >= 3 + }, 1*time.Second, 25*time.Millisecond, + "expected ≥3 registers (initial + 2 rotations) within 1s; got %d", + srv.registerCount.Load()) + + // Each rotation deregisters the prior route — N rotations => + // N deregisters (initial register is not preceded by one). + rotations := srv.registerCount.Load() - 1 + assert.GreaterOrEqual(t, srv.deregisterCount.Load(), rotations-1, + "each rotation should deregister the prior route_id (got %d deregs vs %d rotations)", + srv.deregisterCount.Load(), rotations) + + // RouteID exposed via Status should reflect the latest rotation. + c.mu.Lock() + currentRouteID := c.routeID + c.mu.Unlock() + assert.NotEqual(t, "00000000-0000-0000-0000-000000000001", currentRouteID, + "current route_id should have advanced past the initial register") + + // Multiple boxes built; first one closed. + boxesMu.Lock() + defer boxesMu.Unlock() + require.GreaterOrEqual(t, len(boxes), 2, + "expected ≥2 libbox builds (initial + ≥1 rotation)") + assert.True(t, boxes[0].closed.Load(), + "first box should be closed by the first rotation") +} + // Subscribers (the IPC SSE handler in production) need both edges so the UI // can render fresh state without polling. func TestClient_StatusEventEmittedOnStartAndStop(t *testing.T) { From ec71803db3f012c6b50c5d77f0f7fc14bfdef65b Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 29 May 2026 18:51:13 -0600 Subject: [PATCH 2/4] peer: address Copilot review on #472 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eight findings on the cred-rotation lifecycle: 1. credRotationLoop time.NewTicker panic on non-positive interval. Now clamps interval <= 0 to peerCredRotationInterval (the same default Start applies when CredRotationInterval is unset) instead of letting NewTicker panic the host process. 2. rotateCreds raced Stop. Re-check c.active under the swap lock immediately before assigning c.box / c.routeID; if Stop has cleared that state in flight, close the newly-built box and deregister the new route rather than resurrecting peer state Stop just tore down. 3. heartbeatLoop 404 raced cred rotation. heartbeatLoop captured c.routeID, sent the heartbeat, and treated a 404 as authoritative — but a rotation between those two steps deregisters the captured route_id, producing a 404 that's actually a stale response and should not trigger auto-Stop. heartbeatLoop now re-checks the current routeID under lock on 404; if it differs, the 404 is expected and the loop continues. 4. Status.ExternalIP went stale after rotation. rotation re-queries ExternalIP for Register, so update c.status.ExternalIP alongside c.status.RouteID under the swap lock. 5. Newly-registered route leaked when BuildBoxService or the subsequent Start failed. Threaded a cleanupNewRoute(reason) closure through every post-Register error path so the orphan row is deregistered with a fresh ctx (avoids cancellation-skip) rather than leaking until TTL. 6. Long availability outage when newBox.Start failed after oldBox.Close released the port. New startNewBoxWithRetry retries Start up to 5 times with exponential backoff (50ms → 800ms, total <1s) to absorb router-side TIME_WAIT / EADDRINUSE windows; preserves the previous 'fall through to next tick' behavior only after the retries are exhausted. 7. libbox.Start panic could crash the host process during background rotation, taking the user's main VPN with it (vpn/tunnel.go's tunnel-start path has a parallel recover). New runRotation wrapper defers a recover + slog.Error and lets the loop continue. 8. TestClient_RotatesCredentialsAtInterval asserted deregisterCount >= rotations-1, which would pass even if one rotation never deregistered. Tightened to >= rotations via require.Eventually so the test waits for the most recent deregister to land (rotation issues it after the swap-lock completes, so there's a small race window between observing the new register and the corresponding deregister). Tests pass under -race -count=1 across 5 consecutive runs. Co-Authored-By: Claude Opus 4.7 --- peer/peer.go | 125 +++++++++++++++++++++++++++++++++++++++++----- peer/peer_test.go | 10 +++- 2 files changed, 121 insertions(+), 14 deletions(-) diff --git a/peer/peer.go b/peer/peer.go index a41f5eec..dfa77cf8 100644 --- a/peer/peer.go +++ b/peer/peer.go @@ -7,6 +7,7 @@ import ( "fmt" "log/slog" "math/rand/v2" + "runtime/debug" "sync" "time" @@ -430,6 +431,20 @@ func (c *Client) heartbeatLoop(ctx context.Context, interval time.Duration, done // later heartbeat as a 404. slog.Warn("peer heartbeat failed", "err", err, "route_id", routeID) if isNotRegistered(err) { + // Re-check current routeID under lock. If credRotationLoop + // swapped routeID + deregistered the prior route between + // our heartbeat-prepare and heartbeat-response, the 404 + // applies to a stale route and is expected, not a reason + // to stop. Skip the auto-Stop and let the next tick + // heartbeat the new route. + c.mu.Lock() + currentRouteID := c.routeID + c.mu.Unlock() + if currentRouteID != routeID { + slog.Info("peer heartbeat 404 on stale route_id; rotation in flight, continuing", + "stale_route_id", routeID, "current_route_id", currentRouteID) + continue + } slog.Info("peer route no longer registered server-side, stopping client") // Stop runs in a separate goroutine to avoid the cyclic // Stop → cancelRun → loop-exit deadlock. @@ -465,6 +480,14 @@ func isNotRegistered(err error) bool { // failures are non-fatal (log + retry next tick), so there's nothing // the Stop path needs to wait on from this goroutine. func (c *Client) credRotationLoop(ctx context.Context, interval time.Duration) { + // Non-positive interval would panic time.NewTicker. Treat it the same + // as the zero case Start handles when CredRotationInterval is unset: + // fall back to the default cap rather than disabling rotation + // silently (an unset value still wants rotation; a negative value is + // almost certainly a test/config bug). + if interval <= 0 { + interval = peerCredRotationInterval + } t := time.NewTicker(interval) defer t.Stop() for { @@ -472,15 +495,31 @@ func (c *Client) credRotationLoop(ctx context.Context, interval time.Duration) { case <-ctx.Done(): return case <-t.C: - if err := c.rotateCreds(ctx); err != nil { - // Don't kill the loop on a single failure — current - // box / route is still serving. Try again next tick. - slog.Warn("peer cred rotation failed; current creds remain in use", "err", err) - } + c.runRotation(ctx) } } } +// runRotation wraps rotateCreds with a panic recover. libbox.Start can +// panic (see the recover in vpn/tunnel.go's tunnel-start path); an +// unrecovered panic here would crash the host process during a +// background rotation, taking the user's main VPN with it. Treat a +// panic the same as any other rotation failure: log, keep the existing +// box serving, try again next tick. +func (c *Client) runRotation(ctx context.Context) { + defer func() { + if r := recover(); r != nil { + slog.Error("peer cred rotation panicked; current creds remain in use", + "panic", r, "stack", string(debug.Stack())) + } + }() + if err := c.rotateCreds(ctx); err != nil { + // Don't kill the loop on a single failure — current + // box / route is still serving. Try again next tick. + slog.Warn("peer cred rotation failed; current creds remain in use", "err", err) + } +} + // rotateCreds atomically swaps the peer's samizdat credentials. On // success: a fresh route_id and keypair are in use, the libbox inbound // has been rebuilt against the new options, the prior route is @@ -532,8 +571,24 @@ func (c *Client) rotateCreds(ctx context.Context) error { if err != nil { return fmt.Errorf("re-register: %w", err) } + // From here on, any error path must deregister regResp.RouteID — + // otherwise the newly-created server-side row leaks until TTL expiry + // and the bandit catalog may briefly hand out creds for a route + // whose box never came up. + cleanupNewRoute := func(reason error) { + // Use a fresh ctx so a cancelled rotation ctx doesn't skip the + // cleanup we just made necessary. + cleanupCtx, cancel := context.WithTimeout(context.Background(), peerCleanupTimeout) + defer cancel() + if dErr := c.cfg.API.Deregister(cleanupCtx, regResp.RouteID); dErr != nil { + slog.Warn("deregister orphan route after rotation failure", + "reason", reason, "err", dErr, "orphan_route_id", regResp.RouteID) + } + } + options, err := ensurePeerOutboundsBypassVPN(regResp.ServerConfig) if err != nil { + cleanupNewRoute(err) return fmt.Errorf("patch sing-box options: %w", err) } @@ -542,34 +597,54 @@ func (c *Client) rotateCreds(ctx context.Context) error { c.mu.Unlock() if runCtx == nil { // Stop happened between the unlock above and here. Skip the - // build to avoid spinning up a libbox tied to a torn-down ctx. - // The new register row is harmless — server-side reaper will - // deprecate it after TTL since no heartbeat will arrive. + // build to avoid spinning up a libbox tied to a torn-down ctx, + // and clean up the just-created route so it doesn't linger. + cleanupNewRoute(errors.New("client stopped during rotation")) return errors.New("client stopped during rotation") } newBox, err := c.cfg.BuildBoxService(runCtx, options) if err != nil { + cleanupNewRoute(err) return fmt.Errorf("build new sing-box: %w", err) } // Close old, start new. Order matters — both want the same port. - // If newBox.Start fails after oldBox.Close, we lost the listener - // and the next heartbeat / rotation tick is the recovery point. + // If newBox.Start fails after oldBox.Close, retry briefly to absorb + // router-side TIME_WAIT / EADDRINUSE windows before giving up. if closeErr := oldBox.Close(); closeErr != nil { slog.Warn("close old box during rotation", "err", closeErr) } - if err := newBox.Start(); err != nil { + if err := startNewBoxWithRetry(ctx, newBox); err != nil { // Catastrophic: port is now unbound. Leave c.box pointing at // oldBox so a future Stop tries to close it (idempotent on - // already-closed); the next rotation tick will try again. + // already-closed); the next rotation tick will try again. Also + // deregister the now-orphan new route so the bandit doesn't + // hand its creds out for a non-listening port. + cleanupNewRoute(err) return fmt.Errorf("start new sing-box: %w", err) } + // Final swap under lock. Re-check active so a Stop racing us between + // runCtx-check above and now doesn't get resurrected by overwriting + // the cleared state Stop just set up. c.mu.Lock() + if !c.active { + c.mu.Unlock() + // Stop already cleared c.box / c.routeID. Close the new box we + // just brought up (Stop has no reference to it) and deregister + // the new route. The old route Stop already deregistered as + // part of its own teardown. + if err := newBox.Close(); err != nil { + slog.Warn("close new box after Stop raced rotation", "err", err) + } + cleanupNewRoute(errors.New("client stopped during rotation swap")) + return errors.New("client stopped during rotation") + } c.box = newBox c.routeID = regResp.RouteID c.boxOptions = options c.status.RouteID = regResp.RouteID + c.status.ExternalIP = externalIP c.mu.Unlock() // Deregister the prior route so the bandit stops handing the old @@ -588,6 +663,32 @@ func (c *Client) rotateCreds(ctx context.Context) error { return nil } +// startNewBoxWithRetry retries newBox.Start a handful of times with a +// short backoff to absorb router-side TIME_WAIT / EADDRINUSE between +// oldBox.Close releasing the port and newBox.Start re-binding it. Total +// wait is bounded under 1s so a healthy rotation isn't delayed +// noticeably; the alternative is leaving the peer's listener down for +// the full rotation interval (default 1h) on a transient bind failure. +func startNewBoxWithRetry(ctx context.Context, newBox boxService) error { + const attempts = 5 + backoff := 50 * time.Millisecond + var lastErr error + for i := 0; i < attempts; i++ { + if err := newBox.Start(); err == nil { + return nil + } else { + lastErr = err + } + select { + case <-ctx.Done(): + return fmt.Errorf("start new sing-box (ctx cancelled after %d attempts): %w", i+1, lastErr) + case <-time.After(backoff): + } + backoff *= 2 + } + return fmt.Errorf("start new sing-box (%d attempts): %w", attempts, lastErr) +} + // ensurePeerOutboundsBypassVPN guarantees the peer sing-box's outbound dials // bind to the physical interface rather than whatever the OS routing table // picks. Without this, when the user's own Lantern VPN is up its TUN holds diff --git a/peer/peer_test.go b/peer/peer_test.go index d1e635c1..72368077 100644 --- a/peer/peer_test.go +++ b/peer/peer_test.go @@ -647,9 +647,15 @@ func TestClient_RotatesCredentialsAtInterval(t *testing.T) { srv.registerCount.Load()) // Each rotation deregisters the prior route — N rotations => - // N deregisters (initial register is not preceded by one). + // N deregisters (initial register is not preceded by one). Wait + // briefly for the deregister to catch up to the most recent + // rotation; rotation issues the deregister after the swap-lock + // completes, so there's a small race window between observing the + // new register and the corresponding deregister landing. rotations := srv.registerCount.Load() - 1 - assert.GreaterOrEqual(t, srv.deregisterCount.Load(), rotations-1, + require.Eventually(t, func() bool { + return srv.deregisterCount.Load() >= rotations + }, 500*time.Millisecond, 25*time.Millisecond, "each rotation should deregister the prior route_id (got %d deregs vs %d rotations)", srv.deregisterCount.Load(), rotations) From 1c44635abd60b14c81893d9629c51390d5a6a876 Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 29 May 2026 23:47:03 -0600 Subject: [PATCH 3/4] peer: address Copilot review on #472 (round 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seven follow-ups from the round-2 re-review: 1. startNewBoxWithRetry now wraps the libbox.Start call in a deferred recover that converts a panic into an error return. Without this, runRotation's recover one frame up catches the panic but only after rotateCreds' cleanupNewRoute path has been skipped — orphaning the freshly-registered route and leaving the port unbound. 2. Stop→Start race during rotation. The previous `!c.active` re-check at the swap lock missed the case where Stop cleared state AND a new Start re-set active=true before this rotation reached the swap; the old rotation would then overwrite the new session's box / routeID. rotateCreds now captures c.runCtx as sessionRunCtx at the top of the function and gates both the BuildBoxService call and the final swap on pointer-identity with the current c.runCtx — different ctx means a Stop→Start cycle replaced the session and our newBox / new route belong to the prior session, so close + deregister. 3. Post-swap deregister of the prior route now uses a fresh peerCleanupTimeout-bounded Background ctx instead of the rotation ctx. Stop cancelling that ctx between the swap and the deregister would have left the old (now-invalid) route in the server catalog until TTL — defeating the entire stale-credential cap the rotation is supposed to enforce. 4. startNewBoxWithRetry budget. The previous loop slept after every failed attempt, including the fifth (no point — we wouldn't try again), pushing total backoff to 1550ms when the comment claimed <1s. Now skips the final sleep, total budget 750ms (50+100+200+400 across 4 sleeps), comment updated to match. 5. peerCredRotationInterval doc dropped an engineering#3440 reference that violated AGENTS.md:13-17 (no ticket refs in code comments). Kept the rationale ('H2 leakage paths') without naming the issue. 6. runRotation doc dropped the explicit 'vpn/tunnel.go' reference — the 'the main tunnel start path already wraps it with recover for the same reason' framing carries the same context without naming the file. 7. TestClient_RotatesCredentialsAtInterval doc dropped the engineering#3437 ticket reference; rewords as 'pins the rotation invariant'. All tests pass under -race -count=1 across 5 consecutive runs. Co-Authored-By: Claude Opus 4.7 --- peer/peer.go | 104 +++++++++++++++++++++++++++++++--------------- peer/peer_test.go | 8 ++-- 2 files changed, 75 insertions(+), 37 deletions(-) diff --git a/peer/peer.go b/peer/peer.go index dfa77cf8..18eafc2a 100644 --- a/peer/peer.go +++ b/peer/peer.go @@ -121,8 +121,8 @@ type Client struct { // lantern-cloud (new route_id, new keypair, new shortID), rebuilds the // libbox service against the new options, and deregisters the prior // route. Caps blast radius from credential leakage (logs, telemetry, -// memory dumps, the H2 leakage path in engineering#3440) to ~1h -// regardless of peer process lifetime. +// memory dumps, H2 leakage paths) to ~1h regardless of peer process +// lifetime. // // Cost per rotation: one API.Register + Deregister round trip, one // libbox build + start + close cycle. Brief (~hundreds-of-ms) port- @@ -501,11 +501,11 @@ func (c *Client) credRotationLoop(ctx context.Context, interval time.Duration) { } // runRotation wraps rotateCreds with a panic recover. libbox.Start can -// panic (see the recover in vpn/tunnel.go's tunnel-start path); an -// unrecovered panic here would crash the host process during a -// background rotation, taking the user's main VPN with it. Treat a -// panic the same as any other rotation failure: log, keep the existing -// box serving, try again next tick. +// panic — the main tunnel start path already wraps it with recover for +// the same reason — and an unrecovered panic here would crash the host +// process during a background rotation, taking the user's main VPN +// with it. Treat a panic the same as any other rotation failure: log, +// keep the existing box serving, try again next tick. func (c *Client) runRotation(ctx context.Context) { defer func() { if r := recover(); r != nil { @@ -553,6 +553,14 @@ func (c *Client) rotateCreds(ctx context.Context) error { intPort := c.internalPort oldRouteID := c.routeID oldBox := c.box + // Capture sessionRunCtx upfront so the final swap can detect a + // Stop→Start cycle that happened mid-rotation. Just checking + // c.active at the swap isn't enough: Stop clears active, then a + // new Start can set it back to true before this rotation reaches + // the swap, and the old rotation would clobber the new session's + // state. The runCtx is unique per session, so a pointer-identity + // check at the swap is sufficient. + sessionRunCtx := c.runCtx c.mu.Unlock() if fwd == nil || oldBox == nil { @@ -593,16 +601,17 @@ func (c *Client) rotateCreds(ctx context.Context) error { } c.mu.Lock() - runCtx := c.runCtx + currentRunCtx := c.runCtx c.mu.Unlock() - if runCtx == nil { - // Stop happened between the unlock above and here. Skip the - // build to avoid spinning up a libbox tied to a torn-down ctx, - // and clean up the just-created route so it doesn't linger. + if currentRunCtx == nil || currentRunCtx != sessionRunCtx { + // Stop ran (runCtx==nil), or a Stop→Start cycle replaced the + // session (runCtx pointer differs from the one captured at the + // top). Either way the build below would tie a libbox to the + // wrong session; skip it and clean up the just-created route. cleanupNewRoute(errors.New("client stopped during rotation")) return errors.New("client stopped during rotation") } - newBox, err := c.cfg.BuildBoxService(runCtx, options) + newBox, err := c.cfg.BuildBoxService(sessionRunCtx, options) if err != nil { cleanupNewRoute(err) return fmt.Errorf("build new sing-box: %w", err) @@ -624,21 +633,26 @@ func (c *Client) rotateCreds(ctx context.Context) error { return fmt.Errorf("start new sing-box: %w", err) } - // Final swap under lock. Re-check active so a Stop racing us between - // runCtx-check above and now doesn't get resurrected by overwriting - // the cleared state Stop just set up. + // Final swap under lock. Re-check that the session is still the + // one we started against. Per-session runCtx identity is a stricter + // check than c.active alone: a Stop→Start cycle between the + // runCtx-check above and now would have cleared active AND set it + // back true, but the new session's runCtx differs from sessionRunCtx + // — so we'd otherwise resurrect old-session state into the new one. c.mu.Lock() - if !c.active { + if !c.active || c.runCtx != sessionRunCtx { c.mu.Unlock() - // Stop already cleared c.box / c.routeID. Close the new box we - // just brought up (Stop has no reference to it) and deregister - // the new route. The old route Stop already deregistered as - // part of its own teardown. + // Either Stop cleared state, or Stop→Start replaced the session. + // Close the new box we just brought up (the current session has + // no reference to it) and deregister the new route. Don't touch + // the prior route: in the Stop-only case, Stop already + // deregistered it; in the Stop→Start case, deregistering it + // would defeat the rotation point of cutting off the old creds. if err := newBox.Close(); err != nil { - slog.Warn("close new box after Stop raced rotation", "err", err) + slog.Warn("close new box after session changed during rotation", "err", err) } - cleanupNewRoute(errors.New("client stopped during rotation swap")) - return errors.New("client stopped during rotation") + cleanupNewRoute(errors.New("session changed during rotation swap")) + return errors.New("session changed during rotation") } c.box = newBox c.routeID = regResp.RouteID @@ -648,13 +662,17 @@ func (c *Client) rotateCreds(ctx context.Context) error { c.mu.Unlock() // Deregister the prior route so the bandit stops handing the old - // (now-invalid) creds to clients. Best-effort: the prior row will - // expire from its TTL anyway, but explicit deregister cuts the - // stale-creds window from up-to-TTL down to ~immediately. - if err := c.cfg.API.Deregister(ctx, oldRouteID); err != nil { + // (now-invalid) creds to clients. Use a fresh ctx so a Stop that + // races us between the swap above and the deregister doesn't cancel + // the cleanup — leaving the old (now-invalid-locally) route in the + // server catalog until TTL would defeat the rotation's stale-cred + // cap, which is the whole point of the feature. + deregCtx, cancelDereg := context.WithTimeout(context.Background(), peerCleanupTimeout) + if err := c.cfg.API.Deregister(deregCtx, oldRouteID); err != nil { slog.Warn("deregister prior route after rotation", "err", err, "old_route_id", oldRouteID) } + cancelDereg() slog.Info("peer cred rotation succeeded", "new_route_id", regResp.RouteID, @@ -665,11 +683,25 @@ func (c *Client) rotateCreds(ctx context.Context) error { // startNewBoxWithRetry retries newBox.Start a handful of times with a // short backoff to absorb router-side TIME_WAIT / EADDRINUSE between -// oldBox.Close releasing the port and newBox.Start re-binding it. Total -// wait is bounded under 1s so a healthy rotation isn't delayed -// noticeably; the alternative is leaving the peer's listener down for -// the full rotation interval (default 1h) on a transient bind failure. -func startNewBoxWithRetry(ctx context.Context, newBox boxService) error { +// oldBox.Close releasing the port and newBox.Start re-binding it. +// Inter-attempt backoff totals 750ms (50+100+200+400 across 4 sleeps; +// no sleep after the final attempt) so a healthy rotation isn't +// delayed noticeably; the alternative is leaving the peer's listener +// down for the full rotation interval (default 1h) on a transient +// bind failure. +// +// libbox.Start can panic; convert that to an error here rather than +// letting it propagate. Without this, the recover in runRotation would +// catch the panic but only after rotateCreds' cleanupNewRoute path +// has been skipped — leaving the freshly-registered route orphaned +// and the port unbound until next rotation. Returning the panic as +// an error lets rotateCreds' deferred cleanup deregister the orphan. +func startNewBoxWithRetry(ctx context.Context, newBox boxService) (retErr error) { + defer func() { + if r := recover(); r != nil { + retErr = fmt.Errorf("start new sing-box panicked: %v", r) + } + }() const attempts = 5 backoff := 50 * time.Millisecond var lastErr error @@ -679,6 +711,12 @@ func startNewBoxWithRetry(ctx context.Context, newBox boxService) error { } else { lastErr = err } + // Skip the sleep on the final attempt — we won't try again, + // so the wait is pure latency that would push total backoff + // above the documented sub-1s budget. + if i == attempts-1 { + break + } select { case <-ctx.Done(): return fmt.Errorf("start new sing-box (ctx cancelled after %d attempts): %w", i+1, lastErr) diff --git a/peer/peer_test.go b/peer/peer_test.go index 72368077..b4c9af2e 100644 --- a/peer/peer_test.go +++ b/peer/peer_test.go @@ -585,10 +585,10 @@ func TestAPIError_StringFormat(t *testing.T) { assert.Contains(t, e.Error(), "could not connect") } -// TestClient_RotatesCredentialsAtInterval pins the C2 fix from -// engineering#3437: the peer client must re-register and rebuild its -// libbox inbound on a schedule so a leaked credential's blast radius is -// bounded by CredRotationInterval rather than peer process lifetime. +// TestClient_RotatesCredentialsAtInterval pins the rotation invariant: +// the peer client must re-register and rebuild its libbox inbound on +// a schedule so a leaked credential's blast radius is bounded by +// CredRotationInterval rather than peer process lifetime. // // Drives a short rotation interval (50ms) and asserts: // 1. Multiple registers happen (start + ≥2 rotations within 250ms). From fe88fd74d58468ef0997fceba4fbee6eeac0837b Mon Sep 17 00:00:00 2001 From: Adam Fisk Date: Fri, 29 May 2026 23:56:10 -0600 Subject: [PATCH 4/4] peer: address Copilot review on #472 (round 3) Two follow-ups from the round-3 re-review: 1. Test asserted against the internal c.routeID field directly. A future change that updated the routeID used by heartbeats but forgot to mirror it into Status.RouteID would still pass the test. Use CurrentStatus() so the assertion pins the observable contract. 2. rotateCreds docstring claimed 'On failure: the prior creds and box continue serving' across all failures, but a failure after oldBox.Close (startNewBoxWithRetry exhausts retries or panics) leaves the listener down until the next rotation tick rebinds. Narrow the docstring to distinguish failures before and after oldBox.Close; the router-side port mapping survives in both cases, only the in-process listener state differs. No behavior change. Co-Authored-By: Claude Opus 4.7 --- peer/peer.go | 18 +++++++++++++++--- peer/peer_test.go | 9 +++++---- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/peer/peer.go b/peer/peer.go index 18eafc2a..270f25eb 100644 --- a/peer/peer.go +++ b/peer/peer.go @@ -524,9 +524,21 @@ func (c *Client) runRotation(ctx context.Context) { // success: a fresh route_id and keypair are in use, the libbox inbound // has been rebuilt against the new options, the prior route is // deregistered server-side, and the FlutterEvent stream sees no gap. -// On failure: the prior creds and box continue serving — rotation is -// best-effort. The router-side port mapping is preserved across the -// rotation; only the in-process samizdat state changes. +// +// On failure, behavior depends on where rotation aborted: +// - Before oldBox.Close (Register, options patch, BuildBoxService, +// stop-raced-rotation paths) — the prior box keeps serving with +// its existing creds; the newly-registered route (if any) is +// deregistered via cleanupNewRoute. +// - After oldBox.Close (startNewBoxWithRetry exhausts retries or +// panics) — the listener is down until the next rotation tick +// successfully rebinds. The router-side port mapping survives; +// only the in-process listener is gone. The new route is +// deregistered so the bandit doesn't hand its creds out for a +// non-listening port. +// +// In both cases the router-side port mapping is preserved; only the +// in-process samizdat state changes. // // Sequence: // 1. Re-register with the same (externalIP, externalPort) as Start. diff --git a/peer/peer_test.go b/peer/peer_test.go index b4c9af2e..96fc65a0 100644 --- a/peer/peer_test.go +++ b/peer/peer_test.go @@ -660,10 +660,11 @@ func TestClient_RotatesCredentialsAtInterval(t *testing.T) { srv.deregisterCount.Load(), rotations) // RouteID exposed via Status should reflect the latest rotation. - c.mu.Lock() - currentRouteID := c.routeID - c.mu.Unlock() - assert.NotEqual(t, "00000000-0000-0000-0000-000000000001", currentRouteID, + // Asserting against CurrentStatus() rather than the internal field + // pins the observable contract: a future change that updates the + // route used by heartbeats but forgets to mirror it into + // Status.RouteID would fail this assertion. + assert.NotEqual(t, "00000000-0000-0000-0000-000000000001", c.CurrentStatus().RouteID, "current route_id should have advanced past the initial register") // Multiple boxes built; first one closed.