From 7941e34954dde91de2283da8159d01081722b968 Mon Sep 17 00:00:00 2001 From: Bruno Lima Date: Fri, 17 Apr 2026 11:30:21 -0300 Subject: [PATCH] fix(tenant-manager/client): build HTTP transport explicitly to avoid HTTP/2 leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit http.DefaultTransport can be mutated at boot by other libraries (notably OpenTelemetry registering an h2 handler in TLSNextProto). Cloning it inherited that handler, so HTTPS connections negotiated HTTP/2 via ALPN and hit the stdlib hpack encoder issue under the concurrent-goroutine usage pattern of this client — reproduced in production on 2026-04-10 (61 errors on reporter-manager, 14 on reporter-worker, circuit-breaker cascade, 503s). Replace the Clone()-based construction with an explicit *http.Transport. TLSNextProto is initialized to a non-nil empty map (the stdlib opt-out signal). Proxy, DialContext, TLS, pool and timeout settings are now set explicitly, no longer inherited from a mutable global. HTTP/1.1 is the deliberate choice for this client because middleware and async revalidation share the same host concurrently. Adds client_transport_test.go with regression tests that contaminate http.DefaultTransport with an h2 handler and assert it does not leak into the client, plus pinning tests for pool and timeout defaults. Refs: docs/lib-commons/incidents/2026-04-10-http2-protocol-mismatch.md Generated-by: Claude AI-Model: claude-opus-4-7 --- commons/tenant-manager/client/client.go | 51 ++++++++--- .../client/client_transport_test.go | 86 +++++++++++++++++++ 2 files changed, 123 insertions(+), 14 deletions(-) create mode 100644 commons/tenant-manager/client/client_transport_test.go diff --git a/commons/tenant-manager/client/client.go b/commons/tenant-manager/client/client.go index 944a7f02..9b2324d5 100644 --- a/commons/tenant-manager/client/client.go +++ b/commons/tenant-manager/client/client.go @@ -4,9 +4,11 @@ package client import ( "context" + "crypto/tls" "encoding/json" "fmt" "io" + "net" "net/http" "net/url" "sync" @@ -667,23 +669,44 @@ func (c *Client) GetActiveTenantsByService(ctx context.Context, service string) return tenants, nil } -// newDefaultHTTPClient creates an HTTP client by cloning http.DefaultTransport -// so that Proxy, TLS, DialContext and other platform defaults are preserved. -// HTTP/2 is explicitly disabled to prevent a known Go stdlib panic in the -// HTTP/2 hpack encoder when multiple goroutines concurrently use the same -// HTTP/2 connection (e.g., middleware + async revalidation goroutines). +// newDefaultHTTPClient creates an HTTP client with a fully explicit *http.Transport. +// +// Historically this function cloned http.DefaultTransport to inherit platform +// defaults (Proxy, DialContext, TLS settings). That approach was fragile: +// OpenTelemetry (and other libraries) instrument http.DefaultTransport at boot +// by registering HTTP/2 handlers in the TLSNextProto map. Clone() copies that +// map, and setting ForceAttemptHTTP2 = false only prevents future auto-setup — +// it does NOT clear handlers that are already present. The inherited "h2" +// handler then triggers HTTP/2 negotiation via ALPN on HTTPS connections, +// which interacts poorly with the concurrent-goroutine usage pattern of this +// client (middleware + async revalidation) and has caused production incidents +// (malformed response errors, 503s, circuit breaker cascade). +// +// Building the transport from scratch decouples the client from the mutable +// global http.DefaultTransport entirely. HTTP/1.1 is the deliberate choice — +// a single connection is not shared across goroutines under HTTP/1.1 (each +// goroutine gets its own connection from the pool), avoiding a known stdlib +// HTTP/2 hpack encoder issue under concurrent writes. +// +// TLSNextProto is initialized to a non-nil empty map: this is the explicit +// opt-out signal the stdlib respects — a nil map instead triggers HTTP/2 +// auto-setup on the next HTTPS handshake. func newDefaultHTTPClient() *http.Client { - base, ok := http.DefaultTransport.(*http.Transport) - if !ok { - base = &http.Transport{} + t := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + ForceAttemptHTTP2: false, + TLSNextProto: map[string]func(string, *tls.Conn) http.RoundTripper{}, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 10, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, } - t := base.Clone() - t.ForceAttemptHTTP2 = false - t.MaxIdleConns = 100 - t.MaxIdleConnsPerHost = 10 - t.IdleConnTimeout = 90 * time.Second - return &http.Client{ Timeout: 30 * time.Second, Transport: t, diff --git a/commons/tenant-manager/client/client_transport_test.go b/commons/tenant-manager/client/client_transport_test.go new file mode 100644 index 00000000..e956d552 --- /dev/null +++ b/commons/tenant-manager/client/client_transport_test.go @@ -0,0 +1,86 @@ +package client + +import ( + "crypto/tls" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestNewDefaultHTTPClient_UsesHTTP1Only is a regression test for the +// 2026-04-10 production incident where an inherited "h2" handler in +// http.DefaultTransport.TLSNextProto caused HTTP/2 negotiation and malformed +// responses under concurrent load. +// +// The invariant enforced here: newDefaultHTTPClient must produce a transport +// that will never negotiate HTTP/2 via ALPN, regardless of what any other +// package (OpenTelemetry, middleware, etc.) did to http.DefaultTransport. +func TestNewDefaultHTTPClient_UsesHTTP1Only(t *testing.T) { + // Simulate OTel boot-time instrumentation that registers an "h2" handler. + // Save and restore so we don't affect other tests. + originalTransport := http.DefaultTransport + defer func() { http.DefaultTransport = originalTransport }() + + contaminated := &http.Transport{ + TLSNextProto: map[string]func(string, *tls.Conn) http.RoundTripper{ + "h2": func(string, *tls.Conn) http.RoundTripper { return nil }, + }, + } + http.DefaultTransport = contaminated + + client := newDefaultHTTPClient() + require.NotNil(t, client) + require.NotNil(t, client.Transport) + + transport, ok := client.Transport.(*http.Transport) + require.True(t, ok, "transport must be *http.Transport") + + // TLSNextProto must be non-nil (explicit opt-out) and must NOT contain h2. + // A nil map would trigger stdlib's HTTP/2 auto-setup; an "h2" entry would + // make ALPN negotiate HTTP/2. + require.NotNil(t, transport.TLSNextProto, "TLSNextProto must be non-nil to opt out of HTTP/2 auto-setup") + assert.Empty(t, transport.TLSNextProto, "TLSNextProto must be empty — no h2 or other handlers") + _, hasH2 := transport.TLSNextProto["h2"] + assert.False(t, hasH2, "contaminated http.DefaultTransport must not leak h2 handler into client") + + assert.False(t, transport.ForceAttemptHTTP2, "ForceAttemptHTTP2 must be false") +} + +// TestNewDefaultHTTPClient_TransportIsIsolated ensures the client's transport +// does not share memory with http.DefaultTransport. A future mutation of the +// global must not leak into an already-constructed client. +func TestNewDefaultHTTPClient_TransportIsIsolated(t *testing.T) { + client := newDefaultHTTPClient() + transport, ok := client.Transport.(*http.Transport) + require.True(t, ok) + + defaultTransport, ok := http.DefaultTransport.(*http.Transport) + require.True(t, ok) + + assert.NotSame(t, defaultTransport, transport, + "client transport must not share pointer identity with http.DefaultTransport") +} + +// TestNewDefaultHTTPClient_ExpectedDefaults pins the transport configuration +// so that silent regressions on pool sizing, timeouts, or proxy support are +// caught by CI. +func TestNewDefaultHTTPClient_ExpectedDefaults(t *testing.T) { + client := newDefaultHTTPClient() + require.NotNil(t, client) + + assert.Equal(t, 30*time.Second, client.Timeout, "Client.Timeout") + + transport, ok := client.Transport.(*http.Transport) + require.True(t, ok) + + assert.NotNil(t, transport.Proxy, "Proxy must be set (http.ProxyFromEnvironment)") + assert.NotNil(t, transport.DialContext, "DialContext must be set via net.Dialer") + assert.Equal(t, 100, transport.MaxIdleConns, "MaxIdleConns") + assert.Equal(t, 10, transport.MaxIdleConnsPerHost, "MaxIdleConnsPerHost") + assert.Equal(t, 90*time.Second, transport.IdleConnTimeout, "IdleConnTimeout") + assert.Equal(t, 10*time.Second, transport.TLSHandshakeTimeout, "TLSHandshakeTimeout") + assert.Equal(t, 1*time.Second, transport.ExpectContinueTimeout, "ExpectContinueTimeout") +}