diff --git a/commons/tenant-manager/client/client.go b/commons/tenant-manager/client/client.go index 944a7f02..9b2324d5 100644 --- a/commons/tenant-manager/client/client.go +++ b/commons/tenant-manager/client/client.go @@ -4,9 +4,11 @@ package client import ( "context" + "crypto/tls" "encoding/json" "fmt" "io" + "net" "net/http" "net/url" "sync" @@ -667,23 +669,44 @@ func (c *Client) GetActiveTenantsByService(ctx context.Context, service string) return tenants, nil } -// newDefaultHTTPClient creates an HTTP client by cloning http.DefaultTransport -// so that Proxy, TLS, DialContext and other platform defaults are preserved. -// HTTP/2 is explicitly disabled to prevent a known Go stdlib panic in the -// HTTP/2 hpack encoder when multiple goroutines concurrently use the same -// HTTP/2 connection (e.g., middleware + async revalidation goroutines). +// newDefaultHTTPClient creates an HTTP client with a fully explicit *http.Transport. +// +// Historically this function cloned http.DefaultTransport to inherit platform +// defaults (Proxy, DialContext, TLS settings). That approach was fragile: +// OpenTelemetry (and other libraries) instrument http.DefaultTransport at boot +// by registering HTTP/2 handlers in the TLSNextProto map. Clone() copies that +// map, and setting ForceAttemptHTTP2 = false only prevents future auto-setup — +// it does NOT clear handlers that are already present. The inherited "h2" +// handler then triggers HTTP/2 negotiation via ALPN on HTTPS connections, +// which interacts poorly with the concurrent-goroutine usage pattern of this +// client (middleware + async revalidation) and has caused production incidents +// (malformed response errors, 503s, circuit breaker cascade). +// +// Building the transport from scratch decouples the client from the mutable +// global http.DefaultTransport entirely. HTTP/1.1 is the deliberate choice — +// a single connection is not shared across goroutines under HTTP/1.1 (each +// goroutine gets its own connection from the pool), avoiding a known stdlib +// HTTP/2 hpack encoder issue under concurrent writes. +// +// TLSNextProto is initialized to a non-nil empty map: this is the explicit +// opt-out signal the stdlib respects — a nil map instead triggers HTTP/2 +// auto-setup on the next HTTPS handshake. func newDefaultHTTPClient() *http.Client { - base, ok := http.DefaultTransport.(*http.Transport) - if !ok { - base = &http.Transport{} + t := &http.Transport{ + Proxy: http.ProxyFromEnvironment, + DialContext: (&net.Dialer{ + Timeout: 30 * time.Second, + KeepAlive: 30 * time.Second, + }).DialContext, + ForceAttemptHTTP2: false, + TLSNextProto: map[string]func(string, *tls.Conn) http.RoundTripper{}, + MaxIdleConns: 100, + MaxIdleConnsPerHost: 10, + IdleConnTimeout: 90 * time.Second, + TLSHandshakeTimeout: 10 * time.Second, + ExpectContinueTimeout: 1 * time.Second, } - t := base.Clone() - t.ForceAttemptHTTP2 = false - t.MaxIdleConns = 100 - t.MaxIdleConnsPerHost = 10 - t.IdleConnTimeout = 90 * time.Second - return &http.Client{ Timeout: 30 * time.Second, Transport: t, diff --git a/commons/tenant-manager/client/client_transport_test.go b/commons/tenant-manager/client/client_transport_test.go new file mode 100644 index 00000000..e956d552 --- /dev/null +++ b/commons/tenant-manager/client/client_transport_test.go @@ -0,0 +1,86 @@ +package client + +import ( + "crypto/tls" + "net/http" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestNewDefaultHTTPClient_UsesHTTP1Only is a regression test for the +// 2026-04-10 production incident where an inherited "h2" handler in +// http.DefaultTransport.TLSNextProto caused HTTP/2 negotiation and malformed +// responses under concurrent load. +// +// The invariant enforced here: newDefaultHTTPClient must produce a transport +// that will never negotiate HTTP/2 via ALPN, regardless of what any other +// package (OpenTelemetry, middleware, etc.) did to http.DefaultTransport. +func TestNewDefaultHTTPClient_UsesHTTP1Only(t *testing.T) { + // Simulate OTel boot-time instrumentation that registers an "h2" handler. + // Save and restore so we don't affect other tests. + originalTransport := http.DefaultTransport + defer func() { http.DefaultTransport = originalTransport }() + + contaminated := &http.Transport{ + TLSNextProto: map[string]func(string, *tls.Conn) http.RoundTripper{ + "h2": func(string, *tls.Conn) http.RoundTripper { return nil }, + }, + } + http.DefaultTransport = contaminated + + client := newDefaultHTTPClient() + require.NotNil(t, client) + require.NotNil(t, client.Transport) + + transport, ok := client.Transport.(*http.Transport) + require.True(t, ok, "transport must be *http.Transport") + + // TLSNextProto must be non-nil (explicit opt-out) and must NOT contain h2. + // A nil map would trigger stdlib's HTTP/2 auto-setup; an "h2" entry would + // make ALPN negotiate HTTP/2. + require.NotNil(t, transport.TLSNextProto, "TLSNextProto must be non-nil to opt out of HTTP/2 auto-setup") + assert.Empty(t, transport.TLSNextProto, "TLSNextProto must be empty — no h2 or other handlers") + _, hasH2 := transport.TLSNextProto["h2"] + assert.False(t, hasH2, "contaminated http.DefaultTransport must not leak h2 handler into client") + + assert.False(t, transport.ForceAttemptHTTP2, "ForceAttemptHTTP2 must be false") +} + +// TestNewDefaultHTTPClient_TransportIsIsolated ensures the client's transport +// does not share memory with http.DefaultTransport. A future mutation of the +// global must not leak into an already-constructed client. +func TestNewDefaultHTTPClient_TransportIsIsolated(t *testing.T) { + client := newDefaultHTTPClient() + transport, ok := client.Transport.(*http.Transport) + require.True(t, ok) + + defaultTransport, ok := http.DefaultTransport.(*http.Transport) + require.True(t, ok) + + assert.NotSame(t, defaultTransport, transport, + "client transport must not share pointer identity with http.DefaultTransport") +} + +// TestNewDefaultHTTPClient_ExpectedDefaults pins the transport configuration +// so that silent regressions on pool sizing, timeouts, or proxy support are +// caught by CI. +func TestNewDefaultHTTPClient_ExpectedDefaults(t *testing.T) { + client := newDefaultHTTPClient() + require.NotNil(t, client) + + assert.Equal(t, 30*time.Second, client.Timeout, "Client.Timeout") + + transport, ok := client.Transport.(*http.Transport) + require.True(t, ok) + + assert.NotNil(t, transport.Proxy, "Proxy must be set (http.ProxyFromEnvironment)") + assert.NotNil(t, transport.DialContext, "DialContext must be set via net.Dialer") + assert.Equal(t, 100, transport.MaxIdleConns, "MaxIdleConns") + assert.Equal(t, 10, transport.MaxIdleConnsPerHost, "MaxIdleConnsPerHost") + assert.Equal(t, 90*time.Second, transport.IdleConnTimeout, "IdleConnTimeout") + assert.Equal(t, 10*time.Second, transport.TLSHandshakeTimeout, "TLSHandshakeTimeout") + assert.Equal(t, 1*time.Second, transport.ExpectContinueTimeout, "ExpectContinueTimeout") +}