From 234fc8811d069f1a813a6eed9bba4d2467505abf Mon Sep 17 00:00:00 2001 From: Zeeshan Lakhani Date: Thu, 7 May 2026 15:27:35 +0000 Subject: [PATCH] [ip-pool] block silo unlinking when multicast groups are allocated from the pool Fixes the issue found in https://github.com/oxidecomputer/omicron/issues/10304: > P2-13 (novel): unlink_ip_pool_from_external_silo_query does not check multicast_group for outstanding rows. > Purely missing from the guard not a race. A non-racy DELETE /v1/system/ip-pools/{pool}/silos/{silo} can > unlink a silo's multicast pool even when the silo has live multicast groups allocated. Predates the multicast > feature, so Some IP Pool database operations may not be concurrency-safe #8992 doesn't cover it. This adds a `multicast_groups` CTE to the guard, mirroring the existing multicast check in `ip_pool_delete_range`. --- nexus/db-queries/src/db/datastore/ip_pool.rs | 55 +++++++++++++++---- .../integration_tests/multicast/groups.rs | 11 ++++ 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ip_pool.rs b/nexus/db-queries/src/db/datastore/ip_pool.rs index 1b5c334ca79..e4101e60100 100644 --- a/nexus/db-queries/src/db/datastore/ip_pool.rs +++ b/nexus/db-queries/src/db/datastore/ip_pool.rs @@ -114,7 +114,8 @@ const BAD_SILO_LINK_ERROR: &str = "IP Pools cannot be both linked to external \ Silos and reserved for internal Oxide usage."; // Error message emitted when a user attempts to unlink an IP Pool from a Silo -// while the pool has external IP addresses allocated from it. +// while the pool has external IP addresses or multicast groups allocated +// from it. const POOL_HAS_IPS_ERROR: &str = "IP addresses from this pool are in use in the linked silo"; @@ -2202,14 +2203,18 @@ fn link_ip_pool_to_external_silo_query( // Query to conditionally unlink an IP Pool from an external / customer Silo. // -// This deletes the link iff there are no outstanding instance external IPs or -// floating IPs allocated out of the pool, to objects in the Silo. +// This deletes the link iff there are no remaining instance external IPs or +// floating IPs in the Silo. Multicast groups aren't silo-scoped at the row +// level (they're explicitly cross-silo), so we additionally block unlinking +// while any remain (groups in the deleting state still hold their pool IP), +// mirroring `ip_pool_delete_range`. A group allocated from this pool by any +// silo blocks unlinking from every silo linked to the pool. // // The full query is: // // ``` // -- This CTE returns one row if there are any external IPs attached to -// -- instances, in any projects in the Silo. +// -- instances in any projects in the Silo. // WITH instance_ips AS ( // SELECT 1 // FROM external_ip @@ -2240,20 +2245,33 @@ fn link_ip_pool_to_external_silo_query( // project.silo_id = $4 AND // project.time_deleted IS NULL // LIMIT 1 +// ), +// -- This CTE returns one row if any multicast groups are allocated out of +// -- the pool. Groups are cross-silo by design, so the existence check is +// -- not silo-scoped. +// multicast_groups AS ( +// SELECT 1 +// FROM multicast_group +// WHERE +// multicast_group.time_deleted IS NULL AND +// multicast_group.ip_pool_id = $5 +// LIMIT 1 // ) // -- Delete the requested link by primary key, but conditionally. // DELETE FROM ip_pool_resource // WHERE -// ip_pool_id = $7 AND +// ip_pool_id = $6 AND // resource_type = 'silo' AND -// resource_id = $8 AND -// -- If there are any external IPs, this generates an error casting 'eips' to -// -- a boolean, which we detect and handle. +// resource_id = $7 AND +// -- If there are any allocations, this generates an error casting +// -- 'has-allocs' to a boolean, which we detect and handle. // CAST(IF(EXISTS( // SELECT 1 FROM instance_ips // UNION ALL // SELECT 1 FROM floating_ips -// ), 'eips', 'true') AS BOOL) +// UNION ALL +// SELECT 1 FROM multicast_groups +// ), 'has-allocs', 'true') AS BOOL) // ``` fn unlink_ip_pool_from_external_silo_query( ip_pool_id: Uuid, @@ -2301,7 +2319,18 @@ fn unlink_ip_pool_from_external_silo_query( .param() .bind::(silo_id) .sql( - " AND project.time_deleted IS NULL LIMIT 1) \ + " AND project.time_deleted IS NULL LIMIT 1), \ + multicast_groups AS (\ + SELECT 1 \ + FROM multicast_group \ + WHERE \ + multicast_group.time_deleted IS NULL AND \ + multicast_group.ip_pool_id = ", + ) + .param() + .bind::(ip_pool_id) + .sql( + " LIMIT 1) \ DELETE FROM ip_pool_resource \ WHERE ip_pool_id = ", ) @@ -2321,9 +2350,11 @@ fn unlink_ip_pool_from_external_silo_query( EXISTS(\ SELECT 1 FROM instance_ips \ UNION ALL \ - SELECT 1 FROM floating_ips\ + SELECT 1 FROM floating_ips \ + UNION ALL \ + SELECT 1 FROM multicast_groups\ ), \ - 'has-eips', \ + 'has-allocs', \ 'true'\ ) AS BOOL)", ); diff --git a/nexus/tests/integration_tests/multicast/groups.rs b/nexus/tests/integration_tests/multicast/groups.rs index 9ae620b7dc2..6f810eb04d7 100644 --- a/nexus/tests/integration_tests/multicast/groups.rs +++ b/nexus/tests/integration_tests/multicast/groups.rs @@ -866,6 +866,14 @@ async fn test_cannot_delete_multicast_pool_with_groups( "IP Pool cannot be deleted while it contains IP ranges" ); + // Verify we can't unlink the pool from the silo while groups are + // allocated from it. + let unlink_url = format!( + "/v1/system/ip-pools/{pool_name}/silos/{}", + DEFAULT_SILO.name().as_str() + ); + object_delete_error(client, &unlink_url, StatusCode::BAD_REQUEST).await; + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(cptestctx, group_name).await; @@ -882,6 +890,9 @@ async fn test_cannot_delete_multicast_pool_with_groups( "Should be able to delete range after groups are implicitly deleted", ); + // And we can unlink the pool from the silo + object_delete(client, &unlink_url).await; + // And now we should be able to delete the pool NexusRequest::object_delete(client, &pool_url) .authn_as(AuthnMode::PrivilegedUser)