From 19b2ae5ad083a9f94ef770b5f2d190d7f882db32 Mon Sep 17 00:00:00 2001 From: chitadi <119812932+chitadi@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:07:56 +0530 Subject: [PATCH 1/4] Add failover tests for image transfer, container start, and volume creation The test numbers have been adjusted according to PR - 5562 --- .../supervisor_failover/failover_test.go | 202 ++++++++++++++++++ 1 file changed, 202 insertions(+) diff --git a/feature/container/failover/tests/supervisor_failover/failover_test.go b/feature/container/failover/tests/supervisor_failover/failover_test.go index 51613746df6..a18fc31ce67 100644 --- a/feature/container/failover/tests/supervisor_failover/failover_test.go +++ b/feature/container/failover/tests/supervisor_failover/failover_test.go @@ -501,6 +501,208 @@ func TestContainerPersistenceAfterColdReboot(t *testing.T) { }) } +// TestInterruptImageTransferFailover implements CNTR-3.7. +func TestInterruptImageTransferFailover(t *testing.T) { + dut := ondatra.DUT(t, "dut") + ctx := context.Background() + + if containerTarPath(t) == "" { + t.Skip("container_tar flag not set, skipping test") + } + + standbyRPBefore, _, err := findRPs(t, dut) + if err != nil { + t.Fatalf("Failed to find RPs before switchover: %v", err) + } + + cli := containerztest.Client(t, dut) + + t.Cleanup(func() { + t.Log("Starting cleanup...") + cli := containerztest.Client(t, dut) + if err := cli.RemoveImage(ctx, imageName, tag, true); err != nil && status.Code(err) != codes.NotFound && status.Code(err) != codes.Unknown { + t.Logf("Cleanup: failed to remove image %q:%q: %v", imageName, tag, err) + } + t.Log("Cleanup finished.") + }) + + // Ensure switchover is ready before we start the race + switchoverReady := gnmi.OC().Component(standbyRPBefore).SwitchoverReady() + gnmi.Await(t, dut, switchoverReady.State(), 5*time.Minute, true) + + t.Run("InterruptTransfer", func(t *testing.T) { + // Run PushImage in a background goroutine + errCh := make(chan error, 1) + go func() { + progCh, err := cli.PushImage(ctx, imageName, tag, containerTarPath(t), false) + if err != nil { + errCh <- fmt.Errorf("Initial call to PushImage failed: %v", err) + return + } + for prog := range progCh { + if prog.Error != nil { + // We expect an error due to the connection dropping + errCh <- nil + return + } + } + errCh <- nil + }() + + // Trigger switchover while transfer is happening + doSwitchover(t, dut, standbyRPBefore) + + if err := <-errCh; err != nil { + t.Logf("PushImage returned error: %v", err) + } + }) + + t.Run("VerifyInterruption", func(t *testing.T) { + waitForSwitchover(t, dut) + + cli = containerztest.Client(t, dut) // Re-initialize client + + t.Log("Verifying image state after interrupted transfer...") + if err := verifyImageExists(ctx, t, cli, imageName, tag); err != nil { + t.Logf("Image not found after interrupted transfer (expected): %v", err) + } else { + t.Logf("Image fully loaded despite interruption (transfer was likely too fast).") + } + }) +} + +// TestInterruptContainerStartFailover implements CNTR-3.8. +func TestInterruptContainerStartFailover(t *testing.T) { + dut := ondatra.DUT(t, "dut") + ctx := context.Background() + + if containerTarPath(t) == "" { + t.Skip("container_tar flag not set, skipping test") + } + + cli := containerztest.Client(t, dut) + + t.Cleanup(func() { + t.Log("Starting cleanup...") + cli := containerztest.Client(t, dut) + if err := cli.RemoveContainer(ctx, containerName, true); err != nil && status.Code(err) != codes.NotFound && status.Code(err) != codes.Unknown { + t.Logf("Cleanup: failed to remove container %q: %v", containerName, err) + } + if err := cli.RemoveImage(ctx, imageName, tag, true); err != nil && status.Code(err) != codes.NotFound && status.Code(err) != codes.Unknown { + t.Logf("Cleanup: failed to remove image %q:%q: %v", imageName, tag, err) + } + t.Log("Cleanup finished.") + }) + + t.Run("Setup", func(t *testing.T) { + if err := loadImage(ctx, t, cli, imageName, tag, containerTarPath(t)); err != nil { + t.Fatalf("Failed to load image: %v", err) + } + }) + + standbyRPBefore, _, err := findRPs(t, dut) + if err != nil { + t.Fatalf("Failed to find RPs before switchover: %v", err) + } + + switchoverReady := gnmi.OC().Component(standbyRPBefore).SwitchoverReady() + gnmi.Await(t, dut, switchoverReady.State(), 5*time.Minute, true) + + t.Run("InterruptStart", func(t *testing.T) { + errCh := make(chan error, 1) + go func() { + _, err := cli.StartContainer(ctx, imageName, tag, "./cntrsrv", containerName) + if err != nil { + // Expected error due to switchover + errCh <- nil + return + } + errCh <- nil + }() + + doSwitchover(t, dut, standbyRPBefore) + + if err := <-errCh; err != nil { + t.Logf("StartContainer background error: %v", err) + } + }) + + t.Run("VerifyInterruption", func(t *testing.T) { + waitForSwitchover(t, dut) + + cli = containerztest.Client(t, dut) + + t.Log("Verifying container state after interrupted start...") + err := verifyContainerState(ctx, t, cli, containerName, cpb.ListContainerResponse_RUNNING) + if err == nil { + t.Errorf("Container unexpectedly reached RUNNING state despite switchover interruption") + } else { + t.Logf("Container is not RUNNING (expected): %v", err) + } + }) +} + +// TestInterruptVolumeCreationFailover implements CNTR-3.9. +func TestInterruptVolumeCreationFailover(t *testing.T) { + dut := ondatra.DUT(t, "dut") + ctx := context.Background() + + cli := containerztest.Client(t, dut) + + t.Cleanup(func() { + t.Log("Starting cleanup...") + cli := containerztest.Client(t, dut) + if err := cli.RemoveVolume(ctx, volName, true); err != nil && status.Code(err) != codes.NotFound && status.Code(err) != codes.Unknown { + t.Logf("Cleanup: failed to remove volume %q: %v", volName, err) + } + t.Log("Cleanup finished.") + }) + + standbyRPBefore, _, err := findRPs(t, dut) + if err != nil { + t.Fatalf("Failed to find RPs before switchover: %v", err) + } + + switchoverReady := gnmi.OC().Component(standbyRPBefore).SwitchoverReady() + gnmi.Await(t, dut, switchoverReady.State(), 5*time.Minute, true) + + t.Run("InterruptVolumeCreation", func(t *testing.T) { + errCh := make(chan error, 1) + go func() { + volOpts := map[string]string{ + "type": "none", + "options": "bind", + "mountpoint": "/tmp", + } + _, err := cli.CreateVolume(ctx, volName, "local", nil, volOpts) + if err != nil { + errCh <- nil + return + } + errCh <- nil + }() + + doSwitchover(t, dut, standbyRPBefore) + + if err := <-errCh; err != nil { + t.Logf("CreateVolume background error: %v", err) + } + }) + + t.Run("VerifyInterruption", func(t *testing.T) { + waitForSwitchover(t, dut) + + cli = containerztest.Client(t, dut) + + t.Log("Verifying volume state after interrupted creation...") + if err := verifyVolumeExists(ctx, t, cli, volName); err == nil { + t.Logf("Volume fully created despite interruption (creation was likely too fast).") + } else { + t.Logf("Volume not found after interrupted creation (expected): %v", err) + } + }) +} + // waitForSwitchover waits for the switchover to complete by polling telemetry. func waitForSwitchover(t *testing.T, dut *ondatra.DUTDevice) { t.Helper() From cf18ac7fea57962a993fc8ad129d494e967eb40a Mon Sep 17 00:00:00 2001 From: chitadi <119812932+chitadi@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:09:29 +0530 Subject: [PATCH 2/4] Update test case comments for CNTR identifiers More changes made according to numbering proposed in PR - 5562 --- .../failover/tests/supervisor_failover/failover_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/feature/container/failover/tests/supervisor_failover/failover_test.go b/feature/container/failover/tests/supervisor_failover/failover_test.go index a18fc31ce67..142c0d7962d 100644 --- a/feature/container/failover/tests/supervisor_failover/failover_test.go +++ b/feature/container/failover/tests/supervisor_failover/failover_test.go @@ -327,7 +327,7 @@ func TestContainerRemovalPersistence(t *testing.T) { }) } -// TestDoubleFailoverImagePersistence implements CNTR-3.7. +// TestDoubleFailoverImagePersistence implements CNTR-3.10. func TestDoubleFailoverImagePersistence(t *testing.T) { dut := ondatra.DUT(t, "dut") ctx := context.Background() @@ -399,7 +399,7 @@ func TestDoubleFailoverImagePersistence(t *testing.T) { }) } -// TestContainerPersistenceAfterColdReboot implements CNTR-3.8 checking container persistence after a chassis cold reboot. +// TestContainerPersistenceAfterColdReboot implements CNTR-3.21 checking container persistence after a chassis cold reboot. func TestContainerPersistenceAfterColdReboot(t *testing.T) { dut := ondatra.DUT(t, "dut") ctx := context.Background() From 9776a102a2b43ac5dce5304b96dc23e4109ab029 Mon Sep 17 00:00:00 2001 From: chitadi <119812932+chitadi@users.noreply.github.com> Date: Fri, 12 Jun 2026 14:26:42 +0530 Subject: [PATCH 3/4] Improve error handling in failover tests --- .../supervisor_failover/failover_test.go | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/feature/container/failover/tests/supervisor_failover/failover_test.go b/feature/container/failover/tests/supervisor_failover/failover_test.go index 142c0d7962d..67711d698be 100644 --- a/feature/container/failover/tests/supervisor_failover/failover_test.go +++ b/feature/container/failover/tests/supervisor_failover/failover_test.go @@ -553,7 +553,7 @@ func TestInterruptImageTransferFailover(t *testing.T) { doSwitchover(t, dut, standbyRPBefore) if err := <-errCh; err != nil { - t.Logf("PushImage returned error: %v", err) + t.Fatalf("PushImage failed to start: %v", err) } }) @@ -612,18 +612,15 @@ func TestInterruptContainerStartFailover(t *testing.T) { errCh := make(chan error, 1) go func() { _, err := cli.StartContainer(ctx, imageName, tag, "./cntrsrv", containerName) - if err != nil { - // Expected error due to switchover - errCh <- nil - return - } - errCh <- nil + errCh <- err }() doSwitchover(t, dut, standbyRPBefore) if err := <-errCh; err != nil { - t.Logf("StartContainer background error: %v", err) + if status.Code(err) != codes.Unavailable && status.Code(err) != codes.Canceled && status.Code(err) != codes.DeadlineExceeded { + t.Fatalf("StartContainer failed with unexpected error: %v", err) + } } }) @@ -675,17 +672,15 @@ func TestInterruptVolumeCreationFailover(t *testing.T) { "mountpoint": "/tmp", } _, err := cli.CreateVolume(ctx, volName, "local", nil, volOpts) - if err != nil { - errCh <- nil - return - } - errCh <- nil + errCh <- err }() doSwitchover(t, dut, standbyRPBefore) if err := <-errCh; err != nil { - t.Logf("CreateVolume background error: %v", err) + if status.Code(err) != codes.Unavailable && status.Code(err) != codes.Canceled && status.Code(err) != codes.DeadlineExceeded { + t.Fatalf("CreateVolume failed with unexpected error: %v", err) + } } }) From 1367be7a57fd9a29576f2a285f0b41b33bdc08cb Mon Sep 17 00:00:00 2001 From: chitadi <119812932+chitadi@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:22:57 +0530 Subject: [PATCH 4/4] Handle interruptions during transfer, start, and volume creation --- .../supervisor_failover/failover_test.go | 57 ++++++++++++++----- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/feature/container/failover/tests/supervisor_failover/failover_test.go b/feature/container/failover/tests/supervisor_failover/failover_test.go index 67711d698be..cf872eeaa79 100644 --- a/feature/container/failover/tests/supervisor_failover/failover_test.go +++ b/feature/container/failover/tests/supervisor_failover/failover_test.go @@ -530,6 +530,7 @@ func TestInterruptImageTransferFailover(t *testing.T) { switchoverReady := gnmi.OC().Component(standbyRPBefore).SwitchoverReady() gnmi.Await(t, dut, switchoverReady.State(), 5*time.Minute, true) + var transferInterrupted bool t.Run("InterruptTransfer", func(t *testing.T) { // Run PushImage in a background goroutine errCh := make(chan error, 1) @@ -541,7 +542,7 @@ func TestInterruptImageTransferFailover(t *testing.T) { } for prog := range progCh { if prog.Error != nil { - // We expect an error due to the connection dropping + transferInterrupted = true errCh <- nil return } @@ -563,10 +564,19 @@ func TestInterruptImageTransferFailover(t *testing.T) { cli = containerztest.Client(t, dut) // Re-initialize client t.Log("Verifying image state after interrupted transfer...") - if err := verifyImageExists(ctx, t, cli, imageName, tag); err != nil { - t.Logf("Image not found after interrupted transfer (expected): %v", err) + err := verifyImageExists(ctx, t, cli, imageName, tag) + if transferInterrupted { + if err == nil { + t.Errorf("Image unexpectedly exists despite interrupted transfer") + } else { + t.Logf("Image not found after interrupted transfer (expected): %v", err) + } } else { - t.Logf("Image fully loaded despite interruption (transfer was likely too fast).") + if err != nil { + t.Errorf("Image not found, but transfer completed successfully before switchover: %v", err) + } else { + t.Logf("Image fully loaded as expected (transfer completed before switchover).") + } } }) } @@ -608,6 +618,7 @@ func TestInterruptContainerStartFailover(t *testing.T) { switchoverReady := gnmi.OC().Component(standbyRPBefore).SwitchoverReady() gnmi.Await(t, dut, switchoverReady.State(), 5*time.Minute, true) + var startErr error t.Run("InterruptStart", func(t *testing.T) { errCh := make(chan error, 1) go func() { @@ -617,9 +628,10 @@ func TestInterruptContainerStartFailover(t *testing.T) { doSwitchover(t, dut, standbyRPBefore) - if err := <-errCh; err != nil { - if status.Code(err) != codes.Unavailable && status.Code(err) != codes.Canceled && status.Code(err) != codes.DeadlineExceeded { - t.Fatalf("StartContainer failed with unexpected error: %v", err) + startErr = <-errCh + if startErr != nil { + if status.Code(startErr) != codes.Unavailable && status.Code(startErr) != codes.Canceled && status.Code(startErr) != codes.DeadlineExceeded { + t.Fatalf("StartContainer failed with unexpected error: %v", startErr) } } }) @@ -631,10 +643,18 @@ func TestInterruptContainerStartFailover(t *testing.T) { t.Log("Verifying container state after interrupted start...") err := verifyContainerState(ctx, t, cli, containerName, cpb.ListContainerResponse_RUNNING) - if err == nil { - t.Errorf("Container unexpectedly reached RUNNING state despite switchover interruption") + if startErr != nil { + if err == nil { + t.Errorf("Container unexpectedly reached RUNNING state despite switchover interruption") + } else { + t.Logf("Container is not RUNNING (expected): %v", err) + } } else { - t.Logf("Container is not RUNNING (expected): %v", err) + if err != nil { + t.Errorf("Container is not RUNNING, but StartContainer completed successfully before switchover: %v", err) + } else { + t.Logf("Container is RUNNING as expected (StartContainer completed before switchover).") + } } }) } @@ -663,6 +683,7 @@ func TestInterruptVolumeCreationFailover(t *testing.T) { switchoverReady := gnmi.OC().Component(standbyRPBefore).SwitchoverReady() gnmi.Await(t, dut, switchoverReady.State(), 5*time.Minute, true) + var volumeInterrupted bool t.Run("InterruptVolumeCreation", func(t *testing.T) { errCh := make(chan error, 1) go func() { @@ -681,6 +702,7 @@ func TestInterruptVolumeCreationFailover(t *testing.T) { if status.Code(err) != codes.Unavailable && status.Code(err) != codes.Canceled && status.Code(err) != codes.DeadlineExceeded { t.Fatalf("CreateVolume failed with unexpected error: %v", err) } + volumeInterrupted = true } }) @@ -690,10 +712,19 @@ func TestInterruptVolumeCreationFailover(t *testing.T) { cli = containerztest.Client(t, dut) t.Log("Verifying volume state after interrupted creation...") - if err := verifyVolumeExists(ctx, t, cli, volName); err == nil { - t.Logf("Volume fully created despite interruption (creation was likely too fast).") + err := verifyVolumeExists(ctx, t, cli, volName) + if volumeInterrupted { + if err == nil { + t.Errorf("Volume unexpectedly exists despite interrupted creation") + } else { + t.Logf("Volume not found after interrupted creation (expected): %v", err) + } } else { - t.Logf("Volume not found after interrupted creation (expected): %v", err) + if err != nil { + t.Errorf("Volume not found, but CreateVolume completed successfully before switchover: %v", err) + } else { + t.Logf("Volume fully created as expected (creation completed before switchover).") + } } }) }