diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 108f6f1d1a..7513326011 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -644,6 +644,9 @@ func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineReque case strings.Contains(machine.Status.LastOperation.Description, machineutils.DelVolumesAttachments): return c.deleteNodeVolAttachments(ctx, deleteMachineRequest) + case strings.Contains(machine.Status.LastOperation.Description, machineutils.SetDeletionTaint): + return c.taintNode(ctx, deleteMachineRequest) + case strings.Contains(machine.Status.LastOperation.Description, machineutils.InitiateVMDeletion): return c.deleteVM(ctx, deleteMachineRequest) diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 76214eac72..44d9b337a4 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -1949,7 +1949,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Drain successful. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf("Drain successful. %s", machineutils.SetDeletionTaint), retry: machineutils.ShortRetry, nodeTerminationConditionIsSet: true, machine: newMachine( @@ -1969,7 +1969,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain successful. %s", machineutils.SetDeletionTaint), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2465,7 +2465,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Drain successful. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf("Drain successful. 
%s", machineutils.SetDeletionTaint), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -2484,7 +2484,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain successful. %s", machineutils.SetDeletionTaint), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -2567,7 +2567,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Drain successful. %s", machineutils.InitiateVMDeletion), + err: fmt.Errorf("Drain successful. %s", machineutils.SetDeletionTaint), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ @@ -2586,7 +2586,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Drain successful. %s", machineutils.SetDeletionTaint), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -3064,7 +3064,7 @@ var _ = Describe("machine", func() { LastUpdateTime: metav1.Now(), }, LastOperation: v1alpha1.LastOperation{ - Description: fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion), + Description: fmt.Sprintf("Node tainted. 
%s", machineutils.InitiateVMDeletion), State: v1alpha1.MachineStateProcessing, Type: v1alpha1.MachineOperationDelete, LastUpdateTime: metav1.Now(), @@ -3233,6 +3233,318 @@ var _ = Describe("machine", func() { ), }, }), + Entry("Set ToBedeletedByClusterAutoscaler Taint", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain successful. 
%s", machineutils.SetDeletionTaint), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeID-0", + }, + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: nil, + retry: machineutils.ShortRetry, + nodeDeleted: false, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain successful. 
%s", machineutils.SetDeletionTaint), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + }, + }), + Entry("Continue if ToBedeletedByClusterAutoscaler Taint is set", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain successful. 
%s", machineutils.SetDeletionTaint), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + nodes: []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "fakeID-0", + }, + Spec: corev1.NodeSpec{ + Taints: []corev1.Taint{ + { + Key: machineutils.TaintToBeDeleted, + Value: "gardener-machine-controller-manager", + Effect: corev1.TaintEffectPreferNoSchedule, + }, + }, + }, + }, + }, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: nil, + retry: machineutils.ShortRetry, + nodeDeleted: false, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Node tainted. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + }, + }), + Entry("Skip ToBedeletedByClusterAutoscaler Taint if node does not exist", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + machineClasses: []*v1alpha1.MachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + SecretRef: newSecretReference(objMeta, 0), + }, + }, + machines: newMachines( + 1, + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Drain successful. 
%s", machineutils.SetDeletionTaint), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + nodes: []*corev1.Node{}, + }, + action: action{ + machine: "machine-0", + fakeDriver: &driver.FakeDriver{ + VMExists: true, + ProviderID: "fakeID-0", + NodeName: "fakeNode-0", + Err: nil, + }, + }, + expect: expect{ + err: nil, + retry: machineutils.ShortRetry, + nodeDeleted: false, + machine: newMachine( + &v1alpha1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: v1alpha1.MachineSpec{ + Class: v1alpha1.ClassSpec{ + Kind: "MachineClass", + Name: "machine-0", + }, + ProviderID: "fakeID", + }, + }, + &v1alpha1.MachineStatus{ + CurrentStatus: v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineTerminating, + LastUpdateTime: metav1.Now(), + }, + LastOperation: v1alpha1.LastOperation{ + Description: fmt.Sprintf("Node does not exist. 
%s", machineutils.InitiateVMDeletion), + State: v1alpha1.MachineStateProcessing, + Type: v1alpha1.MachineOperationDelete, + LastUpdateTime: metav1.Now(), + }, + }, + nil, + map[string]string{ + machineutils.MachinePriority: "3", + }, + map[string]string{ + v1alpha1.NodeLabelKey: "fakeID-0", + }, + true, + metav1.Now(), + ), + }, + }), Entry("Delete node object successfully", &data{ setup: setup{ secrets: []*corev1.Secret{ diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 36fa6f414f..3ac464693e 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -44,6 +44,7 @@ import ( "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/status" "github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils" utilstrings "github.com/gardener/machine-controller-manager/pkg/util/strings" + taintutils "github.com/gardener/machine-controller-manager/pkg/util/taints" utiltime "github.com/gardener/machine-controller-manager/pkg/util/time" v1 "k8s.io/api/core/v1" @@ -426,7 +427,7 @@ func (c *controller) updateMachineStatusAndNodeCondition(ctx context.Context, ma }, // Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before. // This helps while computing when the drain timeout to determine if force deletion is to be triggered. - // Ref - https://github.com/gardener/machine-controller-manager/blob/rel-v0.34.0/pkg/util/provider/machinecontroller/machine_util.go#L872 + // Ref - https://github.com/gardener/machine-controller-manager/blob/97ca0de6df297c1b53ac2b66ec28120840b6906a/pkg/util/provider/machinecontroller/machine_util.go#L1621 machine.Status.CurrentStatus, machine.Status.LastKnownState, ) @@ -1395,7 +1396,7 @@ func (c *controller) updateMachineStatusAndNodeLabel(ctx context.Context, getMac }, // Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before. 
// This helps while computing when the drain timeout to determine if force deletion is to be triggered. - // Ref - https://github.com/gardener/machine-controller-manager/blob/rel-v0.34.0/pkg/util/provider/machinecontroller/machine_util.go#L872 + // Ref - https://github.com/gardener/machine-controller-manager/blob/97ca0de6df297c1b53ac2b66ec28120840b6906a/pkg/util/provider/machinecontroller/machine_util.go#L1621 getMachineStatusRequest.Machine.Status.CurrentStatus, getMachineStatusRequest.Machine.Status.LastKnownState, ) @@ -1700,7 +1701,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver if forceDeletePods { description = fmt.Sprintf("Force Drain successful. %s", machineutils.DelVolumesAttachments) } else { // regular drain already waits for vol detach and attach for another node. - description = fmt.Sprintf("Drain successful. %s", machineutils.InitiateVMDeletion) + description = fmt.Sprintf("Drain successful. %s", machineutils.SetDeletionTaint) } err = fmt.Errorf("%s", description) state = v1alpha1.MachineStateProcessing @@ -1732,7 +1733,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver }, // Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before. // This helps while computing when the drain timeout to determine if force deletion is to be triggered. - // Ref - https://github.com/gardener/machine-controller-manager/blob/rel-v0.34.0/pkg/util/provider/machinecontroller/machine_util.go#L872 + // Ref - https://github.com/gardener/machine-controller-manager/blob/97ca0de6df297c1b53ac2b66ec28120840b6906a/pkg/util/provider/machinecontroller/machine_util.go#L1621 machine.Status.CurrentStatus, machine.Status.LastKnownState, ) @@ -1744,7 +1745,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver return machineutils.ShortRetry, err } -// deleteNodeVolAttachments deletes VolumeAttachment(s) for a node before moving to VM deletion stage. 
+// deleteNodeVolAttachments deletes VolumeAttachment(s) for a node before moving to taint Node stage. func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) { var ( description string @@ -1760,11 +1761,11 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine return retryPeriod, err } // node not found move to vm deletion - description = fmt.Sprintf("Skipping deleteNodeVolAttachments due to - %s. Moving to VM Deletion. %s", err.Error(), machineutils.InitiateVMDeletion) + description = fmt.Sprintf("Skipping deleteNodeVolAttachments due to - %s. Moving to taint Node. %s", err.Error(), machineutils.SetDeletionTaint) state = v1alpha1.MachineStateProcessing retryPeriod = 0 } else if len(node.Status.VolumesAttached) == 0 { - description = fmt.Sprintf("Node Volumes for node: %s are already detached. Moving to VM Deletion. %s", nodeName, machineutils.InitiateVMDeletion) + description = fmt.Sprintf("Node Volumes for node: %s are already detached. Moving to taint Node. %s", nodeName, machineutils.SetDeletionTaint) state = v1alpha1.MachineStateProcessing retryPeriod = 0 } else { @@ -1783,7 +1784,7 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine } return retryPeriod, nil } - description = fmt.Sprintf("No Live VolumeAttachments for node: %s. Moving to VM Deletion. %s", nodeName, machineutils.InitiateVMDeletion) + description = fmt.Sprintf("No Live VolumeAttachments for node: %s. Moving to taint Node. 
%s", nodeName, machineutils.SetDeletionTaint)
 		state = v1alpha1.MachineStateProcessing
 	}
 	now := metav1.Now()
@@ -1808,6 +1809,69 @@ func (c *controller) deleteNodeVolAttachments(ctx context.Context, deleteMachine
 	return retryPeriod, err
 }
 
+// taintNode adds the deletion taint (machineutils.TaintToBeDeleted) to the node backing the
+// machine before the deletion flow moves on to VM deletion.
+//
+// Every call performs at most one step:
+//   - node lookup fails for a reason other than NotFound: the error is returned for a short retry;
+//   - node does not exist: there is nothing to taint, so InitiateVMDeletion is recorded as next step;
+//   - taint is already present: InitiateVMDeletion is recorded as next step;
+//   - taint is missing or differs: the node is updated and the machine status is left untouched,
+//     so the next reconciliation re-enters this step and then advances.
+func (c *controller) taintNode(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) {
+	var (
+		machine          = deleteMachineRequest.Machine
+		toBeDeletedTaint = v1.Taint{
+			Key:    machineutils.TaintToBeDeleted,
+			Value:  "gardener-machine-controller-manager",
+			Effect: v1.TaintEffectPreferNoSchedule,
+		}
+	)
+
+	node, err := c.nodeLister.Get(getNodeName(machine))
+	if err != nil && !apierrors.IsNotFound(err) {
+		klog.Errorf("error occurred while trying to fetch node object - err: %v", err)
+		return machineutils.ShortRetry, err
+	}
+
+	// Default for a missing node: nothing to taint, go straight to VM deletion.
+	description := fmt.Sprintf("Node does not exist. %s", machineutils.InitiateVMDeletion)
+	if err == nil && node != nil {
+		updatedNode, taintUpdated, taintErr := taintutils.AddOrUpdateTaint(node, &toBeDeletedTaint)
+		if taintErr != nil {
+			// Do not advance the flow on a taint that could not even be computed.
+			klog.Errorf("error occurred while trying to add taint %q to node %q - err: %v", toBeDeletedTaint.Key, node.Name, taintErr)
+			return machineutils.ShortRetry, taintErr
+		}
+		if taintUpdated {
+			if _, err := c.targetCoreClient.CoreV1().Nodes().Update(ctx, updatedNode, metav1.UpdateOptions{}); err != nil {
+				if apierrors.IsConflict(err) {
+					return machineutils.ConflictRetry, err
+				}
+				return machineutils.ShortRetry, err
+			}
+			return machineutils.ShortRetry, nil
+		}
+		description = fmt.Sprintf("Node tainted. %s", machineutils.InitiateVMDeletion)
+	}
+
+	return c.machineStatusUpdate(
+		ctx,
+		machine,
+		v1alpha1.LastOperation{
+			Description:    description,
+			State:          v1alpha1.MachineStateProcessing,
+			Type:           v1alpha1.MachineOperationDelete,
+			LastUpdateTime: metav1.Now(),
+		},
+		// Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before.
+		// This helps while computing when the drain timeout to determine if force deletion is to be triggered.
+		// Ref - https://github.com/gardener/machine-controller-manager/blob/97ca0de6df297c1b53ac2b66ec28120840b6906a/pkg/util/provider/machinecontroller/machine_util.go#L1621
+		machine.Status.CurrentStatus,
+		machine.Status.LastKnownState,
+	)
+}
+
 // deleteVM attempts to delete the VM backed by the machine object
 func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.RetryPeriod, error) {
 	var (
@@ -1867,7 +1931,7 @@ func (c *controller) deleteVM(ctx context.Context, deleteMachineRequest *driver.
 		},
 		// Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before.
 		// This helps while computing when the drain timeout to determine if force deletion is to be triggered.
-		// Ref - https://github.com/gardener/machine-controller-manager/blob/rel-v0.34.0/pkg/util/provider/machinecontroller/machine_util.go#L872
+		// Ref - https://github.com/gardener/machine-controller-manager/blob/97ca0de6df297c1b53ac2b66ec28120840b6906a/pkg/util/provider/machinecontroller/machine_util.go#L1621
 		machine.Status.CurrentStatus,
 		lastKnownState,
 	)
@@ -1989,7 +2053,7 @@ func (c *controller) deleteNodeObject(ctx context.Context, machine *v1alpha1.Mac
 		},
 		// Let the clone.Status.CurrentStatus (LastUpdateTime) be as it was before.
 		// This helps while computing when the drain timeout to determine if force deletion is to be triggered.
- // Ref - https://github.com/gardener/machine-controller-manager/blob/rel-v0.34.0/pkg/util/provider/machinecontroller/machine_util.go#L872 + // Ref - https://github.com/gardener/machine-controller-manager/blob/97ca0de6df297c1b53ac2b66ec28120840b6906a/pkg/util/provider/machinecontroller/machine_util.go#L1621 machine.Status.CurrentStatus, machine.Status.LastKnownState, ) diff --git a/pkg/util/provider/machineutils/utils.go b/pkg/util/provider/machineutils/utils.go index f163dc37eb..d419a53a70 100644 --- a/pkg/util/provider/machineutils/utils.go +++ b/pkg/util/provider/machineutils/utils.go @@ -23,6 +23,9 @@ const ( // InitiateDrain specifies next step as initiate node drain InitiateDrain = "Initiate node drain" + // SetDeletionTaint specifies next step as set deletion taint + SetDeletionTaint = "Set deletion taint" + // NodeReadyForUpdate specifies next step as node ready for update. NodeReadyForUpdate = "Node drain successful. Node is ready for update" @@ -84,6 +87,10 @@ const ( // indicating that a node is not yet ready to have user workload scheduled TaintNodeCriticalComponentsNotReady = "node.gardener.cloud/critical-components-not-ready" + // TaintToBeDeleted is the taint of the cluster autoscaler which is used in cloud-provider and + // kube-proxy to check if a node is getting deleted soon. + TaintToBeDeleted = "ToBeDeletedByClusterAutoscaler" + // MachineLabelKey defines the labels which contains the name of the machine of a node MachineLabelKey = "node.gardener.cloud/machine-name"