Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,11 @@ spec:
description: The generation observed by the MachineDeployment controller.
format: int64
type: integer
preservedFailedReplicas:
description: PreservedFailedReplicas is the number of preserved machines
in Failed phase targeted by this MachineDeployment
format: int32
type: integer
readyReplicas:
description: Total number of ready machines targeted by this MachineDeployment.
format: int32
Expand All @@ -543,7 +548,8 @@ spec:
description: |-
Total number of unavailable machines targeted by this MachineDeployment. This is the total number of
machines that are still required for the MachineDeployment to have 100% available capacity. They may
either be machines that are running but not yet available or machines that still have not been created.
either be machines that are running but not yet available, machines that still have not been created, or
machines that are preserved in Failed phase.
format: int32
type: integer
updatedReplicas:
Expand Down
5 changes: 5 additions & 0 deletions kubernetes/crds/machine.sapcloud.io_machinesets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,11 @@ spec:
by the controller.
format: int64
type: integer
preservedFailedReplicas:
description: PreservedFailedReplicas is the number of preserved replicas
in Failed phase for this replica set
format: int32
type: integer
readyReplicas:
description: The number of ready replicas for this replica set.
format: int32
Expand Down
9 changes: 8 additions & 1 deletion pkg/apis/machine/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,9 @@ type MachineSetStatus struct {
// The number of available replicas (ready for at least minReadySeconds) for this replica set.
AvailableReplicas int32

// PreservedFailedReplicas is the number of preserved replicas in Failed phase for this replica set
PreservedFailedReplicas int32

// ObservedGeneration is the most recent generation observed by the controller.
ObservedGeneration int64

Expand Down Expand Up @@ -633,9 +636,13 @@ type MachineDeploymentStatus struct {
// Total number of available machines (ready for at least minReadySeconds) targeted by this MachineDeployment.
AvailableReplicas int32

// PreservedFailedReplicas is the number of preserved machines in Failed phase targeted by this MachineDeployment
PreservedFailedReplicas int32

// Total number of unavailable machines targeted by this MachineDeployment. This is the total number of
// machines that are still required for the MachineDeployment to have 100% available capacity. They may
// either be machines that are running but not yet available or machines that still have not been created.
// either be machines that are running but not yet available, machines that still have not been created, or
// machines that are preserved in Failed phase.
UnavailableReplicas int32

// Represents the latest available observations of a MachineDeployment's current state.
Expand Down
6 changes: 5 additions & 1 deletion pkg/apis/machine/v1alpha1/machinedeployment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,13 @@ type MachineDeploymentStatus struct {
// +optional
AvailableReplicas int32 `json:"availableReplicas,omitempty"`

// PreservedFailedReplicas is the number of preserved machines in Failed phase targeted by this MachineDeployment
PreservedFailedReplicas int32 `json:"preservedFailedReplicas,omitempty"`
Comment thread
gagan16k marked this conversation as resolved.

// Total number of unavailable machines targeted by this MachineDeployment. This is the total number of
// machines that are still required for the MachineDeployment to have 100% available capacity. They may
// either be machines that are running but not yet available or machines that still have not been created.
// either be machines that are running but not yet available, machines that still have not been created, or
// machines that are preserved in Failed phase.
// +optional
UnavailableReplicas int32 `json:"unavailableReplicas,omitempty"`

Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/machine/v1alpha1/machineset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ type MachineSetStatus struct {
// +optional
AvailableReplicas int32 `json:"availableReplicas,omitempty"`

// PreservedFailedReplicas is the number of preserved replicas in Failed phase for this replica set
PreservedFailedReplicas int32 `json:"preservedFailedReplicas,omitempty"`

// ObservedGeneration is the most recent generation observed by the controller.
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
Expand Down
24 changes: 19 additions & 5 deletions pkg/controller/deployment_machineset_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ package controller
import (
"context"
"fmt"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
"reflect"

"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"

"k8s.io/klog/v2"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
Expand All @@ -50,7 +51,8 @@ func updateMachineSetStatus(ctx context.Context, machineClient machineapi.Machin
is.Generation == is.Status.ObservedGeneration &&
reflect.DeepEqual(is.Status.Conditions, newStatus.Conditions) &&
reflect.DeepEqual(is.Status.FailedMachines, newStatus.FailedMachines) &&
is.Status.AutoPreserveFailedMachineCount == newStatus.AutoPreserveFailedMachineCount {
is.Status.AutoPreserveFailedMachineCount == newStatus.AutoPreserveFailedMachineCount &&
is.Status.PreservedFailedReplicas == newStatus.PreservedFailedReplicas {
return is, nil
}

Expand All @@ -68,8 +70,9 @@ func updateMachineSetStatus(ctx context.Context, machineClient machineapi.Machin
fmt.Sprintf("fullyLabeledReplicas %d->%d, ", is.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas)+
fmt.Sprintf("readyReplicas %d->%d, ", is.Status.ReadyReplicas, newStatus.ReadyReplicas)+
fmt.Sprintf("availableReplicas %d->%d, ", is.Status.AvailableReplicas, newStatus.AvailableReplicas)+
fmt.Sprintf("sequence No: %v->%v,", is.Status.ObservedGeneration, newStatus.ObservedGeneration)+
fmt.Sprintf("autoPreserveFailedMachineCount %v->%v", is.Status.AutoPreserveFailedMachineCount, newStatus.AutoPreserveFailedMachineCount))
fmt.Sprintf("observedGeneration: %v->%v, ", is.Status.ObservedGeneration, newStatus.ObservedGeneration)+
fmt.Sprintf("autoPreserveFailedMachineCount %v->%v, ", is.Status.AutoPreserveFailedMachineCount, newStatus.AutoPreserveFailedMachineCount)+
fmt.Sprintf("preservedFailedReplicas %v->%v", is.Status.PreservedFailedReplicas, newStatus.PreservedFailedReplicas))

is.Status = newStatus
updatedIS, updateErr = c.UpdateStatus(ctx, is, metav1.UpdateOptions{})
Expand Down Expand Up @@ -102,6 +105,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
fullyLabeledReplicasCount := 0
readyReplicasCount := 0
availableReplicasCount := 0
preservedFailedReplicasCount := 0
autoPreserveFailedMachineCount := 0

failedMachines := []v1alpha1.MachineSummary{}
Expand All @@ -118,7 +122,11 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
readyReplicasCount++
}
}
if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed {

// exclude failed preserved machines from failedMachines to prevent MCD from being marked as unavailable
if isMachinePreservedAndFailed(machine) {
preservedFailedReplicasCount++
} else if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed {
machineSummary.Name = machine.Name
machineSummary.ProviderID = machine.Spec.ProviderID
machineSummary.LastOperation = machine.Status.LastOperation
Expand All @@ -129,6 +137,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
failedMachines = append(failedMachines, machineSummary)
}
// Count number of failed machines annotated with PreserveMachineAnnotationValuePreservedByMCM
// Cannot combine with above if block in case auto-preservation is not complete yet
if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM {
autoPreserveFailedMachineCount++
}
Expand Down Expand Up @@ -162,6 +171,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
newStatus.AvailableReplicas = int32(availableReplicasCount) // #nosec G115 (CWE-190) -- number of machines will not exceed MaxInt32
newStatus.LastOperation.LastUpdateTime = metav1.Now()
newStatus.AutoPreserveFailedMachineCount = int32(autoPreserveFailedMachineCount) // #nosec G115 (CWE-190) -- number of machines will not exceed MaxInt32
newStatus.PreservedFailedReplicas = int32(preservedFailedReplicasCount) // #nosec G115 (CWE-190) -- number of machines will not exceed MaxInt32
return newStatus
}

Expand Down Expand Up @@ -224,6 +234,10 @@ func isMachineAvailable(machine *v1alpha1.Machine) bool {
return false
}

func isMachinePreservedAndFailed(machine *v1alpha1.Machine) bool {
Comment thread
takoverflow marked this conversation as resolved.
return machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed && !machine.Status.CurrentStatus.PreserveExpiryTime.IsZero()
}

func isMachineReady(machine *v1alpha1.Machine) bool {
// TODO add more conditions
return machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning
Expand Down
18 changes: 10 additions & 8 deletions pkg/controller/deployment_sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -625,19 +625,21 @@ func (dc *controller) syncMachineDeploymentStatus(ctx context.Context, allISs []
func calculateDeploymentStatus(allISs []*v1alpha1.MachineSet, newIS *v1alpha1.MachineSet, deployment *v1alpha1.MachineDeployment) v1alpha1.MachineDeploymentStatus {
availableReplicas := GetAvailableReplicaCountForMachineSets(allISs)
totalReplicas := GetReplicaCountForMachineSets(allISs)
totalPreservedFailedReplicas := GetPreservedFailedReplicaCountForMachineSets(allISs)
// If unavailableReplicas is negative, then that means the Deployment has more available replicas running than
// desired, e.g. whenever it scales down. In such a case we should simply default unavailableReplicas to zero.
unavailableReplicas := max(totalReplicas-availableReplicas, 0)

status := v1alpha1.MachineDeploymentStatus{
// TODO: Ensure that if we start retrying status updates, we won't pick up a new Generation value.
ObservedGeneration: deployment.Generation,
Replicas: GetActualReplicaCountForMachineSets(allISs),
UpdatedReplicas: GetActualReplicaCountForMachineSets([]*v1alpha1.MachineSet{newIS}),
ReadyReplicas: GetReadyReplicaCountForMachineSets(allISs),
AvailableReplicas: availableReplicas,
UnavailableReplicas: unavailableReplicas,
CollisionCount: deployment.Status.CollisionCount,
ObservedGeneration: deployment.Generation,
Replicas: GetActualReplicaCountForMachineSets(allISs),
UpdatedReplicas: GetActualReplicaCountForMachineSets([]*v1alpha1.MachineSet{newIS}),
ReadyReplicas: GetReadyReplicaCountForMachineSets(allISs),
AvailableReplicas: availableReplicas,
PreservedFailedReplicas: totalPreservedFailedReplicas,
UnavailableReplicas: unavailableReplicas,
CollisionCount: deployment.Status.CollisionCount,
}
status.FailedMachines = []*v1alpha1.MachineSummary{}

Expand All @@ -654,7 +656,7 @@ func calculateDeploymentStatus(allISs []*v1alpha1.MachineSet, newIS *v1alpha1.Ma
// Copy conditions one by one so we won't mutate the original object.
status.Conditions = append(status.Conditions, deployment.Status.Conditions...)

if availableReplicas >= (deployment.Spec.Replicas)-MaxUnavailable(*deployment) {
if availableReplicas >= (deployment.Spec.Replicas)-MaxUnavailable(*deployment)-totalPreservedFailedReplicas {
Comment thread
gagan16k marked this conversation as resolved.
Outdated
minAvailability := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentAvailable, v1alpha1.ConditionTrue, MinimumReplicasAvailable, "Deployment has minimum availability.")
SetMachineDeploymentCondition(&status, *minAvailability)
} else {
Expand Down
14 changes: 13 additions & 1 deletion pkg/controller/deployment_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ package controller
import (
"context"
"fmt"
"github.com/gardener/machine-controller-manager/pkg/util/nodeops"
"maps"
"reflect"
"sort"
"strconv"
"strings"
"time"

"github.com/gardener/machine-controller-manager/pkg/util/nodeops"

v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/meta"
Expand Down Expand Up @@ -1041,6 +1042,17 @@ func GetAvailableReplicaCountForMachineSets(MachineSets []*v1alpha1.MachineSet)
return totalAvailableReplicas
}

// GetPreservedFailedReplicaCountForMachineSets returns the number of preserved failed machines corresponding to the given machine sets.
func GetPreservedFailedReplicaCountForMachineSets(MachineSets []*v1alpha1.MachineSet) int32 {
totalPreservedFailedReplicas := int32(0)
for _, is := range MachineSets {
if is != nil {
totalPreservedFailedReplicas += is.Status.PreservedFailedReplicas
}
}
return totalPreservedFailedReplicas
}

// IsRollingUpdate returns true if the strategy type is a rolling update.
func IsRollingUpdate(deployment *v1alpha1.MachineDeployment) bool {
return deployment.Spec.Strategy.Type == v1alpha1.RollingUpdateMachineDeploymentStrategyType
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/machineset.go
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,7 @@ func (c *controller) shouldFailedMachineBeTerminated(machine *v1alpha1.Machine)
// to trigger preservation of the machines, by the machine controller, up to the limit defined in the
// MachineSet's AutoPreserveFailedMachineMax field. If the AutoPreserveFailedMachineMax limit is breached, it removes the preserve=auto-preserved annotation from the oldest annotated machines.
func (c *controller) manageAutoPreservationOfFailedMachines(ctx context.Context, machines []*v1alpha1.Machine, machineSet *v1alpha1.MachineSet) []*v1alpha1.Machine {
// TODO@thiyyakat: if preservation is to be honoured across updates, capacity remaining should consider machines in all machinesets
autoPreservationCapacityRemaining := machineSet.Spec.AutoPreserveFailedMachineMax - machineSet.Status.AutoPreserveFailedMachineCount
if autoPreservationCapacityRemaining == 0 {
// no capacity remaining, nothing to do
Expand Down
35 changes: 35 additions & 0 deletions pkg/controller/machineset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1221,6 +1221,41 @@ var _ = Describe("machineset", func() {
Expect(len(machines.Items)).To(Equal(int(0)))
Expect(Err).Should(BeNil())
})

// Testcase: It should count preserved Failed Replicas and exclude these replicas from failed machines
It("should count preserved Failed Replicas and exclude these replicas from failed machines", func() {
stop := make(chan struct{})
defer close(stop)
objects := []runtime.Object{}
testPreservedFailedMachine := &machinev1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "machine-1",
Namespace: testNamespace,
UID: "1234568",
Labels: map[string]string{
"test-label": "test-label",
},
},
Status: machinev1.MachineStatus{
CurrentStatus: machinev1.CurrentStatus{
Phase: MachineFailed,
PreserveExpiryTime: &metav1.Time{Time: time.Now().Add(1 * time.Hour)},
},
},
}
objects = append(objects, testMachineSet, testPreservedFailedMachine)
c, trackers := createController(stop, testNamespace, objects, nil, nil)
defer trackers.Stop()
waitForCacheSync(stop, c)
Key := testNamespace + "/" + testMachineSet.Name
Err := c.reconcileClusterMachineSet(Key)
Expect(Err).Should(BeNil())
waitForCacheSync(stop, c)
testMachineSet, err := c.controlMachineClient.MachineSets(testNamespace).Get(context.TODO(), testMachineSet.Name, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(testMachineSet.Status.PreservedFailedReplicas).To(Equal(int32(1)))
Expect(testMachineSet.Status.FailedMachines).To(BeNil())
})
})

Describe("#claimMachines", func() {
Expand Down
5 changes: 4 additions & 1 deletion pkg/util/provider/machinecontroller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,10 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a
case "", machineutils.PreserveMachineAnnotationValueFalse:
machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false)
case machineutils.PreserveMachineAnnotationValueWhenFailed:
if !machineutils.IsMachineFailed(clone) || machineutils.IsMachinePreservationExpired(clone) {
// on timing out, remove preserve annotation to prevent incorrect re-preservation
if machineutils.IsMachinePreservationExpired(clone) {
machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true)
} else if !machineutils.IsMachineFailed(clone) {
machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false)
} else {
machineAnnotationsSynced, err = c.preserveMachine(ctx, clone, effectivePreserveValue)
Expand Down
7 changes: 4 additions & 3 deletions pkg/util/provider/machinecontroller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@ package controller
import (
"context"
"fmt"
k8stesting "k8s.io/client-go/testing"
"math"
"time"

k8stesting "k8s.io/client-go/testing"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -4283,7 +4284,7 @@ var _ = Describe("machine", func() {
retry: machineutils.LongRetry,
},
}),
Entry("when node of Failed machine is annotated with `when-failed`, should start preservation", testCase{
Entry("when node of Failed machine has annotation `when-failed`, should start preservation", testCase{
setup: setup{
nodeAnnotationValue: machineutils.PreserveMachineAnnotationValueWhenFailed,
nodeName: "node-1",
Expand Down Expand Up @@ -4460,7 +4461,7 @@ var _ = Describe("machine", func() {
},
}),
// case possible when MCM goes down and node annotation value is cleared and machine is annotated
Entry("when node and machine are found to be annotated with \"\", and 'now', respectively and last applied node perserve value is 'now', should stop preservation", testCase{
Entry("when node and machine are found to be annotated with \"\", and 'now', respectively and last applied node preserve value is 'now', should stop preservation", testCase{
setup: setup{
nodeAnnotationValue: "",
laNodePreserveValue: "now",
Expand Down
Loading