Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,11 @@ spec:
description: The generation observed by the MachineDeployment controller.
format: int64
type: integer
preservedFailedReplicas:
description: PreservedFailedReplicas is the number of preserved machines
in Failed phase targeted by this MachineDeployment
format: int32
type: integer
readyReplicas:
description: Total number of ready machines targeted by this MachineDeployment.
format: int32
Expand All @@ -543,7 +548,8 @@ spec:
description: |-
Total number of unavailable machines targeted by this MachineDeployment. This is the total number of
machines that are still required for the MachineDeployment to have 100% available capacity. They may
either be machines that are running but not yet available or machines that still have not been created.
either be machines that are running but not yet available, machines that still have not been created, or
machines that are preserved in Failed phase.
format: int32
type: integer
updatedReplicas:
Expand Down
5 changes: 5 additions & 0 deletions kubernetes/crds/machine.sapcloud.io_machinesets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,11 @@ spec:
by the controller.
format: int64
type: integer
preservedFailedReplicas:
description: PreservedFailedReplicas is the number of preserved replicas
in Failed phase for this replica set
format: int32
type: integer
readyReplicas:
description: The number of ready replicas for this replica set.
format: int32
Expand Down
9 changes: 8 additions & 1 deletion pkg/apis/machine/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,9 @@ type MachineSetStatus struct {
// The number of available replicas (ready for at least minReadySeconds) for this replica set.
AvailableReplicas int32

// PreservedFailedReplicas is the number of preserved replicas in Failed phase for this replica set
PreservedFailedReplicas int32

// ObservedGeneration is the most recent generation observed by the controller.
ObservedGeneration int64

Expand Down Expand Up @@ -633,9 +636,13 @@ type MachineDeploymentStatus struct {
// Total number of available machines (ready for at least minReadySeconds) targeted by this MachineDeployment.
AvailableReplicas int32

// PreservedFailedReplicas is the number of preserved machines in Failed phase targeted by this MachineDeployment
PreservedFailedReplicas int32

// Total number of unavailable machines targeted by this MachineDeployment. This is the total number of
// machines that are still required for the MachineDeployment to have 100% available capacity. They may
// either be machines that are running but not yet available or machines that still have not been created.
// either be machines that are running but not yet available, machines that still have not been created, or
// machines that are preserved in Failed phase.
UnavailableReplicas int32

// Represents the latest available observations of a MachineDeployment's current state.
Expand Down
6 changes: 5 additions & 1 deletion pkg/apis/machine/v1alpha1/machinedeployment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,13 @@ type MachineDeploymentStatus struct {
// +optional
AvailableReplicas int32 `json:"availableReplicas,omitempty"`

// PreservedFailedReplicas is the number of preserved machines in Failed phase targeted by this MachineDeployment
PreservedFailedReplicas int32 `json:"preservedFailedReplicas,omitempty"`
Comment thread
gagan16k marked this conversation as resolved.

// Total number of unavailable machines targeted by this MachineDeployment. This is the total number of
// machines that are still required for the MachineDeployment to have 100% available capacity. They may
// either be machines that are running but not yet available or machines that still have not been created.
// either be machines that are running but not yet available, machines that still have not been created, or
// machines that are preserved in Failed phase.
// +optional
UnavailableReplicas int32 `json:"unavailableReplicas,omitempty"`

Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/machine/v1alpha1/machineset_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ type MachineSetStatus struct {
// +optional
AvailableReplicas int32 `json:"availableReplicas,omitempty"`

// PreservedFailedReplicas is the number of preserved replicas in Failed phase for this replica set
PreservedFailedReplicas int32 `json:"preservedFailedReplicas,omitempty"`

// ObservedGeneration is the most recent generation observed by the controller.
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
Expand Down
24 changes: 19 additions & 5 deletions pkg/controller/deployment_machineset_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,10 @@ package controller
import (
"context"
"fmt"
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
"reflect"

"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"

"k8s.io/klog/v2"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
Expand All @@ -50,7 +51,8 @@ func updateMachineSetStatus(ctx context.Context, machineClient machineapi.Machin
is.Generation == is.Status.ObservedGeneration &&
reflect.DeepEqual(is.Status.Conditions, newStatus.Conditions) &&
reflect.DeepEqual(is.Status.FailedMachines, newStatus.FailedMachines) &&
is.Status.AutoPreserveFailedMachineCount == newStatus.AutoPreserveFailedMachineCount {
is.Status.AutoPreserveFailedMachineCount == newStatus.AutoPreserveFailedMachineCount &&
is.Status.PreservedFailedReplicas == newStatus.PreservedFailedReplicas {
return is, nil
}

Expand All @@ -68,8 +70,9 @@ func updateMachineSetStatus(ctx context.Context, machineClient machineapi.Machin
fmt.Sprintf("fullyLabeledReplicas %d->%d, ", is.Status.FullyLabeledReplicas, newStatus.FullyLabeledReplicas)+
fmt.Sprintf("readyReplicas %d->%d, ", is.Status.ReadyReplicas, newStatus.ReadyReplicas)+
fmt.Sprintf("availableReplicas %d->%d, ", is.Status.AvailableReplicas, newStatus.AvailableReplicas)+
fmt.Sprintf("sequence No: %v->%v,", is.Status.ObservedGeneration, newStatus.ObservedGeneration)+
fmt.Sprintf("autoPreserveFailedMachineCount %v->%v", is.Status.AutoPreserveFailedMachineCount, newStatus.AutoPreserveFailedMachineCount))
fmt.Sprintf("sequence No: %v->%v, ", is.Status.ObservedGeneration, newStatus.ObservedGeneration)+
Comment thread
takoverflow marked this conversation as resolved.
Outdated
fmt.Sprintf("autoPreserveFailedMachineCount %v->%v, ", is.Status.AutoPreserveFailedMachineCount, newStatus.AutoPreserveFailedMachineCount)+
fmt.Sprintf("preservedFailedReplicas %v->%v", is.Status.PreservedFailedReplicas, newStatus.PreservedFailedReplicas))

is.Status = newStatus
updatedIS, updateErr = c.UpdateStatus(ctx, is, metav1.UpdateOptions{})
Expand Down Expand Up @@ -102,6 +105,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
fullyLabeledReplicasCount := 0
readyReplicasCount := 0
availableReplicasCount := 0
preservedFailedReplicasCount := 0
autoPreserveFailedMachineCount := 0

failedMachines := []v1alpha1.MachineSummary{}
Expand All @@ -118,7 +122,9 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
readyReplicasCount++
}
}
if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed {
isMachinePreserved := !machine.Status.CurrentStatus.PreserveExpiryTime.IsZero()
// exclude failed preserved machines from failedMachines to prevent MCD from being marked as unavailable
if machine.Status.LastOperation.State == v1alpha1.MachineStateFailed && !isMachinePreserved {
Comment thread
takoverflow marked this conversation as resolved.
Outdated
machineSummary.Name = machine.Name
machineSummary.ProviderID = machine.Spec.ProviderID
machineSummary.LastOperation = machine.Status.LastOperation
Expand All @@ -127,8 +133,11 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
machineSummary.OwnerRef = controller.Name
}
failedMachines = append(failedMachines, machineSummary)
} else if isMachinePreserved && machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed {
Comment thread
takoverflow marked this conversation as resolved.
Outdated
preservedFailedReplicasCount++
}
// Count number of failed machines annotated with PreserveMachineAnnotationValuePreservedByMCM
// Cannot combine with above if block in case auto-preservation is not complete yet
if machine.Annotations[machineutils.PreserveMachineAnnotationKey] == machineutils.PreserveMachineAnnotationValuePreservedByMCM {
autoPreserveFailedMachineCount++
}
Expand Down Expand Up @@ -162,6 +171,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
newStatus.AvailableReplicas = int32(availableReplicasCount) // #nosec G115 (CWE-190) -- number of machines will not exceed MaxInt32
newStatus.LastOperation.LastUpdateTime = metav1.Now()
newStatus.AutoPreserveFailedMachineCount = int32(autoPreserveFailedMachineCount) // #nosec G115 (CWE-190) -- number of machines will not exceed MaxInt32
newStatus.PreservedFailedReplicas = int32(preservedFailedReplicasCount) // #nosec G115 (CWE-190) -- number of machines will not exceed MaxInt32
return newStatus
}

Expand Down Expand Up @@ -224,6 +234,10 @@ func isMachineAvailable(machine *v1alpha1.Machine) bool {
return false
}

func isMachinePreservedAndFailed(machine *v1alpha1.Machine) bool {
Comment thread
takoverflow marked this conversation as resolved.
return machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed && !machine.Status.CurrentStatus.PreserveExpiryTime.IsZero()
}

func isMachineReady(machine *v1alpha1.Machine) bool {
// TODO add more conditions
return machine.Status.CurrentStatus.Phase == v1alpha1.MachineRunning
Expand Down
18 changes: 10 additions & 8 deletions pkg/controller/deployment_sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -625,19 +625,21 @@ func (dc *controller) syncMachineDeploymentStatus(ctx context.Context, allISs []
func calculateDeploymentStatus(allISs []*v1alpha1.MachineSet, newIS *v1alpha1.MachineSet, deployment *v1alpha1.MachineDeployment) v1alpha1.MachineDeploymentStatus {
availableReplicas := GetAvailableReplicaCountForMachineSets(allISs)
totalReplicas := GetReplicaCountForMachineSets(allISs)
totalPreservedFailedReplicas := GetPreservedFailedReplicaCountForMachineSets(allISs)
// If unavailableReplicas is negative, then that means the Deployment has more available replicas running than
// desired, e.g. whenever it scales down. In such a case we should simply default unavailableReplicas to zero.
unavailableReplicas := max(totalReplicas-availableReplicas, 0)

status := v1alpha1.MachineDeploymentStatus{
// TODO: Ensure that if we start retrying status updates, we won't pick up a new Generation value.
ObservedGeneration: deployment.Generation,
Replicas: GetActualReplicaCountForMachineSets(allISs),
UpdatedReplicas: GetActualReplicaCountForMachineSets([]*v1alpha1.MachineSet{newIS}),
ReadyReplicas: GetReadyReplicaCountForMachineSets(allISs),
AvailableReplicas: availableReplicas,
UnavailableReplicas: unavailableReplicas,
CollisionCount: deployment.Status.CollisionCount,
ObservedGeneration: deployment.Generation,
Replicas: GetActualReplicaCountForMachineSets(allISs),
UpdatedReplicas: GetActualReplicaCountForMachineSets([]*v1alpha1.MachineSet{newIS}),
ReadyReplicas: GetReadyReplicaCountForMachineSets(allISs),
AvailableReplicas: availableReplicas,
PreservedFailedReplicas: totalPreservedFailedReplicas,
UnavailableReplicas: unavailableReplicas,
CollisionCount: deployment.Status.CollisionCount,
}
status.FailedMachines = []*v1alpha1.MachineSummary{}

Expand All @@ -654,7 +656,7 @@ func calculateDeploymentStatus(allISs []*v1alpha1.MachineSet, newIS *v1alpha1.Ma
// Copy conditions one by one so we won't mutate the original object.
status.Conditions = append(status.Conditions, deployment.Status.Conditions...)

if availableReplicas >= (deployment.Spec.Replicas)-MaxUnavailable(*deployment) {
if availableReplicas >= (deployment.Spec.Replicas)-MaxUnavailable(*deployment)-totalPreservedFailedReplicas {
Comment thread
gagan16k marked this conversation as resolved.
Outdated
minAvailability := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentAvailable, v1alpha1.ConditionTrue, MinimumReplicasAvailable, "Deployment has minimum availability.")
SetMachineDeploymentCondition(&status, *minAvailability)
} else {
Expand Down
14 changes: 13 additions & 1 deletion pkg/controller/deployment_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ package controller
import (
"context"
"fmt"
"github.com/gardener/machine-controller-manager/pkg/util/nodeops"
"maps"
"reflect"
"sort"
"strconv"
"strings"
"time"

"github.com/gardener/machine-controller-manager/pkg/util/nodeops"

v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/meta"
Expand Down Expand Up @@ -1041,6 +1042,17 @@ func GetAvailableReplicaCountForMachineSets(MachineSets []*v1alpha1.MachineSet)
return totalAvailableReplicas
}

// GetPreservedFailedReplicaCountForMachineSets returns the number of available machines corresponding to the given machine sets.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Docstring needs to be corrected

func GetPreservedFailedReplicaCountForMachineSets(MachineSets []*v1alpha1.MachineSet) int32 {
totalPreservedFailedReplicas := int32(0)
for _, is := range MachineSets {
if is != nil {
totalPreservedFailedReplicas += is.Status.PreservedFailedReplicas
}
}
return totalPreservedFailedReplicas
}

// IsRollingUpdate returns true if the strategy type is a rolling update.
func IsRollingUpdate(deployment *v1alpha1.MachineDeployment) bool {
return deployment.Spec.Strategy.Type == v1alpha1.RollingUpdateMachineDeploymentStrategyType
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/machineset.go
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,7 @@ func (c *controller) shouldFailedMachineBeTerminated(machine *v1alpha1.Machine)
// to trigger preservation of the machines, by the machine controller, up to the limit defined in the
// MachineSet's AutoPreserveFailedMachineMax field. If the AutoPreserveFailedMachineMax limit is breached, it removes the preserve=auto-preserved annotation from the oldest annotated machines.
func (c *controller) manageAutoPreservationOfFailedMachines(ctx context.Context, machines []*v1alpha1.Machine, machineSet *v1alpha1.MachineSet) []*v1alpha1.Machine {
// TODO@thiyyakat: if preservation is to be honoured across updates, capacity remaining should consider machines in all machinesets
autoPreservationCapacityRemaining := machineSet.Spec.AutoPreserveFailedMachineMax - machineSet.Status.AutoPreserveFailedMachineCount
if autoPreservationCapacityRemaining == 0 {
// no capacity remaining, nothing to do
Expand Down
35 changes: 35 additions & 0 deletions pkg/controller/machineset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1221,6 +1221,41 @@ var _ = Describe("machineset", func() {
Expect(len(machines.Items)).To(Equal(int(0)))
Expect(Err).Should(BeNil())
})

// Testcase: It should count preserved Failed Replicas and exclude these replicas from failed machines
It("should count preserved Failed Replicas and exclude these replicas from failed machines", func() {
stop := make(chan struct{})
defer close(stop)
objects := []runtime.Object{}
testPreservedFailedMachine := &machinev1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "machine-1",
Namespace: testNamespace,
UID: "1234568",
Labels: map[string]string{
"test-label": "test-label",
},
},
Status: machinev1.MachineStatus{
CurrentStatus: machinev1.CurrentStatus{
Phase: MachineFailed,
PreserveExpiryTime: &metav1.Time{Time: time.Now().Add(1 * time.Hour)},
},
},
}
objects = append(objects, testMachineSet, testPreservedFailedMachine)
c, trackers := createController(stop, testNamespace, objects, nil, nil)
defer trackers.Stop()
waitForCacheSync(stop, c)
Key := testNamespace + "/" + testMachineSet.Name
Err := c.reconcileClusterMachineSet(Key)
Expect(Err).Should(BeNil())
waitForCacheSync(stop, c)
testMachineSet, err := c.controlMachineClient.MachineSets(testNamespace).Get(context.TODO(), testMachineSet.Name, metav1.GetOptions{})
Expect(err).ToNot(HaveOccurred())
Expect(testMachineSet.Status.PreservedFailedReplicas).To(Equal(int32(1)))
Expect(testMachineSet.Status.FailedMachines).To(BeNil())
})
})

Describe("#claimMachines", func() {
Expand Down
5 changes: 4 additions & 1 deletion pkg/util/provider/machinecontroller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -794,7 +794,10 @@ func (c *controller) manageMachinePreservation(ctx context.Context, machine *v1a
case "", machineutils.PreserveMachineAnnotationValueFalse:
machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false)
case machineutils.PreserveMachineAnnotationValueWhenFailed:
if !machineutils.IsMachineFailed(clone) || machineutils.IsMachinePreservationExpired(clone) {
// on timing out, remove preserve annotation to prevent incorrect re-preservation
if machineutils.IsMachinePreservationExpired(clone) {
machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, true)
} else if !machineutils.IsMachineFailed(clone) {
machineAnnotationsSynced, err = c.stopPreservationIfActive(ctx, clone, false)
} else {
machineAnnotationsSynced, err = c.preserveMachine(ctx, clone, effectivePreserveValue)
Expand Down
Loading