diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index 8f0741647dd4..ac27a96c58fd 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -19,6 +19,7 @@ package controllers import ( "context" "fmt" + "slices" "sort" "strings" "time" @@ -456,6 +457,11 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl } if err := r.syncMachines(ctx, controlPlane); err != nil { + // Note: If any of the calls got a NotFound error, it means that at least one Machine got deleted. + // Let's return here so that the next Reconcile will get the updated list of Machines. + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil // Note: Requeue is not needed, changes to Machines trigger another reconcile. + } return ctrl.Result{}, errors.Wrap(err, "failed to sync Machines") } @@ -504,7 +510,9 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl // Note: We have to wait here even if there are no more Machines that need rollout (in-place update in // progress is not counted as needs rollout). if machines := controlPlane.MachinesToCompleteInPlaceUpdate(); machines.Len() > 0 { - log.Info("Waiting for in-place update to complete", "machines", strings.Join(machines.Names(), ", ")) + for _, machine := range machines { + log.Info(fmt.Sprintf("Waiting for in-place update of Machine %s to complete", machine.Name), "Machine", klog.KObj(machine)) + } return ctrl.Result{}, nil // Note: Changes to Machines trigger another reconcile. } @@ -513,10 +521,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl switch { case len(machinesNeedingRollout) > 0: var allMessages []string - for machine, machineUpToDateResult := range machinesUpToDateResults { - allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", machine, strings.Join(machineUpToDateResult.LogMessages, ","))) + machinesNeedingRolloutNames := machinesNeedingRollout.Names() + slices.Sort(machinesNeedingRolloutNames) + for _, name := range machinesNeedingRolloutNames { + allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", name, strings.Join(machinesUpToDateResults[name].LogMessages, ", "))) } - log.Info(fmt.Sprintf("Rolling out Control Plane machines: %s", strings.Join(allMessages, ",")), "machinesNeedingRollout", machinesNeedingRollout.Names()) + log.Info(fmt.Sprintf("Machines need rollout: %s", strings.Join(machinesNeedingRolloutNames, ",")), "reason", strings.Join(allMessages, ", ")) v1beta1conditions.MarkFalse(controlPlane.KCP, controlplanev1.MachinesSpecUpToDateV1Beta1Condition, controlplanev1.RollingUpdateInProgressV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Rolling %d replicas with outdated spec (%d replicas up to date)", len(machinesNeedingRollout), len(controlPlane.Machines)-len(machinesNeedingRollout)) return r.updateControlPlane(ctx, controlPlane, machinesNeedingRollout, machinesUpToDateResults) default: @@ -1042,10 +1052,15 @@ func reconcileMachineUpToDateCondition(_ context.Context, controlPlane *internal } if inplace.IsUpdateInProgress(machine) { + msg := "* In-place update in progress" + if c := conditions.Get(machine, clusterv1.MachineUpdatingCondition); c != nil && c.Status == metav1.ConditionTrue && c.Message != "" { + msg = fmt.Sprintf("* %s", c.Message) + } conditions.Set(machine, metav1.Condition{ - Type: clusterv1.MachineUpToDateCondition, - Status: 
metav1.ConditionFalse, - Reason: clusterv1.MachineUpToDateUpdatingReason, + Type: clusterv1.MachineUpToDateCondition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineUpToDateUpdatingReason, + Message: msg, }) continue } diff --git a/controlplane/kubeadm/internal/controllers/controller_test.go b/controlplane/kubeadm/internal/controllers/controller_test.go index 9e77ea3c8a1a..d5a7d8c13c0b 100644 --- a/controlplane/kubeadm/internal/controllers/controller_test.go +++ b/controlplane/kubeadm/internal/controllers/controller_test.go @@ -2400,9 +2400,10 @@ func TestKubeadmControlPlaneReconciler_reconcileControlPlaneAndMachinesCondition Message: "Waiting for a Node with spec.providerID foo to exist", }, { - Type: clusterv1.MachineUpToDateCondition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineUpToDateUpdatingReason, + Type: clusterv1.MachineUpToDateCondition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineUpToDateUpdatingReason, + Message: "* In-place update in progress", }, }, }, diff --git a/controlplane/kubeadm/internal/controllers/inplace_canupdatemachine.go b/controlplane/kubeadm/internal/controllers/inplace_canupdatemachine.go index 73cb3deb45d3..503919c75059 100644 --- a/controlplane/kubeadm/internal/controllers/inplace_canupdatemachine.go +++ b/controlplane/kubeadm/internal/controllers/inplace_canupdatemachine.go @@ -45,7 +45,7 @@ func (r *KubeadmControlPlaneReconciler) canUpdateMachine(ctx context.Context, ma return r.overrideCanUpdateMachineFunc(ctx, machine, machineUpToDateResult) } - log := ctrl.LoggerFrom(ctx) + log := ctrl.LoggerFrom(ctx).WithValues("Machine", klog.KObj(machine)) // Machine cannot be updated in-place if the feature gate is not enabled. if !feature.Gates.Enabled(feature.InPlaceUpdates) { @@ -79,7 +79,7 @@ func (r *KubeadmControlPlaneReconciler) canUpdateMachine(ctx context.Context, ma return false, err } if !canUpdateMachine { - log.Info(fmt.Sprintf("Machine cannot be updated in-place by extensions: %s", strings.Join(reasons, ",")), "Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Machine %s cannot be updated in-place by extensions", machine.Name), "reason", strings.Join(reasons, ",")) return false, nil } return true, nil diff --git a/controlplane/kubeadm/internal/controllers/inplace_trigger.go b/controlplane/kubeadm/internal/controllers/inplace_trigger.go index e16ea5230d90..aa9c863553b5 100644 --- a/controlplane/kubeadm/internal/controllers/inplace_trigger.go +++ b/controlplane/kubeadm/internal/controllers/inplace_trigger.go @@ -40,8 +40,8 @@ func (r *KubeadmControlPlaneReconciler) triggerInPlaceUpdate(ctx context.Context return r.overrideTriggerInPlaceUpdate(ctx, machine, machineUpToDateResult) } - log := ctrl.LoggerFrom(ctx) - log.Info("Triggering in-place update", "Machine", klog.KObj(machine)) + log := ctrl.LoggerFrom(ctx).WithValues("Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Triggering in-place update for Machine %s", machine.Name)) // Mark Machine for in-place update. // Note: Once we write UpdateInProgressAnnotation we will always continue with the in-place update. 
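// Editor's note: the following is an illustrative sketch, not part of the diff above.
// The hunks in inplace_canupdatemachine.go and inplace_trigger.go bind the "Machine"
// key once on the logger via WithValues instead of repeating it on every log call,
// while still naming the Machine in the human-readable message. A minimal standalone
// example of that logging pattern (the function and its arguments are hypothetical):
package example

import (
	"context"
	"fmt"

	"k8s.io/klog/v2"
	ctrl "sigs.k8s.io/controller-runtime"

	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
)

func logInPlaceUpdate(ctx context.Context, machine *clusterv1.Machine) {
	// Bind the structured key once; every line emitted from this logger carries it.
	log := ctrl.LoggerFrom(ctx).WithValues("Machine", klog.KObj(machine))

	// Keep the Machine name in the message too, so plain-text log output stays searchable.
	log.Info(fmt.Sprintf("Triggering in-place update for Machine %s", machine.Name))
	log.Info(fmt.Sprintf("Completed triggering in-place update for Machine %s", machine.Name))
}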
@@ -133,7 +133,7 @@ func (r *KubeadmControlPlaneReconciler) triggerInPlaceUpdate(ctx context.Context return errors.Wrapf(err, "failed to complete triggering in-place update for Machine %s", klog.KObj(machine)) } - log.Info("Completed triggering in-place update", "Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Completed triggering in-place update for Machine %s", machine.Name)) r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulStartInPlaceUpdate", "Machine starting in-place update") // Wait until the cache observed the Machine with PendingHooksAnnotation to ensure subsequent reconciles diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index 90af3f894a07..4c310a12e18d 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -53,7 +53,7 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte } log.WithValues(controlPlane.StatusToLogKeyAndValues(newMachine, nil)...). - Info("Machine created (init)", + Info(fmt.Sprintf("Machine %s created (init)", newMachine.Name), "Machine", klog.KObj(newMachine), newMachine.Spec.InfrastructureRef.Kind, klog.KRef(newMachine.Namespace, newMachine.Spec.InfrastructureRef.Name), newMachine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(newMachine.Namespace, newMachine.Spec.Bootstrap.ConfigRef.Name)) @@ -87,7 +87,7 @@ func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, } log.WithValues(controlPlane.StatusToLogKeyAndValues(newMachine, nil)...). - Info("Machine created (scale up)", + Info(fmt.Sprintf("Machine %s created (scale up)", newMachine.Name), "Machine", klog.KObj(newMachine), newMachine.Spec.InfrastructureRef.Kind, klog.KRef(newMachine.Namespace, newMachine.Spec.InfrastructureRef.Name), newMachine.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(newMachine.Namespace, newMachine.Spec.Bootstrap.ConfigRef.Name)) @@ -144,7 +144,7 @@ func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane( // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). log.WithValues(controlPlane.StatusToLogKeyAndValues(nil, machineToDelete)...). - Info("Deleting Machine (scale down)", "Machine", klog.KObj(machineToDelete)) + Info(fmt.Sprintf("Machine %s deleting (scale down)", machineToDelete.Name), "Machine", klog.KObj(machineToDelete)) // Requeue the control plane, in case there are additional operations to perform return ctrl.Result{Requeue: true}, nil diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index e757697b31be..304937e4de00 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -668,7 +668,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result // We only delete the node after the underlying infrastructure is gone. 
// https://github.com/kubernetes-sigs/cluster-api/issues/2565 if isDeleteNodeAllowed { - log.Info("Deleting node", "Node", klog.KRef("", m.Status.NodeRef.Name)) + log.Info("Deleting Node", "Node", klog.KRef("", m.Status.NodeRef.Name)) var deleteNodeErr error waitErr := wait.PollUntilContextTimeout(ctx, 2*time.Second, r.nodeDeletionRetryTimeout, true, func(ctx context.Context) (bool, error) { @@ -678,9 +678,9 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result return true, nil }) if waitErr != nil { - log.Error(deleteNodeErr, "Timed out deleting node", "Node", klog.KRef("", m.Status.NodeRef.Name)) + log.Error(deleteNodeErr, "Timed out deleting Node", "Node", klog.KRef("", m.Status.NodeRef.Name)) v1beta1conditions.MarkFalse(m, clusterv1.MachineNodeHealthyV1Beta1Condition, clusterv1.DeletionFailedV1Beta1Reason, clusterv1.ConditionSeverityWarning, "") - r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's node: %v", deleteNodeErr) + r.recorder.Eventf(m, corev1.EventTypeWarning, "FailedDeleteNode", "error deleting Machine's Node: %v", deleteNodeErr) // If the node deletion timeout is not expired yet, requeue the Machine for reconciliation. if m.Spec.Deletion.NodeDeletionTimeoutSeconds == nil || *m.Spec.Deletion.NodeDeletionTimeoutSeconds == 0 || m.DeletionTimestamp.Add(time.Duration(*m.Spec.Deletion.NodeDeletionTimeoutSeconds)*time.Second).After(time.Now()) { diff --git a/internal/controllers/machine/machine_controller_inplace_update.go b/internal/controllers/machine/machine_controller_inplace_update.go index ca9654868a54..9671a8140df2 100644 --- a/internal/controllers/machine/machine_controller_inplace_update.go +++ b/internal/controllers/machine/machine_controller_inplace_update.go @@ -66,7 +66,7 @@ func (r *Reconciler) reconcileInPlaceUpdate(ctx context.Context, s *scope) (ctrl // If hook is not pending, we're waiting for the owner controller to mark it as pending. if !hasUpdateMachinePending { - log.Info("In-place update annotation is set, waiting for UpdateMachine hook to be marked as pending") + log.Info("Machine marked for in-place update, waiting for owning controller to mark UpdateMachine hook as pending") return ctrl.Result{}, nil } @@ -110,7 +110,6 @@ func (r *Reconciler) reconcileInPlaceUpdate(ctx context.Context, s *scope) (ctrl return result, nil } - log.Info("In-place update completed successfully") if err := r.completeInPlaceUpdate(ctx, s); err != nil { return ctrl.Result{}, errors.Wrap(err, "failed to complete in-place update") } @@ -144,11 +143,9 @@ func (r *Reconciler) callUpdateMachineHook(ctx context.Context, s *scope) (ctrl. if err != nil { return ctrl.Result{}, "", err } - if len(extensions) == 0 { return ctrl.Result{}, "", errors.New("no extensions registered for UpdateMachine hook") } - if len(extensions) > 1 { return ctrl.Result{}, "", errors.Errorf("found multiple UpdateMachine hooks (%s): only one hook is supported", strings.Join(extensions, ",")) } @@ -208,7 +205,7 @@ func (r *Reconciler) completeInPlaceUpdate(ctx context.Context, s *scope) error return err } - log.Info("In-place update completed!") + log.Info("Completed in-place update") return nil } @@ -224,6 +221,8 @@ func (r *Reconciler) removeInPlaceUpdateAnnotation(ctx context.Context, obj clie return errors.Wrapf(err, "failed to remove %s annotation from object %s", clusterv1.UpdateInProgressAnnotation, klog.KObj(obj)) } + // Note: DeepCopy object to not modify the passed-in object which can lead to conflict errors later on. 
+ obj = obj.DeepCopyObject().(client.Object) orig := obj.DeepCopyObject().(client.Object) delete(annotations, clusterv1.UpdateInProgressAnnotation) obj.SetAnnotations(annotations) diff --git a/internal/controllers/machine/machine_controller_noderef.go b/internal/controllers/machine/machine_controller_noderef.go index 08f0757066e9..04683f6d6ed2 100644 --- a/internal/controllers/machine/machine_controller_noderef.go +++ b/internal/controllers/machine/machine_controller_noderef.go @@ -70,7 +70,6 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, // Check that the Machine has a valid ProviderID. if machine.Spec.ProviderID == "" { - log.Info("Waiting for infrastructure provider to report spec.providerID", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name)) v1beta1conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyV1Beta1Condition, clusterv1.WaitingForNodeRefV1Beta1Reason, clusterv1.ConditionSeverityInfo, "") return ctrl.Result{}, nil } @@ -96,7 +95,7 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, return ctrl.Result{}, errors.Wrapf(err, "no matching Node for Machine %q in namespace %q", machine.Name, machine.Namespace) } v1beta1conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyV1Beta1Condition, clusterv1.NodeProvisioningV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Waiting for a node with matching ProviderID to exist") - log.Info("Infrastructure provider reporting spec.providerID, matching Kubernetes node is not yet available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", machine.Spec.ProviderID) + log.Info("Infrastructure provider reporting spec.providerID, matching Kubernetes Node is not yet available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", machine.Spec.ProviderID) // No need to requeue here. Nodes emit an event that triggers reconciliation. 
return ctrl.Result{}, nil } @@ -112,7 +111,7 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result, machine.Status.NodeRef = clusterv1.MachineNodeReference{ Name: s.node.Name, } - log.Info("Infrastructure provider reporting spec.providerID, Kubernetes node is now available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", machine.Spec.ProviderID, "Node", klog.KRef("", machine.Status.NodeRef.Name)) + log.Info("Infrastructure provider reporting spec.providerID, Kubernetes Node is now available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", machine.Spec.ProviderID, "Node", klog.KRef("", machine.Status.NodeRef.Name)) r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name) } diff --git a/internal/controllers/machine/machine_controller_phases.go b/internal/controllers/machine/machine_controller_phases.go index 27380dce14a2..d4fe4608a6d1 100644 --- a/internal/controllers/machine/machine_controller_phases.go +++ b/internal/controllers/machine/machine_controller_phases.go @@ -37,6 +37,7 @@ import ( capierrors "sigs.k8s.io/cluster-api/errors" "sigs.k8s.io/cluster-api/internal/contract" "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/conditions" v1beta1conditions "sigs.k8s.io/cluster-api/util/conditions/deprecated/v1beta1" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" @@ -212,9 +213,12 @@ func (r *Reconciler) reconcileBootstrap(ctx context.Context, s *scope) (ctrl.Res // If the data secret was not created yet, return. if !dataSecretCreated { - log.Info(fmt.Sprintf("Waiting for bootstrap provider to generate data secret and set %s", - contract.Bootstrap().DataSecretCreated(contractVersion).Path().String()), - s.bootstrapConfig.GetKind(), klog.KObj(s.bootstrapConfig)) + // Only log if the Machine is a control plane Machine or the Cluster is already initialized to reduce noise. + if util.IsControlPlaneMachine(m) || conditions.IsTrue(s.cluster, clusterv1.ClusterControlPlaneInitializedCondition) { + log.Info(fmt.Sprintf("Waiting for bootstrap provider to generate data secret and set %s", + contract.Bootstrap().DataSecretCreated(contractVersion).Path().String()), + s.bootstrapConfig.GetKind(), klog.KObj(s.bootstrapConfig)) + } return ctrl.Result{}, nil } @@ -309,21 +313,27 @@ func (r *Reconciler) reconcileInfrastructure(ctx context.Context, s *scope) (ctr // If the InfrastructureMachine is not provisioned (and it wasn't already provisioned before), return. if !provisioned && !ptr.Deref(m.Status.Initialization.InfrastructureProvisioned, false) { - log.Info(fmt.Sprintf("Waiting for infrastructure provider to create machine infrastructure and set %s", - contract.InfrastructureMachine().Provisioned(contractVersion).Path().String()), - s.infraMachine.GetKind(), klog.KObj(s.infraMachine)) + // Only log if the Machine is a control plane Machine or the Cluster is already initialized to reduce noise. 
+ if util.IsControlPlaneMachine(m) || conditions.IsTrue(s.cluster, clusterv1.ClusterControlPlaneInitializedCondition) { + log.Info(fmt.Sprintf("Waiting for infrastructure provider to set %s on %s", + contract.InfrastructureMachine().Provisioned(contractVersion).Path().String(), s.infraMachine.GetKind()), + s.infraMachine.GetKind(), klog.KObj(s.infraMachine)) + } return ctrl.Result{}, nil } // Get providerID from the InfrastructureMachine (intentionally not setting it on the Machine yet). - var providerID *string - if providerID, err = contract.InfrastructureMachine().ProviderID().Get(s.infraMachine); err != nil { - return ctrl.Result{}, errors.Wrapf(err, "failed to read providerID from %s %s", - s.infraMachine.GetKind(), klog.KObj(s.infraMachine)) - } else if *providerID == "" { - return ctrl.Result{}, errors.Errorf("got empty %s field from %s %s", + providerID, err := contract.InfrastructureMachine().ProviderID().Get(s.infraMachine) + switch { + case err != nil && !errors.Is(err, contract.ErrFieldNotFound): + return ctrl.Result{}, errors.Wrapf(err, "failed to read %s from %s %s", contract.InfrastructureMachine().ProviderID().Path().String(), s.infraMachine.GetKind(), klog.KObj(s.infraMachine)) + case ptr.Deref(providerID, "") == "": + log.Info(fmt.Sprintf("Waiting for infrastructure provider to set %s on %s", + contract.InfrastructureMachine().ProviderID().Path().String(), s.infraMachine.GetKind()), + s.infraMachine.GetKind(), klog.KObj(s.infraMachine)) + return ctrl.Result{}, nil // Note: Requeue is not needed, changes to InfraMachine trigger another reconcile. } // Get and set addresses from the InfrastructureMachine. diff --git a/internal/controllers/machine/machine_controller_phases_test.go b/internal/controllers/machine/machine_controller_phases_test.go index 1bb858635f89..10d30fe5b2ce 100644 --- a/internal/controllers/machine/machine_controller_phases_test.go +++ b/internal/controllers/machine/machine_controller_phases_test.go @@ -680,7 +680,7 @@ func TestReconcileInfrastructure(t *testing.T) { }, }, { - name: "infra machine ready and no provider ID, it should fail", + name: "should do nothing if infra machine ready and no provider ID", contract: "v1beta1", machine: defaultMachine.DeepCopy(), infraMachine: map[string]interface{}{ @@ -697,7 +697,7 @@ func TestReconcileInfrastructure(t *testing.T) { }, infraMachineGetError: nil, expectResult: ctrl.Result{}, - expectError: true, + expectError: false, }, { name: "should never revert back to infrastructure not ready", diff --git a/internal/controllers/machine/machine_controller_status.go b/internal/controllers/machine/machine_controller_status.go index 145458b37245..5b2970e6e44f 100644 --- a/internal/controllers/machine/machine_controller_status.go +++ b/internal/controllers/machine/machine_controller_status.go @@ -642,6 +642,9 @@ func setUpdatingCondition(_ context.Context, machine *clusterv1.Machine, updatin if updatingReason == "" { updatingReason = clusterv1.MachineInPlaceUpdatingReason } + if updatingMessage == "" { + updatingMessage = "In-place update in progress" + } conditions.Set(machine, metav1.Condition{ Type: clusterv1.MachineUpdatingCondition, Status: metav1.ConditionTrue, @@ -714,11 +717,16 @@ func setUpToDateCondition(_ context.Context, m *clusterv1.Machine, ms *clusterv1 return } - if conditions.IsTrue(m, clusterv1.MachineUpdatingCondition) { + if c := conditions.Get(m, clusterv1.MachineUpdatingCondition); c != nil && c.Status == metav1.ConditionTrue { + msg := "* In-place update in progress" + if c.Message != "" { + msg 
= fmt.Sprintf("* %s", c.Message) + } conditions.Set(m, metav1.Condition{ - Type: clusterv1.MachineUpToDateCondition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineUpToDateUpdatingReason, + Type: clusterv1.MachineUpToDateCondition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineUpToDateUpdatingReason, + Message: msg, }) return } diff --git a/internal/controllers/machine/machine_controller_status_test.go b/internal/controllers/machine/machine_controller_status_test.go index 56872c1d62c4..fa8e7dc5a13c 100644 --- a/internal/controllers/machine/machine_controller_status_test.go +++ b/internal/controllers/machine/machine_controller_status_test.go @@ -1579,9 +1579,49 @@ func TestSetUpToDateCondition(t *testing.T) { }, }, expectCondition: &metav1.Condition{ - Type: clusterv1.MachineUpToDateCondition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineUpToDateUpdatingReason, + Type: clusterv1.MachineUpToDateCondition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineUpToDateUpdatingReason, + Message: "* In-place update in progress", + }, + }, + { + name: "updating (message from Updating)", + machineDeployment: &clusterv1.MachineDeployment{ + Spec: clusterv1.MachineDeploymentSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: "v1.31.0", + }, + }, + }, + }, + machineSet: &clusterv1.MachineSet{ + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: "v1.31.0", + }, + }, + }, + }, + machine: &clusterv1.Machine{ + Status: clusterv1.MachineStatus{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineUpdatingCondition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineInPlaceUpdatingReason, + Message: "In-place update in progress: Extension is updating Machine", + }, + }, + }, + }, + expectCondition: &metav1.Condition{ + Type: clusterv1.MachineUpToDateCondition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineUpToDateUpdatingReason, + Message: "* In-place update in progress: Extension is updating Machine", }, }, { diff --git a/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go b/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go index 198d7ff67f8a..bc3917afe377 100644 --- a/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go +++ b/internal/controllers/machinedeployment/machinedeployment_canupdatemachineset.go @@ -42,7 +42,7 @@ func (p *rolloutPlanner) canUpdateMachineSetInPlace(ctx context.Context, oldMS, return p.overrideCanUpdateMachineSetInPlace(ctx, oldMS, newMS) } - log := ctrl.LoggerFrom(ctx) + log := ctrl.LoggerFrom(ctx).WithValues("MachineSet", klog.KObj(oldMS)) templateObjects, err := p.getTemplateObjects(ctx, oldMS, newMS) if err != nil { @@ -77,9 +77,10 @@ func (p *rolloutPlanner) canUpdateMachineSetInPlace(ctx context.Context, oldMS, return false, err } if !canUpdateMachineSet { - log.Info(fmt.Sprintf("MachineSet cannot be updated in-place by extensions: %s", strings.Join(reasons, ",")), "MachineSet", klog.KObj(oldMS)) + log.Info(fmt.Sprintf("MachineSet %s cannot be updated in-place by extensions", oldMS.Name), "reason", strings.Join(reasons, ",")) return false, nil } + log.Info(fmt.Sprintf("MachineSet %s can be updated in-place by extensions", oldMS.Name)) return true, nil } diff --git a/internal/controllers/machinedeployment/machinedeployment_controller.go b/internal/controllers/machinedeployment/machinedeployment_controller.go 
index d7014ba75490..aa8847aa416b 100644 --- a/internal/controllers/machinedeployment/machinedeployment_controller.go +++ b/internal/controllers/machinedeployment/machinedeployment_controller.go @@ -346,7 +346,7 @@ func (r *Reconciler) createOrUpdateMachineSetsAndSyncMachineDeploymentRevision(c // Create the MachineSet. if err := ssa.Patch(ctx, r.Client, machineDeploymentManagerName, ms); err != nil { r.recorder.Eventf(p.md, corev1.EventTypeWarning, "FailedCreate", "Failed to create MachineSet %s: %v", klog.KObj(ms), err) - return errors.Wrapf(err, "failed to create new MachineSet %s", klog.KObj(ms)) + return errors.Wrapf(err, "failed to create MachineSet %s", klog.KObj(ms)) } log.Info(fmt.Sprintf("MachineSet created (%s)", p.createReason)) r.recorder.Eventf(p.md, corev1.EventTypeNormal, "SuccessfulCreate", "Created MachineSet %s with %d replicas", klog.KObj(ms), ptr.Deref(ms.Spec.Replicas, 0)) @@ -370,6 +370,11 @@ func (r *Reconciler) createOrUpdateMachineSetsAndSyncMachineDeploymentRevision(c err := ssa.Patch(ctx, r.Client, machineDeploymentManagerName, ms, ssa.WithCachingProxy{Cache: r.ssaCache, Original: originalMS}) if err != nil { + // Note: If we are Applying a MachineSet with UID set and the MachineSet does not exist anymore, the + // kube-apiserver returns a conflict error. + if (apierrors.IsConflict(err) || apierrors.IsNotFound(err)) && !ms.DeletionTimestamp.IsZero() { + continue + } r.recorder.Eventf(p.md, corev1.EventTypeWarning, "FailedUpdate", "Failed to update MachineSet %s: %v", klog.KObj(ms), err) return errors.Wrapf(err, "failed to update MachineSet %s", klog.KObj(ms)) } @@ -388,7 +393,7 @@ func (r *Reconciler) createOrUpdateMachineSetsAndSyncMachineDeploymentRevision(c r.recorder.Eventf(p.md, corev1.EventTypeNormal, "SuccessfulScale", "Scaled up MachineSet %v: %d -> %d", ms.Name, originalReplicas, newReplicas) } if newReplicas == originalReplicas && len(changes) > 0 { - log.Info(fmt.Sprintf("MachineSet %s updated", ms.Name), "diff", strings.Join(changes, ",")) + log.Info(fmt.Sprintf("Updated MachineSet %s", ms.Name), "diff", strings.Join(changes, ",")) } // Only wait for cache if the object was changed. @@ -462,11 +467,11 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) error { for _, ms := range s.machineSets { if ms.DeletionTimestamp.IsZero() { if err := r.Client.Delete(ctx, ms); err != nil && !apierrors.IsNotFound(err) { - return errors.Wrapf(err, "failed to delete MachineSet %s", klog.KObj(ms)) + return errors.Wrapf(err, "failed to delete MachineSet %s (MachineDeployment deleting)", klog.KObj(ms)) } // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. // Also, setting DeletionTimestamp doesn't mean the MachineSet is actually deleted (deletion takes some time). 
- log.Info("Deleting MachineSet (MachineDeployment deleted)", "MachineSet", klog.KObj(ms)) + log.Info(fmt.Sprintf("MachineSet %s deleting (MachineDeployment deleting)", ms.Name), "MachineSet", klog.KObj(ms)) } } diff --git a/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go b/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go index 900b09e73834..336974f8ada8 100644 --- a/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go +++ b/internal/controllers/machinedeployment/machinedeployment_rollout_planner.go @@ -24,8 +24,6 @@ import ( "github.com/pkg/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/klog/v2" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" @@ -129,7 +127,7 @@ func (p *rolloutPlanner) init(ctx context.Context, md *clusterv1.MachineDeployme } if !mdTemplateExists { - return errors.New("cannot create a new MachineSet when templates do not exist") + return errors.New("cannot create a MachineSet when templates do not exist") } // Compute a new MachineSet with mandatory labels, fields in-place propagated from the MachineDeployment etc. @@ -200,7 +198,6 @@ func (p *rolloutPlanner) computeDesiredOldMS(ctx context.Context, currentOldMS * // computeDesiredMS computes the desired MachineSet, which could be either a newly created newMS, or the new desired version of an existing newMS/OldMS. // Note: because we are using Server-Side-Apply we always have to calculate the full object. func computeDesiredMS(ctx context.Context, deployment *clusterv1.MachineDeployment, currentMS *clusterv1.MachineSet) (*clusterv1.MachineSet, error) { - log := ctrl.LoggerFrom(ctx) var name string var uid types.UID var finalizers []string @@ -230,7 +227,6 @@ func computeDesiredMS(ctx context.Context, deployment *clusterv1.MachineDeployme replicas = 0 machineTemplateSpec = *deployment.Spec.Template.Spec.DeepCopy() creationTimestamp = metav1.NewTime(time.Now()) - log.V(5).Info(fmt.Sprintf("Computing new MachineSet %s with %d replicas", name, replicas), "MachineSet", klog.KRef(deployment.Namespace, name)) } else { // For updating an existing MachineSet use name, uid, finalizers, replicas, uniqueIdentifier and machine template spec from existingMS. // Note: We use the uid, to ensure that the Server-Side-Apply only updates the existingMS. diff --git a/internal/controllers/machinedeployment/machinedeployment_sync.go b/internal/controllers/machinedeployment/machinedeployment_sync.go index d9c975e8c876..18d1bd7cd7c7 100644 --- a/internal/controllers/machinedeployment/machinedeployment_sync.go +++ b/internal/controllers/machinedeployment/machinedeployment_sync.go @@ -333,7 +333,6 @@ func (r *Reconciler) cleanupDeployment(ctx context.Context, oldMSs []*clusterv1. } sort.Sort(mdutil.MachineSetsByCreationTimestamp(cleanableMSes)) - log.V(4).Info("Looking to cleanup old machine sets for deployment") for i := range cleanableMSCount { ms := cleanableMSes[i] @@ -346,15 +345,14 @@ func (r *Reconciler) cleanupDeployment(ctx context.Context, oldMSs []*clusterv1. continue } - log.V(4).Info("Trying to cleanup machine set for deployment", "MachineSet", klog.KObj(ms)) if err := r.Client.Delete(ctx, ms); err != nil && !apierrors.IsNotFound(err) { // Return error instead of aggregating and continuing DELETEs on the theory // that we may be overloading the api server. 
r.recorder.Eventf(deployment, corev1.EventTypeWarning, "FailedDelete", "Failed to delete MachineSet %q: %v", ms.Name, err) - return err + return errors.Wrapf(err, "failed to delete MachineSet %s (cleanup of old MachineSets)", klog.KObj(ms)) } // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. - log.Info("Deleting MachineSet (cleanup of old MachineSet)", "MachineSet", klog.KObj(ms)) + log.Info(fmt.Sprintf("MachineSet %s deleting (cleanup of old MachineSets)", ms.Name), "MachineSet", klog.KObj(ms)) r.recorder.Eventf(deployment, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted MachineSet %q", ms.Name) } diff --git a/internal/controllers/machinedeployment/mdutil/util.go b/internal/controllers/machinedeployment/mdutil/util.go index 80c4a536c273..d2f4b71cb4ea 100644 --- a/internal/controllers/machinedeployment/mdutil/util.go +++ b/internal/controllers/machinedeployment/mdutil/util.go @@ -483,7 +483,7 @@ func FindNewAndOldMachineSets(deployment *clusterv1.MachineDeployment, msList [] // No need to set an additional condition message, it is not used anywhere. upToDateResults[ms.Name] = upToDateResult } - diffs = append(diffs, fmt.Sprintf("MachineSet %s: diff: %s", ms.Name, strings.Join(upToDateResult.LogMessages, ", "))) + diffs = append(diffs, fmt.Sprintf("MachineSet %s needs rollout: %s", ms.Name, strings.Join(upToDateResult.LogMessages, ", "))) } } diff --git a/internal/controllers/machinedeployment/mdutil/util_test.go b/internal/controllers/machinedeployment/mdutil/util_test.go index 6a3c4e6c91f3..5c352c67e168 100644 --- a/internal/controllers/machinedeployment/mdutil/util_test.go +++ b/internal/controllers/machinedeployment/mdutil/util_test.go @@ -469,7 +469,7 @@ func TestFindNewAndOldMachineSets(t *testing.T) { EligibleForInPlaceUpdate: true, }, }, - expectedCreateReason: fmt.Sprintf(`couldn't find MachineSet matching MachineDeployment spec template: MachineSet %s: diff: spec.infrastructureRef InfrastructureMachineTemplate old-infra-ref, InfrastructureMachineTemplate new-infra-ref required`, oldMS.Name), + expectedCreateReason: fmt.Sprintf(`couldn't find MachineSet matching MachineDeployment spec template: MachineSet %s needs rollout: spec.infrastructureRef InfrastructureMachineTemplate old-infra-ref, InfrastructureMachineTemplate new-infra-ref required`, oldMS.Name), }, { Name: "Get the MachineSet with the MachineTemplate that matches the intent of the MachineDeployment", @@ -550,7 +550,7 @@ func TestFindNewAndOldMachineSets(t *testing.T) { EligibleForInPlaceUpdate: true, }, }, - expectedCreateReason: fmt.Sprintf(`couldn't find MachineSet matching MachineDeployment spec template: MachineSet %s: diff: spec.infrastructureRef InfrastructureMachineTemplate old-infra-ref, InfrastructureMachineTemplate new-infra-ref required`, oldMS.Name), + expectedCreateReason: fmt.Sprintf(`couldn't find MachineSet matching MachineDeployment spec template: MachineSet %s needs rollout: spec.infrastructureRef InfrastructureMachineTemplate old-infra-ref, InfrastructureMachineTemplate new-infra-ref required`, oldMS.Name), }, { Name: "Get nil if no MachineSet matches the desired intent of the MachineDeployment, reconciliationTime is > rolloutAfter", @@ -571,7 +571,7 @@ func TestFindNewAndOldMachineSets(t *testing.T) { EligibleForInPlaceUpdate: false, }, }, - expectedCreateReason: fmt.Sprintf(`couldn't find MachineSet matching MachineDeployment spec template: MachineSet %s: diff: spec.infrastructureRef InfrastructureMachineTemplate 
old-infra-ref, InfrastructureMachineTemplate new-infra-ref required, MachineDeployment spec.rolloutAfter expired`, oldMS.Name), + expectedCreateReason: fmt.Sprintf(`couldn't find MachineSet matching MachineDeployment spec template: MachineSet %s needs rollout: spec.infrastructureRef InfrastructureMachineTemplate old-infra-ref, InfrastructureMachineTemplate new-infra-ref required, MachineDeployment spec.rolloutAfter expired`, oldMS.Name), }, { Name: "Get the MachineSet if reconciliationTime < rolloutAfter", diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index 0c21e5b69e7e..a190ba799c7a 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -211,7 +211,6 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster var err error remoteClient, err = r.ClusterCache.GetClient(ctx, util.ObjectKey(cluster)) if err != nil { - logger.Error(err, "Error creating remote cluster cache") return ctrl.Result{}, err } @@ -224,8 +223,7 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster logger.V(3).Info("Finding targets") targets, err := r.getTargetsFromMHC(ctx, logger, remoteClient, cluster, m) if err != nil { - logger.Error(err, "Failed to fetch targets from MachineHealthCheck") - return ctrl.Result{}, err + return ctrl.Result{}, errors.Wrapf(err, "failed to fetch targets from MachineHealthCheck") } totalTargets := len(targets) m.Status.ExpectedMachines = ptr.To(int32(totalTargets)) diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 13c8594a2f19..f88a13773296 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -357,6 +357,8 @@ func (r *Reconciler) triggerInPlaceUpdate(ctx context.Context, s *scope) (ctrl.R errs := []error{} machinesTriggeredInPlace := []*clusterv1.Machine{} for _, machine := range s.machines { + log := log.WithValues("Machine", klog.KObj(machine)) + // If a machine is not updating in place, or if the in-place update has been already triggered, no-op if _, ok := machine.Annotations[clusterv1.UpdateInProgressAnnotation]; !ok || hooks.IsPending(runtimehooksv1.UpdateMachine, machine) { continue @@ -403,18 +405,18 @@ func (r *Reconciler) triggerInPlaceUpdate(ctx context.Context, s *scope) (ctrl.R // have to ensure we preserve PendingHooksAnnotation on existing Machines in MachineSet and that would lead to race // conditions when the Machine controller tries to remove the annotation and MachineSet adds it back. 
if err := r.Client.Patch(ctx, machine, client.MergeFrom(orig)); err != nil { - errs = append(errs, errors.Wrapf(err, "failed to start in-place update for Machine %s", klog.KObj(machine))) + errs = append(errs, errors.Wrapf(err, "failed to complete triggering in-place update for Machine %s", klog.KObj(machine))) continue } machinesTriggeredInPlace = append(machinesTriggeredInPlace, machine) - log.Info("Completed triggering in-place update", "Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Completed triggering in-place update for Machine %s", machine.Name)) r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulStartInPlaceUpdate", "Machine starting in-place update") } // Wait until the cache observed the Machine with PendingHooksAnnotation to ensure subsequent reconciles // will observe it as well and won't repeatedly call the logic to trigger in-place update. - if err := clientutil.WaitForCacheToBeUpToDate(ctx, r.Client, "starting in-place updates", machinesTriggeredInPlace...); err != nil { + if err := clientutil.WaitForCacheToBeUpToDate(ctx, r.Client, "complete triggering in-place update", machinesTriggeredInPlace...); err != nil { errs = append(errs, err) } if len(errs) > 0 { @@ -482,19 +484,19 @@ func (r *Reconciler) completeMoveMachine(ctx context.Context, s *scope, currentM // Write InfraMachine. // Note: Let's update InfraMachine first because that is the call that is most likely to fail. if err := ssa.Patch(ctx, r.Client, machineSetManagerName, desiredInfraMachine); err != nil { - return errors.Wrapf(err, "failed to update InfraMachine %s", klog.KObj(desiredInfraMachine)) + return errors.Wrapf(err, "failed to complete triggering in-place update for Machine %s", klog.KObj(desiredMachine)) } // Write BootstrapConfig. if desiredMachine.Spec.Bootstrap.ConfigRef.IsDefined() { if err := ssa.Patch(ctx, r.Client, machineSetManagerName, desiredBootstrapConfig); err != nil { - return errors.Wrapf(err, "failed to update BootstrapConfig %s", klog.KObj(desiredBootstrapConfig)) + return errors.Wrapf(err, "failed to complete triggering in-place update for Machine %s", klog.KObj(desiredMachine)) } } // Write Machine. if err := ssa.Patch(ctx, r.Client, machineSetManagerName, desiredMachine); err != nil { - return errors.Wrap(err, "failed to update Machine") + return errors.Wrapf(err, "failed to complete triggering in-place update for Machine %s", klog.KObj(desiredMachine)) } return nil @@ -584,7 +586,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result } // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). - log.Info("Deleting Machine (MachineSet deleted)", "Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Deleting Machine %s (MachineSet deleted)", machine.Name), "Machine", klog.KObj(machine)) } } @@ -750,10 +752,10 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e case diff < 0: // If there are not enough Machines, create missing Machines unless Machine creation is disabled. 
machinesToAdd := -diff - log.Info(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d machines", *(ms.Spec.Replicas), machinesToAdd), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines)) + log.Info(fmt.Sprintf("MachineSet is scaling up to %d replicas by creating %d Machines", *(ms.Spec.Replicas), machinesToAdd), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines)) if ms.Annotations != nil { if value, ok := ms.Annotations[clusterv1.DisableMachineCreateAnnotation]; ok && value == "true" { - log.Info("Automatic creation of new machines disabled for machine set") + log.Info("Automatic creation of new machines disabled for MachineSet") return ctrl.Result{}, nil } } @@ -776,7 +778,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e } } if notAcknowledgeMoveReplicas.Len() > 0 { - log.Info(fmt.Sprintf("Replicas %s moved from an old MachineSet still pending acknowledge from MachineDeployment", notAcknowledgeMoveReplicas.UnsortedList())) + log.V(5).Info(fmt.Sprintf("Machines %s moved from an old MachineSet still pending acknowledge from MachineDeployment", notAcknowledgeMoveReplicas.UnsortedList())) } machinesToDeleteOrMove := len(machines) - notAcknowledgeMoveReplicas.Len() - int(ptr.Deref(ms.Spec.Replicas, 0)) @@ -889,7 +891,7 @@ func (r *Reconciler) createMachines(ctx context.Context, s *scope, machinesToAdd } machinesAdded = append(machinesAdded, machine) - log.Info(fmt.Sprintf("Machine created (scale up, creating %d of %d)", i+1, machinesToAdd), "Machine", klog.KObj(machine)) + log.Info(fmt.Sprintf("Machine %s created (scale up, creating %d of %d)", machine.Name, i+1, machinesToAdd), "Machine", klog.KObj(machine)) r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulCreate", "Created Machine %q", machine.Name) } @@ -934,7 +936,7 @@ func (r *Reconciler) deleteMachines(ctx context.Context, s *scope, machinesToDel } machinesDeleted = append(machinesDeleted, machine) - log.Info(fmt.Sprintf("Deleting Machine (scale down, deleting %d of %d)", i+1, machinesToDelete)) + log.Info(fmt.Sprintf("Machine %s deleting (scale down, deleting %d of %d)", machine.Name, i+1, machinesToDelete)) r.recorder.Eventf(ms, corev1.EventTypeNormal, "SuccessfulDelete", "Deleted Machine %q", machine.Name) } else { log.Info(fmt.Sprintf("Waiting for Machine to be deleted (scale down, deleting %d of %d)", i+1, machinesToDelete)) @@ -1010,6 +1012,8 @@ func (r *Reconciler) startMoveMachines(ctx context.Context, s *scope, targetMSNa continue } + log.Info(fmt.Sprintf("Triggering in-place update for Machine %s (by moving to MachineSet %s)", machine.Name, targetMS.Name)) + // Perform the first part of a move operation, the part under the responsibility of the oldMS. orig := machine.DeepCopy() @@ -1047,8 +1051,7 @@ func (r *Reconciler) startMoveMachines(ctx context.Context, s *scope, targetMSNa // UpdateInProgressAnnotation on existing Machines and that would lead to race conditions when // the Machine controller tries to remove the annotation and then the MachineSet controller adds it back. 
if err := r.Client.Patch(ctx, machine, client.MergeFrom(orig)); err != nil { - log.Error(err, "Failed to start moving Machine") - errs = append(errs, errors.Wrapf(err, "failed to start moving Machine %s", klog.KObj(machine))) + errs = append(errs, errors.Wrapf(err, "failed to trigger in-place update for Machine %s by moving to MachineSet %s", klog.KObj(machine), klog.KObj(targetMS))) continue } machinesMoved = append(machinesMoved, machine) @@ -1407,7 +1410,7 @@ func (r *Reconciler) reconcileV1Beta1Status(ctx context.Context, s *scope) error // are actually provisioned (vs reporting completed immediately after the last machine object is created). This convention is also used by KCP. if ms.Status.Deprecated.V1Beta1.ReadyReplicas == currentReplicas { if v1beta1conditions.IsFalse(ms, clusterv1.ResizedV1Beta1Condition) { - log.Info("All the replicas are ready", "replicas", ms.Status.Deprecated.V1Beta1.ReadyReplicas) + log.V(5).Info("All the replicas are ready", "replicas", ms.Status.Deprecated.V1Beta1.ReadyReplicas) } v1beta1conditions.MarkTrue(ms, clusterv1.ResizedV1Beta1Condition) } @@ -1668,7 +1671,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) ( } // Note: We intentionally log after Delete because we want this log line to show up only after DeletionTimestamp has been set. // Also, setting DeletionTimestamp doesn't mean the Machine is actually deleted (deletion takes some time). - log.Info("Deleting Machine (remediating unhealthy Machine)", "Machine", klog.KObj(m)) + log.Info(fmt.Sprintf("Deleting Machine %s (remediating unhealthy Machine)", m.Name), "Machine", klog.KObj(m)) } if len(errs) > 0 { return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines") diff --git a/internal/controllers/machineset/machineset_controller_status.go b/internal/controllers/machineset/machineset_controller_status.go index b1fa1f493e67..40d8bdf89c03 100644 --- a/internal/controllers/machineset/machineset_controller_status.go +++ b/internal/controllers/machineset/machineset_controller_status.go @@ -199,7 +199,13 @@ func setScalingDownCondition(_ context.Context, ms *clusterv1.MachineSet, machin if !ms.DeletionTimestamp.IsZero() { desiredReplicas = 0 } - currentReplicas := int32(len(machines)) + currentReplicas := int32(0) + for _, m := range machines { + if _, ok := m.Annotations[clusterv1.PendingAcknowledgeMoveAnnotation]; ok { + continue + } + currentReplicas++ + } // Scaling down. 
if currentReplicas > desiredReplicas { diff --git a/internal/controllers/topology/machineset/machineset_controller.go b/internal/controllers/topology/machineset/machineset_controller.go index 0a90ffef1126..af80c79d024f 100644 --- a/internal/controllers/topology/machineset/machineset_controller.go +++ b/internal/controllers/topology/machineset/machineset_controller.go @@ -19,12 +19,14 @@ package machineset import ( "context" "fmt" + "time" "github.com/pkg/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" @@ -39,7 +41,6 @@ import ( "sigs.k8s.io/cluster-api/util/finalizers" "sigs.k8s.io/cluster-api/util/labels" clog "sigs.k8s.io/cluster-api/util/log" - "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" ) @@ -159,17 +160,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re return ctrl.Result{}, nil } - // Create a patch helper to add or remove the finalizer from the MachineSet. - patchHelper, err := patch.NewHelper(ms, r.Client) - if err != nil { - return ctrl.Result{}, err - } - defer func() { - if err := patchHelper.Patch(ctx, ms); err != nil { - reterr = kerrors.NewAggregate([]error{reterr, err}) - } - }() - // Handle deletion reconciliation loop. if !ms.DeletionTimestamp.IsZero() { return ctrl.Result{}, r.reconcileDelete(ctx, ms) @@ -221,10 +211,26 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, ms *clusterv1.MachineS return errors.Wrapf(err, "failed to delete %s %s for MachineSet %s", ref.Kind, klog.KRef(ms.Namespace, ref.Name), klog.KObj(ms)) } - // Remove the finalizer so the MachineSet can be garbage collected by Kubernetes. - controllerutil.RemoveFinalizer(ms, clusterv1.MachineSetTopologyFinalizer) + // Note: It can happen that both MachineSet controllers are going through reconcileDelete at the same time. + // If the MachineSet does not have any Machines anymore at this point, the topology/machineset controller will + // likely complete reconcileDelete after the regular MachineSet controller. In that case it might happen + // that removing the finalizer here would try to re-add the finalizer of the other controller and then the + // request to the apiserver fails with: "Forbidden: no new finalizers can be added if the object is being deleted" + msKey := client.ObjectKeyFromObject(ms) + return retry.RetryOnConflict(wait.Backoff{ + Steps: 3, + Duration: 50 * time.Millisecond, + Factor: 1.0, + Jitter: 0.1, + }, func() error { + if err := r.Client.Get(ctx, msKey, ms); err != nil { + return client.IgnoreNotFound(err) + } - return nil + orig := ms.DeepCopy() + controllerutil.RemoveFinalizer(ms, clusterv1.MachineSetTopologyFinalizer) + return client.IgnoreNotFound(r.Client.Patch(ctx, ms, client.MergeFrom(orig))) + }) } // getMachineDeploymentName calculates the MachineDeployment name based on owner references. 
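// Editor's note: the following is an illustrative sketch, not part of the diff above.
// The reconcileDelete change in topology/machineset/machineset_controller.go re-reads the
// MachineSet and removes the finalizer with a MergeFrom patch inside retry.RetryOnConflict,
// so a concurrent finalizer removal by the regular MachineSet controller does not surface
// as a hard error. A minimal, generic version of that pattern (helper name is hypothetical;
// the backoff values mirror the hunk above):
package example

import (
	"context"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
)

func removeFinalizerWithRetry(ctx context.Context, c client.Client, obj client.Object, finalizer string) error {
	key := client.ObjectKeyFromObject(obj)
	backoff := wait.Backoff{Steps: 3, Duration: 50 * time.Millisecond, Factor: 1.0, Jitter: 0.1}
	return retry.RetryOnConflict(backoff, func() error {
		// Re-read the latest object; if it is already gone there is nothing left to do.
		if err := c.Get(ctx, key, obj); err != nil {
			return client.IgnoreNotFound(err)
		}
		orig := obj.DeepCopyObject().(client.Object)
		controllerutil.RemoveFinalizer(obj, finalizer)
		// Patch only the finalizer change; tolerate the object being deleted in the meantime.
		return client.IgnoreNotFound(c.Patch(ctx, obj, client.MergeFrom(orig)))
	})
}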
diff --git a/internal/runtime/client/client.go b/internal/runtime/client/client.go index 61aa3704bbf1..22bc11fe29d6 100644 --- a/internal/runtime/client/client.go +++ b/internal/runtime/client/client.go @@ -395,7 +395,7 @@ func (c *client) CallExtension(ctx context.Context, hook runtimecatalog.Hook, fo } if retryResponse, ok := response.(runtimehooksv1.RetryResponseObject); ok && retryResponse.GetRetryAfterSeconds() != 0 { - log.Info(fmt.Sprintf("Extension handler returned blocking response with retryAfterSeconds of %d", retryResponse.GetRetryAfterSeconds())) + log.V(4).Info(fmt.Sprintf("Extension handler returned blocking response with retryAfterSeconds of %d", retryResponse.GetRetryAfterSeconds())) } else { log.V(4).Info("Extension handler returned success response") } diff --git a/internal/util/tree/tree.go b/internal/util/tree/tree.go index d34d7f1e8f4d..e20d3a40613f 100644 --- a/internal/util/tree/tree.go +++ b/internal/util/tree/tree.go @@ -646,7 +646,7 @@ func newRowDescriptor(obj ctrlclient.Object) rowDescriptor { // If the object is a MachineDeployment, returns all the replica counters and pick the Available condition // as the condition to show for this object in case not all the conditions are visualized. if obj.Spec.Replicas != nil { - v.replicas = fmt.Sprintf("%d/%d", *obj.Spec.Replicas, ptr.Deref(obj.Status.Replicas, 0)) + v.replicas = fmt.Sprintf("%d/%d", ptr.Deref(obj.Status.Replicas, 0), ptr.Deref(obj.Spec.Replicas, 0)) } if obj.Status.AvailableReplicas != nil { v.availableCounters = fmt.Sprintf("%d", *obj.Status.AvailableReplicas) @@ -670,7 +670,7 @@ func newRowDescriptor(obj ctrlclient.Object) rowDescriptor { // If the object is a MachineSet, returns all the replica counters; no condition // is shown for this object in case not all the conditions are visualized. if obj.Spec.Replicas != nil { - v.replicas = fmt.Sprintf("%d/%d", *obj.Spec.Replicas, ptr.Deref(obj.Status.Replicas, 0)) + v.replicas = fmt.Sprintf("%d/%d", ptr.Deref(obj.Status.Replicas, 0), ptr.Deref(obj.Spec.Replicas, 0)) } if obj.Status.AvailableReplicas != nil { v.availableCounters = fmt.Sprintf("%d", *obj.Status.AvailableReplicas) diff --git a/test/extension/handlers/inplaceupdate/handlers.go b/test/extension/handlers/inplaceupdate/handlers.go index 9b5c8524b5aa..dedcbd0bc879 100644 --- a/test/extension/handlers/inplaceupdate/handlers.go +++ b/test/extension/handlers/inplaceupdate/handlers.go @@ -215,10 +215,10 @@ func (h *ExtensionHandlers) DoUpdateMachine(ctx context.Context, req *runtimehoo // Note: We are intentionally not actually applying any in-place changes we are just faking them, // which is good enough for test purposes. 
if firstTimeCalled, ok := h.state.Load(key); ok { - if time.Since(firstTimeCalled.(time.Time)) > time.Duration(20+rand.Intn(10))*time.Second { + if time.Since(firstTimeCalled.(time.Time)) > time.Duration(30+rand.Intn(10))*time.Second { h.state.Delete(key) resp.Status = runtimehooksv1.ResponseStatusSuccess - resp.Message = "In-place update is done" + resp.Message = "Extension completed updating Machine" resp.RetryAfterSeconds = 0 return } @@ -227,8 +227,8 @@ func (h *ExtensionHandlers) DoUpdateMachine(ctx context.Context, req *runtimehoo } resp.Status = runtimehooksv1.ResponseStatusSuccess - resp.Message = "In-place update still in progress" - resp.RetryAfterSeconds = 5 + resp.Message = "Extension is updating Machine" + resp.RetryAfterSeconds = 15 } func (h *ExtensionHandlers) getObjectsFromCanUpdateMachineRequest(req *runtimehooksv1.CanUpdateMachineRequest) (*clusterv1.Machine, *clusterv1.Machine, runtime.Object, runtime.Object, runtime.Object, runtime.Object, error) { //nolint:gocritic // accepting high number of return parameters for now diff --git a/util/patch/patch.go b/util/patch/patch.go index e34b53bcae62..64378f457e68 100644 --- a/util/patch/patch.go +++ b/util/patch/patch.go @@ -170,17 +170,17 @@ func (h *Helper) Patch(ctx context.Context, obj client.Object, opts ...Option) e // patching conditions first avoids an extra loop if spec or status patch succeeds first // given that causes the resourceVersion to mutate. if err := h.patchStatusConditions(ctx, obj, options.ForceOverwriteConditions, options.OwnedConditions, options.OwnedV1Beta2Conditions); err != nil { - errs = append(errs, err) + errs = append(errs, errors.Wrapf(err, "failed to patch status conditions")) } // Then proceed to patch the rest of the object. if err := h.patch(ctx, obj); err != nil { - errs = append(errs, err) + errs = append(errs, errors.Wrapf(err, "failed to patch spec and metadata")) } if err := h.patchStatus(ctx, obj); err != nil { //nolint:staticcheck if !(apierrors.IsNotFound(err) && !obj.GetDeletionTimestamp().IsZero() && len(obj.GetFinalizers()) == 0) { - errs = append(errs, err) + errs = append(errs, errors.Wrapf(err, "failed to patch status")) } }
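// Editor's note: the following is an illustrative sketch, not part of the diff above.
// The util/patch/patch.go hunk (and similar ones in machinehealthcheck_controller.go and
// machineset_controller.go) replaces bare error returns, or "log the error and return it",
// with errors.Wrapf so the single log line emitted by the caller names the failing step.
// A minimal example of that convention; the per-phase functions are hypothetical stand-ins
// for the wrapped calls:
package example

import (
	"github.com/pkg/errors"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
)

func patchAll() error {
	var errs []error
	if err := patchStatusConditionsStep(); err != nil {
		// Wrap with the phase name instead of logging here; the caller logs the
		// aggregated error exactly once.
		errs = append(errs, errors.Wrapf(err, "failed to patch status conditions"))
	}
	if err := patchSpecStep(); err != nil {
		errs = append(errs, errors.Wrapf(err, "failed to patch spec and metadata"))
	}
	if err := patchStatusStep(); err != nil {
		errs = append(errs, errors.Wrapf(err, "failed to patch status"))
	}
	return kerrors.NewAggregate(errs)
}

// Hypothetical stand-ins for the individual patch calls wrapped in the diff.
func patchStatusConditionsStep() error { return nil }
func patchSpecStep() error             { return nil }
func patchStatusStep() error           { return nil }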