Skip to content

Commit 447629b

Browse files
ytimocin and claude authored
fix: don't treat in-progress binding states as terminal failures (#670)
HasBindingFailed previously returned true for any condition with Status=False, regardless of the Reason. Several in-progress reasons on the per-cluster binding conditions (Overridden, WorkSynchronized, Applied, Available) are expressed as Status=False rather than Status=Unknown — most notably WorkNotSynchronizedYetReason and NotAvailableYetReason. The over-broad classification caused upstream controllers (rollout, updaterun) to bail on bindings that were still making forward progress. Switch HasBindingFailed to consult an explicit allowlist of non-terminal reasons (nonTerminalBindingFailureReasons) drawn from the condition package: OverriddenPendingReason, WorkNotSynchronizedYetReason, ApplyPendingReason, NotAvailableYetReason. Anything outside that set is treated as terminal — the safer default, since wrongly classifying an unknown reason as "transient" would stall rollouts forever. Per code review feedback (the original draft used a `strings.Contains` on the literal "failed", which is fragile against typos and convention drift), the allowlist uses typed constants. New "Pending" / "NotYet" reasons added to pkg/utils/condition must also be added to the allowlist; this is documented inline. Tests: - TestHasBindingFailed_NonTerminalReasons covers each known transient reason returning false, each known terminal reason returning true, an unknown-reason fall-through to terminal, and a mixed-condition case where a transient reason on one condition does not mask a terminal failure on another. - Existing TestHasBindingFailed (which uses camelCase reason strings not in the allowlist) still passes — those legacy strings are correctly classified as terminal. - TestCheckClusterUpdateResult in the updaterun package was encoding the previous bug: it asserted that WorkNotSynchronizedYet should produce an error from checkClusterUpdateResult. Updated the case to match corrected semantics (in-progress, no error) and added a new case for the terminal SyncWorkFailedReason path. 
This aligns with the existing TODO at execution.go:626 that called for distinguishing recoverable from non-recoverable failures. Fixes #648 Signed-off-by: Yetkin Timocin <ytimocin@microsoft.com> Co-authored-by: Claude Code <noreply@anthropic.com>
1 parent abfa3ef commit 447629b

3 files changed

Lines changed: 161 additions & 6 deletions

File tree

pkg/controllers/updaterun/execution_test.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,9 @@ func TestCheckClusterUpdateResult(t *testing.T) {
242242
wantErr: true,
243243
},
244244
{
245-
name: "checkClusterUpdateResult should return false and error if the binding has false workSynchronized condition",
245+
// WorkNotSynchronizedYet is an in-progress reason, not a terminal failure (issue #648).
246+
// checkClusterUpdateResult now treats it as "still updating" rather than an error.
247+
name: "checkClusterUpdateResult should return false but no error if workSynchronized is in-progress (WorkNotSynchronizedYet)",
246248
binding: &placementv1beta1.ClusterResourceBinding{
247249
ObjectMeta: metav1.ObjectMeta{Generation: 1},
248250
Status: placementv1beta1.ResourceBindingStatus{
@@ -258,6 +260,25 @@ func TestCheckClusterUpdateResult(t *testing.T) {
258260
},
259261
clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"},
260262
wantSucceeded: false,
263+
wantErr: false,
264+
},
265+
{
266+
name: "checkClusterUpdateResult should return false and error if workSynchronized failed terminally (SyncWorkFailed)",
267+
binding: &placementv1beta1.ClusterResourceBinding{
268+
ObjectMeta: metav1.ObjectMeta{Generation: 1},
269+
Status: placementv1beta1.ResourceBindingStatus{
270+
Conditions: []metav1.Condition{
271+
{
272+
Type: string(placementv1beta1.ResourceBindingWorkSynchronized),
273+
Status: metav1.ConditionFalse,
274+
ObservedGeneration: 1,
275+
Reason: condition.SyncWorkFailedReason,
276+
},
277+
},
278+
},
279+
},
280+
clusterStatus: &placementv1beta1.ClusterUpdatingStatus{ClusterName: "test-cluster"},
281+
wantSucceeded: false,
261282
wantErr: true,
262283
},
263284
{

pkg/utils/binding/binding.go

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,51 @@ import (
2323
"github.com/kubefleet-dev/kubefleet/pkg/utils/condition"
2424
)
2525

26-
// HasBindingFailed checks if BindingObj has failed based on its conditions.
26+
// nonTerminalBindingFailureReasons is the closed set of `Reason` strings that the per-cluster
27+
// binding conditions (Overridden, WorkSynchronized, Applied, Available) can carry while still
28+
// `Status=False` *without* representing a terminal failure. Each one means the binding is still
29+
// progressing and the controller should keep waiting rather than treat the binding as failed.
30+
//
31+
// The set is intentionally an allowlist: any new `Status=False` reason added to the codebase is
32+
// treated as terminal until it is explicitly added here. This is the safer default because the
33+
// alternative (treat unknown reasons as transient) would stall the rollout controller forever
34+
// on a genuinely-failing binding it does not yet know how to classify.
35+
//
36+
// When you add a new in-progress / pending / "not yet" reason to pkg/utils/condition/reason.go
37+
// for any of the above conditions, also add it here.
38+
var nonTerminalBindingFailureReasons = map[string]struct{}{
39+
condition.OverriddenPendingReason: {},
40+
condition.WorkNotSynchronizedYetReason: {},
41+
condition.ApplyPendingReason: {},
42+
condition.NotAvailableYetReason: {},
43+
}
44+
45+
// HasBindingFailed reports whether a binding has reached a terminal failure on any of its
46+
// per-cluster conditions (Overridden, WorkSynchronized, Applied, Available).
47+
//
48+
// A condition counts as a terminal failure when its `Status` is `False` *and* its `Reason` is
49+
// not in `nonTerminalBindingFailureReasons`. The Reason check is necessary because several of
50+
// the in-progress states (e.g. `WorkNotSynchronizedYetReason`, `NotAvailableYetReason`) are
51+
// expressed as `Status=False` rather than `Status=Unknown`. Treating those as failures was the
52+
// previous bug — it caused the rollout controller to drop bindings that were still progressing
53+
// out of `canBeReadyBindings`, stalling rollout decisions.
54+
//
55+
// Unknown reasons fall through to "terminal" by design; see the comment on
56+
// `nonTerminalBindingFailureReasons`.
2757
func HasBindingFailed(binding placementv1beta1.BindingObj) bool {
2858
for i := condition.OverriddenCondition; i <= condition.AvailableCondition; i++ {
29-
if condition.IsConditionStatusFalse(binding.GetCondition(string(i.ResourceBindingConditionType())), binding.GetGeneration()) {
30-
// TODO: parse the reason of the condition to see if the failure is recoverable/retriable or not
31-
klog.V(2).Infof("binding %s has condition %s with status false", klog.KObj(binding), string(i.ResourceBindingConditionType()))
32-
return true
59+
c := binding.GetCondition(string(i.ResourceBindingConditionType()))
60+
if !condition.IsConditionStatusFalse(c, binding.GetGeneration()) {
61+
continue
62+
}
63+
if _, transient := nonTerminalBindingFailureReasons[c.Reason]; transient {
64+
klog.V(3).Infof("binding %s has condition %s status false with non-terminal reason %q; treating as in-progress",
65+
klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason)
66+
continue
3367
}
68+
klog.V(2).Infof("binding %s has terminal failure on condition %s (reason %q)",
69+
klog.KObj(binding), string(i.ResourceBindingConditionType()), c.Reason)
70+
return true
3471
}
3572
return false
3673
}

pkg/utils/binding/binding_test.go

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2323

2424
placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1"
25+
"github.com/kubefleet-dev/kubefleet/pkg/utils/condition"
2526
)
2627

2728
func TestHasBindingFailed(t *testing.T) {
@@ -326,6 +327,102 @@ func TestHasBindingFailed(t *testing.T) {
326327
}
327328
}
328329

330+
// TestHasBindingFailed_NonTerminalReasons verifies that conditions with Status=False but a
331+
// known in-progress reason (WorkNotSynchronizedYet, NotAvailableYet, OverriddenPending,
332+
// ApplyPending) are NOT treated as failures. This is the bug fix for issue #648.
333+
func TestHasBindingFailed_NonTerminalReasons(t *testing.T) {
334+
bindingWithCondition := func(condType string, reason string) *placementv1beta1.ClusterResourceBinding {
335+
return &placementv1beta1.ClusterResourceBinding{
336+
ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1},
337+
Status: placementv1beta1.ResourceBindingStatus{
338+
Conditions: []metav1.Condition{
339+
{
340+
Type: condType,
341+
Status: metav1.ConditionFalse,
342+
ObservedGeneration: 1,
343+
Reason: reason,
344+
Message: "in progress",
345+
},
346+
},
347+
},
348+
}
349+
}
350+
351+
tests := []struct {
352+
name string
353+
binding *placementv1beta1.ClusterResourceBinding
354+
want bool
355+
}{
356+
{
357+
name: "WorkSynchronized=False with WorkNotSynchronizedYet is in-progress, not a failure",
358+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingWorkSynchronized), condition.WorkNotSynchronizedYetReason),
359+
want: false,
360+
},
361+
{
362+
name: "Available=False with NotAvailableYet is in-progress, not a failure",
363+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingAvailable), condition.NotAvailableYetReason),
364+
want: false,
365+
},
366+
{
367+
name: "Overridden=False with OverriddenPending is in-progress, not a failure",
368+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenPendingReason),
369+
want: false,
370+
},
371+
{
372+
name: "Applied=False with ApplyPending is in-progress, not a failure",
373+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyPendingReason),
374+
want: false,
375+
},
376+
{
377+
name: "Applied=False with ApplyFailed is a terminal failure",
378+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), condition.ApplyFailedReason),
379+
want: true,
380+
},
381+
{
382+
name: "Overridden=False with OverriddenFailed is a terminal failure",
383+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingOverridden), condition.OverriddenFailedReason),
384+
want: true,
385+
},
386+
{
387+
name: "Status=False with an unknown reason defaults to terminal (safer default)",
388+
binding: bindingWithCondition(string(placementv1beta1.ResourceBindingApplied), "SomeReasonNobodyAddedToTheAllowlist"),
389+
want: true,
390+
},
391+
{
392+
name: "non-terminal reason on one condition does not mask a terminal failure on another",
393+
binding: &placementv1beta1.ClusterResourceBinding{
394+
ObjectMeta: metav1.ObjectMeta{Name: "test-binding", Generation: 1},
395+
Status: placementv1beta1.ResourceBindingStatus{
396+
Conditions: []metav1.Condition{
397+
{
398+
Type: string(placementv1beta1.ResourceBindingOverridden),
399+
Status: metav1.ConditionFalse,
400+
ObservedGeneration: 1,
401+
Reason: condition.OverriddenPendingReason,
402+
},
403+
{
404+
Type: string(placementv1beta1.ResourceBindingApplied),
405+
Status: metav1.ConditionFalse,
406+
ObservedGeneration: 1,
407+
Reason: condition.ApplyFailedReason,
408+
},
409+
},
410+
},
411+
},
412+
want: true,
413+
},
414+
}
415+
416+
for _, tc := range tests {
417+
t.Run(tc.name, func(t *testing.T) {
418+
got := HasBindingFailed(tc.binding)
419+
if got != tc.want {
420+
t.Errorf("HasBindingFailed(%v) = %v, want %v", tc.binding, got, tc.want)
421+
}
422+
})
423+
}
424+
}
425+
329426
func TestIsBindingReportDiff(t *testing.T) {
330427
tests := []struct {
331428
name string

0 commit comments

Comments
 (0)