Skip to content

Commit cbd97cd

Browse files
IshwarKanse and claude
committed
fix: filter by container name in restart detection loop
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 8bcce30 commit cbd97cd

1 file changed

Lines changed: 63 additions & 52 deletions

File tree

test/e2e/zz_tls_profile_test.go

Lines changed: 63 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,10 @@ func TestTLSProfileWatcher(t *testing.T) {
6161
// running and healthy when the cluster uses the default TLS profile (no
6262
// tlsSecurityProfile set on the APIServer CR, which defaults to Intermediate).
6363
func assertOperatorRunningWithDefaultTLSProfile(t *testing.T) {
64-
// Verify the APIServer CR exists and is readable.
6564
apiServer := &configv1.APIServer{}
6665
err := f.K8sClient.Get(context.Background(), types.NamespacedName{Name: "cluster"}, apiServer)
6766
assert.NilError(t, err, "failed to get APIServer CR")
6867

69-
// Verify the operator deployment is ready.
7068
f.AssertDeploymentReady(operatorDeploymentName, f.OperatorNamespace,
7169
framework.WithTimeout(2*time.Minute))(t)
7270

@@ -78,36 +76,29 @@ func assertOperatorRunningWithDefaultTLSProfile(t *testing.T) {
7876
func assertOperatorRestartsOnTLSProfileChange(t *testing.T) {
7977
ctx := context.Background()
8078

81-
// Save original APIServer spec so we can restore it.
8279
apiServer := &configv1.APIServer{}
8380
err := f.K8sClient.Get(ctx, types.NamespacedName{Name: "cluster"}, apiServer)
8481
assert.NilError(t, err, "failed to get APIServer CR")
8582
originalTLSProfile := apiServer.Spec.TLSSecurityProfile
8683

87-
// Ensure we restore the original profile even if the test fails.
8884
f.CleanUp(t, func() {
8985
restoreTLSProfile(t, originalTLSProfile)
9086
})
9187

92-
// Wait for the operator to be stable before we start.
9388
f.AssertDeploymentReady(operatorDeploymentName, f.OperatorNamespace,
9489
framework.WithTimeout(2*time.Minute))(t)
9590

96-
// Record the current container restart count.
9791
initialRestarts := getOperatorContainerRestartCount(t)
9892
t.Logf("container restart count before TLS change: %d", initialRestarts)
9993

100-
// Change TLS profile to Old.
10194
t.Log("setting TLS profile to Old")
10295
setTLSProfile(t, &configv1.TLSSecurityProfile{
10396
Type: configv1.TLSProfileOldType,
10497
Old: &configv1.OldTLSProfile{},
10598
})
10699

107-
// The operator should restart: wait for the restart count to increase.
108100
waitForOperatorContainerRestart(t, initialRestarts)
109101

110-
// After restart, the operator should become ready again.
111102
f.AssertDeploymentReady(operatorDeploymentName, f.OperatorNamespace,
112103
framework.WithTimeout(5*time.Minute))(t)
113104
t.Log("operator restarted and is ready after TLS profile change to Old")
@@ -131,12 +122,8 @@ func assertOperatorRestartsOnCustomTLSProfile(t *testing.T) {
131122
f.AssertDeploymentReady(operatorDeploymentName, f.OperatorNamespace,
132123
framework.WithTimeout(5*time.Minute))(t)
133124

134-
// Wait for the restart count to stabilize — the previous test's cleanup
135-
// may have triggered a restart that hasn't completed yet.
136125
initialRestarts := waitForStableRestartCount(t)
137126

138-
// Set a Custom TLS profile that differs from the default Intermediate
139-
// profile by using a smaller cipher list (a subset of Intermediate's ciphers).
140127
t.Log("setting TLS profile to Custom")
141128
setTLSProfile(t, &configv1.TLSSecurityProfile{
142129
Type: configv1.TLSProfileCustomType,
@@ -173,7 +160,6 @@ func assertOperatorStableOnNonTLSChange(t *testing.T) {
173160
framework.WithTimeout(5*time.Minute))(t)
174161
initialRestarts := waitForStableRestartCount(t)
175162

176-
// Update a non-TLS field: add a test annotation.
177163
const testAnnotation = "observability-operator.rhobs/e2e-tls-test"
178164
apiServer := &configv1.APIServer{}
179165
err := f.K8sClient.Get(ctx, types.NamespacedName{Name: "cluster"}, apiServer)
@@ -201,22 +187,22 @@ func assertOperatorStableOnNonTLSChange(t *testing.T) {
201187
assert.NilError(t, err, "failed to patch APIServer with test annotation")
202188
t.Log("added test annotation to APIServer CR")
203189

204-
// Wait a reasonable period and verify the restart count has not increased.
205190
t.Log("waiting 60s to confirm operator does not restart")
206191
err = wait.PollUntilContextTimeout(ctx, 10*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
207-
currentRestarts := getOperatorContainerRestartCount(t)
192+
currentRestarts, restartErr := operatorContainerRestartCount()
193+
if restartErr != nil {
194+
return false, nil
195+
}
208196
if currentRestarts > initialRestarts {
209197
return true, fmt.Errorf("operator restarted unexpectedly: restart count changed from %d to %d", initialRestarts, currentRestarts)
210198
}
211-
return false, nil // keep polling — we want this to NOT match (timeout = success)
199+
return false, nil
212200
})
213201

214202
if wait.Interrupted(err) {
215-
// Timeout means the restart count never changed — that's what we want.
216203
t.Log("operator remained stable after non-TLS APIServer change")
217204
return
218205
}
219-
// If we get here, it means the restart count increased — fail.
220206
assert.NilError(t, err, "operator should not restart on non-TLS APIServer changes")
221207
}
222208

@@ -254,7 +240,6 @@ func restoreTLSProfile(t *testing.T, profile *configv1.TLSSecurityProfile) {
254240
return
255241
}
256242

257-
// Wait for operator to recover after the restore.
258243
_ = wait.PollUntilContextTimeout(ctx, 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
259244
dep := &appsv1.Deployment{}
260245
if err := f.K8sClient.Get(ctx, types.NamespacedName{
@@ -267,52 +252,77 @@ func restoreTLSProfile(t *testing.T, profile *configv1.TLSSecurityProfile) {
267252
})
268253
}
269254

270-
// getOperatorContainerRestartCount returns the total restart count of the
271-
// operator container in the running operator pod.
272-
func getOperatorContainerRestartCount(t *testing.T) int32 {
273-
t.Helper()
274-
275-
pod := getRunningOperatorPod(t)
255+
// operatorContainerRestartCount returns the restart count for the operator
256+
// container, or an error if the pod or container cannot be found. This is
257+
// safe to call inside poll loops since it does not call t.Fatal.
258+
func operatorContainerRestartCount() (int32, error) {
259+
pod, err := runningOperatorPod()
260+
if err != nil {
261+
return 0, err
262+
}
276263
for _, cs := range pod.Status.ContainerStatuses {
277264
if cs.Name == operatorContainerName {
278-
return cs.RestartCount
265+
return cs.RestartCount, nil
279266
}
280267
}
281-
282-
t.Fatalf("container %q not found in operator pod", operatorContainerName)
283-
return 0
268+
return 0, fmt.Errorf("container %q not found in operator pod", operatorContainerName)
284269
}
285270

286-
// getRunningOperatorPod returns the running operator pod.
287-
func getRunningOperatorPod(t *testing.T) *corev1.Pod {
288-
t.Helper()
271+
// runningOperatorPod returns the running operator pod, or an error if it
272+
// cannot be found. This is safe to call inside poll loops since it does
273+
// not call t.Fatal.
274+
func runningOperatorPod() (*corev1.Pod, error) {
275+
ctx := context.Background()
289276

290277
dep := &appsv1.Deployment{}
291-
err := f.K8sClient.Get(context.Background(), types.NamespacedName{
278+
if err := f.K8sClient.Get(ctx, types.NamespacedName{
292279
Name: operatorDeploymentName,
293280
Namespace: f.OperatorNamespace,
294-
}, dep)
295-
assert.NilError(t, err, "failed to get operator deployment")
281+
}, dep); err != nil {
282+
return nil, fmt.Errorf("failed to get operator deployment: %w", err)
283+
}
296284

297285
selector, err := metav1.LabelSelectorAsSelector(dep.Spec.Selector)
298-
assert.NilError(t, err, "failed to parse deployment selector")
286+
if err != nil {
287+
return nil, fmt.Errorf("failed to parse deployment selector: %w", err)
288+
}
299289

300290
var pods corev1.PodList
301-
err = f.K8sClient.List(context.Background(), &pods,
291+
if err := f.K8sClient.List(ctx, &pods,
302292
client.InNamespace(f.OperatorNamespace),
303293
client.MatchingLabelsSelector{Selector: selector},
304-
)
305-
assert.NilError(t, err, "failed to list operator pods")
294+
); err != nil {
295+
return nil, fmt.Errorf("failed to list operator pods: %w", err)
296+
}
306297

307298
for i := range pods.Items {
308299
p := &pods.Items[i]
309300
if p.Status.Phase == corev1.PodRunning && p.DeletionTimestamp == nil {
310-
return p
301+
return p, nil
311302
}
312303
}
313304

314-
t.Fatal("no running operator pod found")
315-
return nil
305+
return nil, fmt.Errorf("no running operator pod found")
306+
}
307+
308+
// getOperatorContainerRestartCount returns the restart count for the operator
309+
// container. It calls t.Fatal if the pod or container cannot be found; use
310+
// operatorContainerRestartCount() instead when calling from inside poll loops.
311+
func getOperatorContainerRestartCount(t *testing.T) int32 {
312+
t.Helper()
313+
count, err := operatorContainerRestartCount()
314+
assert.NilError(t, err)
315+
return count
316+
}
317+
318+
// getRunningOperatorPod returns the running operator pod. It calls t.Fatal if
319+
// the pod cannot be found; use runningOperatorPod() instead when calling from
320+
// inside poll loops.
321+
func getRunningOperatorPod(t *testing.T) *corev1.Pod {
322+
t.Helper()
323+
pod, err := runningOperatorPod()
324+
assert.NilError(t, err)
325+
return pod
316326
}
317327

318328
// waitForOperatorContainerRestart polls until the operator process has
@@ -322,7 +332,6 @@ func getRunningOperatorPod(t *testing.T) *corev1.Pod {
322332
func waitForOperatorContainerRestart(t *testing.T, baselineRestarts int32) {
323333
t.Helper()
324334

325-
// Record the current pod UID so we can detect pod replacement.
326335
originalPod := getRunningOperatorPod(t)
327336
originalPodUID := originalPod.UID
328337

@@ -352,14 +361,12 @@ func waitForOperatorContainerRestart(t *testing.T, baselineRestarts int32) {
352361
if p.DeletionTimestamp != nil {
353362
continue
354363
}
355-
// Detect pod replacement (new pod UID).
356364
if p.UID != originalPodUID && p.Status.Phase == corev1.PodRunning {
357365
t.Logf("operator pod replaced: old UID=%s, new UID=%s", originalPodUID, p.UID)
358366
return true, nil
359367
}
360-
// Detect container restart within the same pod.
361368
for _, cs := range p.Status.ContainerStatuses {
362-
if cs.RestartCount > baselineRestarts {
369+
if cs.Name == operatorContainerName && cs.RestartCount > baselineRestarts {
363370
t.Logf("operator container restarted: restart count %d -> %d", baselineRestarts, cs.RestartCount)
364371
return true, nil
365372
}
@@ -381,7 +388,10 @@ func waitForStableRestartCount(t *testing.T) int32 {
381388
var stableSince time.Time
382389

383390
err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, 5*time.Minute, true, func(ctx context.Context) (bool, error) {
384-
current := getOperatorContainerRestartCount(t)
391+
current, restartErr := operatorContainerRestartCount()
392+
if restartErr != nil {
393+
return false, nil
394+
}
385395
if current != lastCount {
386396
lastCount = current
387397
stableSince = time.Now()
@@ -391,11 +401,12 @@ func waitForStableRestartCount(t *testing.T) int32 {
391401
return false, nil
392402
}
393403

394-
// Also verify the container has been running long enough for
395-
// the watcher to process its initial reconcile event.
396-
pod := getRunningOperatorPod(t)
404+
pod, podErr := runningOperatorPod()
405+
if podErr != nil {
406+
return false, nil
407+
}
397408
for _, cs := range pod.Status.ContainerStatuses {
398-
if cs.State.Running != nil {
409+
if cs.Name == operatorContainerName && cs.State.Running != nil {
399410
uptime := time.Since(cs.State.Running.StartedAt.Time)
400411
if uptime < 15*time.Second {
401412
t.Logf("operator container uptime %s < 15s, waiting longer", uptime.Round(time.Second))

0 commit comments

Comments (0)