@@ -61,12 +61,10 @@ func TestTLSProfileWatcher(t *testing.T) {
6161// running and healthy when the cluster uses the default TLS profile (no
6262// tlsSecurityProfile set on the APIServer CR, which defaults to Intermediate).
6363func assertOperatorRunningWithDefaultTLSProfile (t * testing.T ) {
64- // Verify the APIServer CR exists and is readable.
6564 apiServer := & configv1.APIServer {}
6665 err := f .K8sClient .Get (context .Background (), types.NamespacedName {Name : "cluster" }, apiServer )
6766 assert .NilError (t , err , "failed to get APIServer CR" )
6867
69- // Verify the operator deployment is ready.
7068 f .AssertDeploymentReady (operatorDeploymentName , f .OperatorNamespace ,
7169 framework .WithTimeout (2 * time .Minute ))(t )
7270
@@ -78,36 +76,29 @@ func assertOperatorRunningWithDefaultTLSProfile(t *testing.T) {
7876func assertOperatorRestartsOnTLSProfileChange (t * testing.T ) {
7977 ctx := context .Background ()
8078
81- // Save original APIServer spec so we can restore it.
8279 apiServer := & configv1.APIServer {}
8380 err := f .K8sClient .Get (ctx , types.NamespacedName {Name : "cluster" }, apiServer )
8481 assert .NilError (t , err , "failed to get APIServer CR" )
8582 originalTLSProfile := apiServer .Spec .TLSSecurityProfile
8683
87- // Ensure we restore the original profile even if the test fails.
8884 f .CleanUp (t , func () {
8985 restoreTLSProfile (t , originalTLSProfile )
9086 })
9187
92- // Wait for the operator to be stable before we start.
9388 f .AssertDeploymentReady (operatorDeploymentName , f .OperatorNamespace ,
9489 framework .WithTimeout (2 * time .Minute ))(t )
9590
96- // Record the current container restart count.
9791 initialRestarts := getOperatorContainerRestartCount (t )
9892 t .Logf ("container restart count before TLS change: %d" , initialRestarts )
9993
100- // Change TLS profile to Old.
10194 t .Log ("setting TLS profile to Old" )
10295 setTLSProfile (t , & configv1.TLSSecurityProfile {
10396 Type : configv1 .TLSProfileOldType ,
10497 Old : & configv1.OldTLSProfile {},
10598 })
10699
107- // The operator should restart: wait for the restart count to increase.
108100 waitForOperatorContainerRestart (t , initialRestarts )
109101
110- // After restart, the operator should become ready again.
111102 f .AssertDeploymentReady (operatorDeploymentName , f .OperatorNamespace ,
112103 framework .WithTimeout (5 * time .Minute ))(t )
113104 t .Log ("operator restarted and is ready after TLS profile change to Old" )
@@ -131,12 +122,8 @@ func assertOperatorRestartsOnCustomTLSProfile(t *testing.T) {
131122 f .AssertDeploymentReady (operatorDeploymentName , f .OperatorNamespace ,
132123 framework .WithTimeout (5 * time .Minute ))(t )
133124
134- // Wait for the restart count to stabilize — the previous test's cleanup
135- // may have triggered a restart that hasn't completed yet.
136125 initialRestarts := waitForStableRestartCount (t )
137126
138- // Set a Custom TLS profile that differs from the default Intermediate
139- // profile by using a smaller cipher list (a subset of Intermediate's ciphers).
140127 t .Log ("setting TLS profile to Custom" )
141128 setTLSProfile (t , & configv1.TLSSecurityProfile {
142129 Type : configv1 .TLSProfileCustomType ,
@@ -173,7 +160,6 @@ func assertOperatorStableOnNonTLSChange(t *testing.T) {
173160 framework .WithTimeout (5 * time .Minute ))(t )
174161 initialRestarts := waitForStableRestartCount (t )
175162
176- // Update a non-TLS field: add a test annotation.
177163 const testAnnotation = "observability-operator.rhobs/e2e-tls-test"
178164 apiServer := & configv1.APIServer {}
179165 err := f .K8sClient .Get (ctx , types.NamespacedName {Name : "cluster" }, apiServer )
@@ -201,22 +187,22 @@ func assertOperatorStableOnNonTLSChange(t *testing.T) {
201187 assert .NilError (t , err , "failed to patch APIServer with test annotation" )
202188 t .Log ("added test annotation to APIServer CR" )
203189
204- // Wait a reasonable period and verify the restart count has not increased.
205190 t .Log ("waiting 60s to confirm operator does not restart" )
206191 err = wait .PollUntilContextTimeout (ctx , 10 * time .Second , 60 * time .Second , true , func (ctx context.Context ) (bool , error ) {
207- currentRestarts := getOperatorContainerRestartCount (t )
192+ currentRestarts , restartErr := operatorContainerRestartCount ()
193+ if restartErr != nil {
194+ return false , nil
195+ }
208196 if currentRestarts > initialRestarts {
209197 return true , fmt .Errorf ("operator restarted unexpectedly: restart count changed from %d to %d" , initialRestarts , currentRestarts )
210198 }
211- return false , nil // keep polling — we want this to NOT match (timeout = success)
199+ return false , nil
212200 })
213201
214202 if wait .Interrupted (err ) {
215- // Timeout means the restart count never changed — that's what we want.
216203 t .Log ("operator remained stable after non-TLS APIServer change" )
217204 return
218205 }
219- // If we get here, it means the restart count increased — fail.
220206 assert .NilError (t , err , "operator should not restart on non-TLS APIServer changes" )
221207}
222208
@@ -254,7 +240,6 @@ func restoreTLSProfile(t *testing.T, profile *configv1.TLSSecurityProfile) {
254240 return
255241 }
256242
257- // Wait for operator to recover after the restore.
258243 _ = wait .PollUntilContextTimeout (ctx , 5 * time .Second , 5 * time .Minute , true , func (ctx context.Context ) (bool , error ) {
259244 dep := & appsv1.Deployment {}
260245 if err := f .K8sClient .Get (ctx , types.NamespacedName {
@@ -267,52 +252,77 @@ func restoreTLSProfile(t *testing.T, profile *configv1.TLSSecurityProfile) {
267252 })
268253}
269254
270- // getOperatorContainerRestartCount returns the total restart count of the
271- // operator container in the running operator pod.
272- func getOperatorContainerRestartCount (t * testing.T ) int32 {
273- t .Helper ()
274-
275- pod := getRunningOperatorPod (t )
255+ // operatorContainerRestartCount returns the restart count for the operator
256+ // container, or an error if the pod or container cannot be found. This is
257+ // safe to call inside poll loops since it does not call t.Fatal.
258+ func operatorContainerRestartCount () (int32 , error ) {
259+ pod , err := runningOperatorPod ()
260+ if err != nil {
261+ return 0 , err
262+ }
276263 for _ , cs := range pod .Status .ContainerStatuses {
277264 if cs .Name == operatorContainerName {
278- return cs .RestartCount
265+ return cs .RestartCount , nil
279266 }
280267 }
281-
282- t .Fatalf ("container %q not found in operator pod" , operatorContainerName )
283- return 0
268+ return 0 , fmt .Errorf ("container %q not found in operator pod" , operatorContainerName )
284269}
285270
286- // getRunningOperatorPod returns the running operator pod.
287- func getRunningOperatorPod (t * testing.T ) * corev1.Pod {
288- t .Helper ()
271+ // runningOperatorPod returns the running operator pod, or an error if it
272+ // cannot be found. This is safe to call inside poll loops since it does
273+ // not call t.Fatal.
274+ func runningOperatorPod () (* corev1.Pod , error ) {
275+ ctx := context .Background ()
289276
290277 dep := & appsv1.Deployment {}
291- err := f .K8sClient .Get (context . Background () , types.NamespacedName {
278+ if err := f .K8sClient .Get (ctx , types.NamespacedName {
292279 Name : operatorDeploymentName ,
293280 Namespace : f .OperatorNamespace ,
294- }, dep )
295- assert .NilError (t , err , "failed to get operator deployment" )
281+ }, dep ); err != nil {
282+ return nil , fmt .Errorf ("failed to get operator deployment: %w" , err )
283+ }
296284
297285 selector , err := metav1 .LabelSelectorAsSelector (dep .Spec .Selector )
298- assert .NilError (t , err , "failed to parse deployment selector" )
286+ if err != nil {
287+ return nil , fmt .Errorf ("failed to parse deployment selector: %w" , err )
288+ }
299289
300290 var pods corev1.PodList
301- err = f .K8sClient .List (context . Background () , & pods ,
291+ if err : = f .K8sClient .List (ctx , & pods ,
302292 client .InNamespace (f .OperatorNamespace ),
303293 client.MatchingLabelsSelector {Selector : selector },
304- )
305- assert .NilError (t , err , "failed to list operator pods" )
294+ ); err != nil {
295+ return nil , fmt .Errorf ("failed to list operator pods: %w" , err )
296+ }
306297
307298 for i := range pods .Items {
308299 p := & pods .Items [i ]
309300 if p .Status .Phase == corev1 .PodRunning && p .DeletionTimestamp == nil {
310- return p
301+ return p , nil
311302 }
312303 }
313304
314- t .Fatal ("no running operator pod found" )
315- return nil
305+ return nil , fmt .Errorf ("no running operator pod found" )
306+ }
307+
308+ // getOperatorContainerRestartCount returns the restart count for the operator
309+ // container. It calls t.Fatal if the pod or container cannot be found; use
310+ // operatorContainerRestartCount() instead when calling from inside poll loops.
311+ func getOperatorContainerRestartCount (t * testing.T ) int32 {
312+ t .Helper ()
313+ count , err := operatorContainerRestartCount ()
314+ assert .NilError (t , err )
315+ return count
316+ }
317+
318+ // getRunningOperatorPod returns the running operator pod. It calls t.Fatal if
319+ // the pod cannot be found; use runningOperatorPod() instead when calling from
320+ // inside poll loops.
321+ func getRunningOperatorPod (t * testing.T ) * corev1.Pod {
322+ t .Helper ()
323+ pod , err := runningOperatorPod ()
324+ assert .NilError (t , err )
325+ return pod
316326}
317327
318328// waitForOperatorContainerRestart polls until the operator process has
@@ -322,7 +332,6 @@ func getRunningOperatorPod(t *testing.T) *corev1.Pod {
322332func waitForOperatorContainerRestart (t * testing.T , baselineRestarts int32 ) {
323333 t .Helper ()
324334
325- // Record the current pod UID so we can detect pod replacement.
326335 originalPod := getRunningOperatorPod (t )
327336 originalPodUID := originalPod .UID
328337
@@ -352,14 +361,12 @@ func waitForOperatorContainerRestart(t *testing.T, baselineRestarts int32) {
352361 if p .DeletionTimestamp != nil {
353362 continue
354363 }
355- // Detect pod replacement (new pod UID).
356364 if p .UID != originalPodUID && p .Status .Phase == corev1 .PodRunning {
357365 t .Logf ("operator pod replaced: old UID=%s, new UID=%s" , originalPodUID , p .UID )
358366 return true , nil
359367 }
360- // Detect container restart within the same pod.
361368 for _ , cs := range p .Status .ContainerStatuses {
362- if cs .RestartCount > baselineRestarts {
369+ if cs .Name == operatorContainerName && cs . RestartCount > baselineRestarts {
363370 t .Logf ("operator container restarted: restart count %d -> %d" , baselineRestarts , cs .RestartCount )
364371 return true , nil
365372 }
@@ -381,7 +388,10 @@ func waitForStableRestartCount(t *testing.T) int32 {
381388 var stableSince time.Time
382389
383390 err := wait .PollUntilContextTimeout (context .Background (), 5 * time .Second , 5 * time .Minute , true , func (ctx context.Context ) (bool , error ) {
384- current := getOperatorContainerRestartCount (t )
391+ current , restartErr := operatorContainerRestartCount ()
392+ if restartErr != nil {
393+ return false , nil
394+ }
385395 if current != lastCount {
386396 lastCount = current
387397 stableSince = time .Now ()
@@ -391,11 +401,12 @@ func waitForStableRestartCount(t *testing.T) int32 {
391401 return false , nil
392402 }
393403
394- // Also verify the container has been running long enough for
395- // the watcher to process its initial reconcile event.
396- pod := getRunningOperatorPod (t )
404+ pod , podErr := runningOperatorPod ()
405+ if podErr != nil {
406+ return false , nil
407+ }
397408 for _ , cs := range pod .Status .ContainerStatuses {
398- if cs .State .Running != nil {
409+ if cs .Name == operatorContainerName && cs . State .Running != nil {
399410 uptime := time .Since (cs .State .Running .StartedAt .Time )
400411 if uptime < 15 * time .Second {
401412 t .Logf ("operator container uptime %s < 15s, waiting longer" , uptime .Round (time .Second ))
0 commit comments