Skip to content

Commit 25f4b52

Browse files
committed
test: add e2e tests for cluster-health-analyzer operand
Validate that the Monitoring UIPlugin with ClusterHealthAnalyzer enabled deploys the health-analyzer and correctly processes alerts into incident metrics. The test creates an always-firing PrometheusRule, waits for the corresponding alert to fire in Prometheus, then verifies the cluster_health_components_map metric is exposed with the expected labels. Also extends the framework's AssertPromQLResult with configurable timeout and poll interval options (AssertPromQLResultWithOptions). Made-with: Cursor
1 parent cbd6ba3 commit 25f4b52

File tree

4 files changed

+328
-3
lines changed

4 files changed

+328
-3
lines changed

test/e2e/framework/assertions.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,12 +422,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
422422
// It returns an error if the request fails. Otherwise the result is passed to
423423
// the callback function for additional checks.
424424
func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
	// Delegate to the options-aware variant with no option overrides, so the
	// default poll interval and wait timeout defined there apply.
	return f.AssertPromQLResultWithOptions(t, expr, callback)
}
427+
428+
// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
429+
// WithTimeout and WithPollInterval options to override the default polling
430+
// parameters.
431+
func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
425432
t.Helper()
433+
option := AssertOption{
434+
PollInterval: 20 * time.Second,
435+
WaitTimeout: 3 * DefaultTestTimeout,
436+
}
437+
for _, fn := range fns {
438+
fn(&option)
439+
}
426440
var (
427441
pollErr error
428442
v model.Value
429443
)
430-
if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
444+
if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
431445
v, pollErr = f.getPromQLResult(context.Background(), expr)
432446
if pollErr != nil {
433447
t.Logf("error from getPromQLResult(): %s", pollErr)

test/e2e/framework/framework.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414

1515
configv1 "github.com/openshift/api/config/v1"
1616
"github.com/pkg/errors"
17+
"golang.org/x/mod/semver"
1718
appsv1 "k8s.io/api/apps/v1"
1819
corev1 "k8s.io/api/core/v1"
1920
v1 "k8s.io/api/core/v1"
@@ -25,6 +26,7 @@ import (
2526
"k8s.io/client-go/rest"
2627
"k8s.io/client-go/tools/portforward"
2728
"k8s.io/client-go/transport/spdy"
29+
"k8s.io/utils/ptr"
2830
"sigs.k8s.io/controller-runtime/pkg/client"
2931
)
3032

@@ -257,3 +259,96 @@ func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) {
257259
}
258260
})
259261
}
262+
263+
// SkipIfClusterVersionBelow skips the test if the cluster version is below
264+
// minVersion. The minVersion string should be a semver-compatible version
265+
// (e.g. "4.19" or "v4.19").
266+
func (f *Framework) SkipIfClusterVersionBelow(t *testing.T, minVersion string) {
267+
t.Helper()
268+
cv := &configv1.ClusterVersion{}
269+
err := f.K8sClient.Get(t.Context(), client.ObjectKey{Name: "version"}, cv)
270+
if err != nil {
271+
t.Skipf("Skipping: unable to determine cluster version: %v", err)
272+
return
273+
}
274+
275+
actual := cv.Status.Desired.Version
276+
if actual == "" {
277+
t.Skip("Skipping: cluster version is empty")
278+
return
279+
}
280+
t.Logf("Detected cluster version: %s", actual)
281+
282+
if !strings.HasPrefix(actual, "v") {
283+
actual = "v" + actual
284+
}
285+
if !strings.HasPrefix(minVersion, "v") {
286+
minVersion = "v" + minVersion
287+
}
288+
289+
canonicalActual := fmt.Sprintf("%s-0", semver.Canonical(actual))
290+
canonicalMin := fmt.Sprintf("%s-0", semver.Canonical(minVersion))
291+
292+
if semver.Compare(canonicalActual, canonicalMin) < 0 {
293+
t.Skipf("Skipping: cluster version %s is below minimum required %s", cv.Status.Desired.Version, minVersion)
294+
}
295+
}
296+
297+
// DumpNamespaceDebug logs deployments (with conditions), pods (with container
298+
// statuses), and events for the given namespace. Useful as a t.Cleanup or
299+
// on-failure diagnostic helper.
300+
func (f *Framework) DumpNamespaceDebug(t *testing.T, namespace string) {
301+
t.Helper()
302+
ctx := t.Context()
303+
304+
t.Log("=== BEGIN DEBUG DUMP ===")
305+
defer t.Log("=== END DEBUG DUMP ===")
306+
307+
var deployments appsv1.DeploymentList
308+
if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(namespace)); err != nil {
309+
t.Logf("Failed to list deployments in %s: %v", namespace, err)
310+
} else {
311+
t.Logf("Deployments in namespace %s: %d", namespace, len(deployments.Items))
312+
for _, d := range deployments.Items {
313+
t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d",
314+
d.Name, ptr.Deref(d.Spec.Replicas, 0), d.Status.ReadyReplicas, d.Status.AvailableReplicas)
315+
for _, c := range d.Status.Conditions {
316+
t.Logf(" condition: type=%s status=%s reason=%s message=%s",
317+
c.Type, c.Status, c.Reason, c.Message)
318+
}
319+
}
320+
}
321+
322+
var pods corev1.PodList
323+
if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace)); err != nil {
324+
t.Logf("Failed to list pods in %s: %v", namespace, err)
325+
} else {
326+
t.Logf("Pods in namespace %s: %d", namespace, len(pods.Items))
327+
for _, p := range pods.Items {
328+
t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase)
329+
for _, cs := range p.Status.ContainerStatuses {
330+
switch {
331+
case cs.State.Running != nil:
332+
t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount)
333+
case cs.State.Waiting != nil:
334+
t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s",
335+
cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message)
336+
case cs.State.Terminated != nil:
337+
t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d",
338+
cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
339+
}
340+
}
341+
}
342+
}
343+
344+
var events corev1.EventList
345+
if err := f.K8sClient.List(ctx, &events, client.InNamespace(namespace)); err != nil {
346+
t.Logf("Failed to list events in %s: %v", namespace, err)
347+
} else {
348+
t.Logf("Events in namespace %s: %d", namespace, len(events.Items))
349+
for _, e := range events.Items {
350+
t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d",
351+
e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count)
352+
}
353+
}
354+
}
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
package e2e
2+
3+
import (
4+
"fmt"
5+
"strconv"
6+
"testing"
7+
"time"
8+
9+
monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
10+
"github.com/prometheus/common/model"
11+
"gotest.tools/v3/assert"
12+
appsv1 "k8s.io/api/apps/v1"
13+
"k8s.io/apimachinery/pkg/api/errors"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/util/intstr"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
17+
18+
uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
19+
"github.com/rhobs/observability-operator/test/e2e/framework"
20+
)
21+
22+
const (
	// healthAnalyzerDeploymentName is the deployment the operator creates when
	// the Monitoring UIPlugin has ClusterHealthAnalyzer enabled; the test
	// waits for it to become ready.
	healthAnalyzerDeploymentName = "health-analyzer"
	// prometheusRuleNamespace is where the test PrometheusRule is created —
	// presumably so the platform Prometheus loads it; see the rule's labels.
	prometheusRuleNamespace = "openshift-monitoring"
)
26+
27+
func clusterHealthAnalyzer(t *testing.T) {
28+
f.SkipIfClusterVersionBelow(t, "4.19")
29+
30+
testStart := time.Now()
31+
t.Logf("[TIMING] Test started at %s", testStart.Format(time.RFC3339))
32+
33+
err := monv1.AddToScheme(f.K8sClient.Scheme())
34+
assert.NilError(t, err, "failed to add monv1 to scheme")
35+
36+
stepStart := time.Now()
37+
plugin := newMonitoringUIPlugin(t)
38+
err = f.K8sClient.Create(t.Context(), plugin)
39+
assert.NilError(t, err, "failed to create monitoring UIPlugin")
40+
t.Logf("[TIMING] UIPlugin creation took %s", time.Since(stepStart))
41+
42+
t.Cleanup(func() {
43+
if t.Failed() {
44+
dumpClusterHealthAnalyzerDebug(t, plugin.Name)
45+
}
46+
})
47+
48+
stepStart = time.Now()
49+
t.Log("Waiting for health-analyzer deployment to become ready...")
50+
haDeployment := appsv1.Deployment{}
51+
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
52+
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
53+
t.Logf("[TIMING] health-analyzer deployment ready after %s (elapsed since test start: %s)", time.Since(stepStart), time.Since(testStart))
54+
55+
suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
56+
ruleName := "e2e-health-analyzer-" + suffix
57+
alertName := "E2EHealthAnalyzer" + suffix
58+
59+
stepStart = time.Now()
60+
rule := newAlwaysFiringRule(t, ruleName, alertName)
61+
err = f.K8sClient.Create(t.Context(), rule)
62+
assert.NilError(t, err, "failed to create PrometheusRule")
63+
t.Logf("[TIMING] PrometheusRule creation took %s", time.Since(stepStart))
64+
65+
stepStart = time.Now()
66+
t.Log("Waiting for alert to fire in Prometheus...")
67+
alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
68+
err = f.AssertPromQLResultWithOptions(t, alertQuery,
69+
func(v model.Value) error {
70+
vec, ok := v.(model.Vector)
71+
if !ok || len(vec) == 0 {
72+
return fmt.Errorf("expected firing alert, got: %v", v)
73+
}
74+
return nil
75+
},
76+
framework.WithPollInterval(30*time.Second),
77+
framework.WithTimeout(10*time.Minute),
78+
)
79+
assert.NilError(t, err, "alert %s never fired", alertName)
80+
t.Logf("[TIMING] Alert firing took %s (elapsed since test start: %s)", time.Since(stepStart), time.Since(testStart))
81+
82+
stepStart = time.Now()
83+
t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
84+
incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s",src_severity="warning"}`, alertName)
85+
err = f.AssertPromQLResultWithOptions(t, incidentQuery,
86+
func(v model.Value) error {
87+
vec, ok := v.(model.Vector)
88+
if !ok || len(vec) == 0 {
89+
return fmt.Errorf("expected incident metric, got: %v", v)
90+
}
91+
return nil
92+
},
93+
framework.WithPollInterval(30*time.Second),
94+
framework.WithTimeout(15*time.Minute),
95+
)
96+
assert.NilError(t, err, "incident metric for %s never appeared", alertName)
97+
t.Logf("[TIMING] Incident metric appeared after %s (elapsed since test start: %s)", time.Since(stepStart), time.Since(testStart))
98+
99+
t.Logf("[TIMING] Total test duration: %s", time.Since(testStart))
100+
}
101+
102+
func newMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
103+
plugin := &uiv1.UIPlugin{
104+
ObjectMeta: metav1.ObjectMeta{
105+
Name: "monitoring",
106+
},
107+
Spec: uiv1.UIPluginSpec{
108+
Type: uiv1.TypeMonitoring,
109+
Monitoring: &uiv1.MonitoringConfig{
110+
ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
111+
Enabled: true,
112+
},
113+
},
114+
},
115+
}
116+
117+
existing := &uiv1.UIPlugin{}
118+
err := f.K8sClient.Get(t.Context(), client.ObjectKey{Name: plugin.Name}, existing)
119+
if err == nil {
120+
t.Log("UIPlugin 'monitoring' already exists, deleting before recreation...")
121+
if err := f.K8sClient.Delete(t.Context(), existing); err != nil {
122+
t.Fatalf("failed to delete existing UIPlugin: %v", err)
123+
}
124+
waitForUIPluginDeletion(existing)
125+
} else if !errors.IsNotFound(err) {
126+
t.Fatalf("failed to check for existing UIPlugin: %v", err)
127+
}
128+
129+
f.CleanUp(t, func() {
130+
if err := f.K8sClient.Delete(t.Context(), plugin); err != nil && !errors.IsNotFound(err) {
131+
t.Logf("warning: failed to delete UIPlugin during cleanup: %v", err)
132+
}
133+
waitForUIPluginDeletion(plugin)
134+
})
135+
return plugin
136+
}
137+
138+
func newAlwaysFiringRule(t *testing.T, ruleName, alertName string) *monv1.PrometheusRule {
139+
rule := &monv1.PrometheusRule{
140+
ObjectMeta: metav1.ObjectMeta{
141+
Name: ruleName,
142+
Namespace: prometheusRuleNamespace,
143+
Labels: map[string]string{
144+
"app.kubernetes.io/name": "kube-prometheus",
145+
"app.kubernetes.io/part-of": "openshift-monitoring",
146+
"prometheus": "k8s",
147+
"role": "alert-rules",
148+
},
149+
},
150+
Spec: monv1.PrometheusRuleSpec{
151+
Groups: []monv1.RuleGroup{{
152+
Name: "health-analyzer-test-" + ruleName,
153+
Rules: []monv1.Rule{{
154+
Alert: alertName,
155+
Expr: intstr.FromString(`vector(1)`),
156+
Labels: map[string]string{"severity": "warning"},
157+
Annotations: map[string]string{
158+
"summary": "E2E static test alert for cluster health analyzer.",
159+
},
160+
}},
161+
}},
162+
},
163+
}
164+
f.CleanUp(t, func() {
165+
if err := f.K8sClient.Delete(t.Context(), rule); err != nil && !errors.IsNotFound(err) {
166+
t.Logf("warning: failed to delete PrometheusRule during cleanup: %v", err)
167+
}
168+
})
169+
return rule
170+
}
171+
172+
func dumpClusterHealthAnalyzerDebug(t *testing.T, pluginName string) {
173+
t.Helper()
174+
ctx := t.Context()
175+
176+
// UIPlugin-specific diagnostics
177+
var plugin uiv1.UIPlugin
178+
if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil {
179+
t.Logf("Failed to get UIPlugin %q: %v", pluginName, err)
180+
} else {
181+
t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion)
182+
t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type)
183+
if plugin.Spec.Monitoring != nil {
184+
if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil {
185+
t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled)
186+
}
187+
if plugin.Spec.Monitoring.Incidents != nil {
188+
t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled)
189+
}
190+
}
191+
if len(plugin.Status.Conditions) == 0 {
192+
t.Log("UIPlugin has no status conditions")
193+
}
194+
for _, c := range plugin.Status.Conditions {
195+
t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message)
196+
}
197+
}
198+
199+
var plugins uiv1.UIPluginList
200+
if err := f.K8sClient.List(ctx, &plugins); err != nil {
201+
t.Logf("Failed to list UIPlugins: %v", err)
202+
} else {
203+
t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items))
204+
for _, p := range plugins.Items {
205+
t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions))
206+
}
207+
}
208+
209+
// Generic namespace diagnostics (deployments, pods, events)
210+
f.DumpNamespaceDebug(t, uiPluginInstallNS)
211+
}
212+

test/e2e/uiplugin_test.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ func TestUIPlugin(t *testing.T) {
3434
name: "Create dashboards UIPlugin",
3535
scenario: dashboardsUIPlugin,
3636
},
37+
{
38+
name: "Cluster health analyzer",
39+
scenario: clusterHealthAnalyzer,
40+
},
3741
}
3842

3943
for _, tc := range ts {
@@ -63,13 +67,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin {
6367
}
6468
f.CleanUp(t, func() {
6569
f.K8sClient.Delete(context.Background(), db)
66-
waitForDBUIPluginDeletion(db)
70+
waitForUIPluginDeletion(db)
6771
})
6872

6973
return db
7074
}
7175

72-
func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error {
76+
func waitForUIPluginDeletion(db *uiv1.UIPlugin) error {
7377
return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
7478
err = f.K8sClient.Get(context.Background(),
7579
client.ObjectKey{Name: db.Name},

0 commit comments

Comments
 (0)