Skip to content

Commit 25f4b52

Browse files
committed
test: add e2e tests for cluster-health-analyzer operand
Validate that the Monitoring UIPlugin with ClusterHealthAnalyzer enabled deploys the health-analyzer and correctly processes alerts into incident metrics. The test creates an always-firing PrometheusRule, waits for the corresponding alert to fire in Prometheus, then verifies the cluster_health_components_map metric is exposed with the expected labels. Also extends the framework's AssertPromQLResult with configurable timeout and poll interval options (AssertPromQLResultWithOptions). Made-with: Cursor
1 parent cbd6ba3 commit 25f4b52

File tree

4 files changed

+328
-3
lines changed

4 files changed

+328
-3
lines changed

test/e2e/framework/assertions.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,12 +422,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
422422
// It returns an error if the request fails. Otherwise the result is passed to
423423
// the callback function for additional checks.
424424
func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
	// Delegate to the options-aware variant with no option overrides, so the
	// default poll interval and wait timeout defined there apply.
	return f.AssertPromQLResultWithOptions(t, expr, callback)
}
427+
428+
// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
429+
// WithTimeout and WithPollInterval options to override the default polling
430+
// parameters.
431+
func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
425432
t.Helper()
433+
option := AssertOption{
434+
PollInterval: 20 * time.Second,
435+
WaitTimeout: 3 * DefaultTestTimeout,
436+
}
437+
for _, fn := range fns {
438+
fn(&option)
439+
}
426440
var (
427441
pollErr error
428442
v model.Value
429443
)
430-
if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
444+
if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
431445
v, pollErr = f.getPromQLResult(context.Background(), expr)
432446
if pollErr != nil {
433447
t.Logf("error from getPromQLResult(): %s", pollErr)

test/e2e/framework/framework.go

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414

1515
configv1 "github.com/openshift/api/config/v1"
1616
"github.com/pkg/errors"
17+
"golang.org/x/mod/semver"
1718
appsv1 "k8s.io/api/apps/v1"
1819
corev1 "k8s.io/api/core/v1"
1920
v1 "k8s.io/api/core/v1"
@@ -25,6 +26,7 @@ import (
2526
"k8s.io/client-go/rest"
2627
"k8s.io/client-go/tools/portforward"
2728
"k8s.io/client-go/transport/spdy"
29+
"k8s.io/utils/ptr"
2830
"sigs.k8s.io/controller-runtime/pkg/client"
2931
)
3032

@@ -257,3 +259,96 @@ func (f *Framework) CleanUp(t *testing.T, cleanupFunc func()) {
257259
}
258260
})
259261
}
262+
263+
// SkipIfClusterVersionBelow skips the test if the cluster version is below
264+
// minVersion. The minVersion string should be a semver-compatible version
265+
// (e.g. "4.19" or "v4.19").
266+
func (f *Framework) SkipIfClusterVersionBelow(t *testing.T, minVersion string) {
267+
t.Helper()
268+
cv := &configv1.ClusterVersion{}
269+
err := f.K8sClient.Get(t.Context(), client.ObjectKey{Name: "version"}, cv)
270+
if err != nil {
271+
t.Skipf("Skipping: unable to determine cluster version: %v", err)
272+
return
273+
}
274+
275+
actual := cv.Status.Desired.Version
276+
if actual == "" {
277+
t.Skip("Skipping: cluster version is empty")
278+
return
279+
}
280+
t.Logf("Detected cluster version: %s", actual)
281+
282+
if !strings.HasPrefix(actual, "v") {
283+
actual = "v" + actual
284+
}
285+
if !strings.HasPrefix(minVersion, "v") {
286+
minVersion = "v" + minVersion
287+
}
288+
289+
canonicalActual := fmt.Sprintf("%s-0", semver.Canonical(actual))
290+
canonicalMin := fmt.Sprintf("%s-0", semver.Canonical(minVersion))
291+
292+
if semver.Compare(canonicalActual, canonicalMin) < 0 {
293+
t.Skipf("Skipping: cluster version %s is below minimum required %s", cv.Status.Desired.Version, minVersion)
294+
}
295+
}
296+
297+
// DumpNamespaceDebug logs deployments (with conditions), pods (with container
298+
// statuses), and events for the given namespace. Useful as a t.Cleanup or
299+
// on-failure diagnostic helper.
300+
func (f *Framework) DumpNamespaceDebug(t *testing.T, namespace string) {
301+
t.Helper()
302+
ctx := t.Context()
303+
304+
t.Log("=== BEGIN DEBUG DUMP ===")
305+
defer t.Log("=== END DEBUG DUMP ===")
306+
307+
var deployments appsv1.DeploymentList
308+
if err := f.K8sClient.List(ctx, &deployments, client.InNamespace(namespace)); err != nil {
309+
t.Logf("Failed to list deployments in %s: %v", namespace, err)
310+
} else {
311+
t.Logf("Deployments in namespace %s: %d", namespace, len(deployments.Items))
312+
for _, d := range deployments.Items {
313+
t.Logf(" Deployment: name=%s replicas=%d readyReplicas=%d availableReplicas=%d",
314+
d.Name, ptr.Deref(d.Spec.Replicas, 0), d.Status.ReadyReplicas, d.Status.AvailableReplicas)
315+
for _, c := range d.Status.Conditions {
316+
t.Logf(" condition: type=%s status=%s reason=%s message=%s",
317+
c.Type, c.Status, c.Reason, c.Message)
318+
}
319+
}
320+
}
321+
322+
var pods corev1.PodList
323+
if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace)); err != nil {
324+
t.Logf("Failed to list pods in %s: %v", namespace, err)
325+
} else {
326+
t.Logf("Pods in namespace %s: %d", namespace, len(pods.Items))
327+
for _, p := range pods.Items {
328+
t.Logf(" Pod: name=%s phase=%s", p.Name, p.Status.Phase)
329+
for _, cs := range p.Status.ContainerStatuses {
330+
switch {
331+
case cs.State.Running != nil:
332+
t.Logf(" container=%s ready=%v restarts=%d state=Running", cs.Name, cs.Ready, cs.RestartCount)
333+
case cs.State.Waiting != nil:
334+
t.Logf(" container=%s ready=%v restarts=%d state=Waiting reason=%s message=%s",
335+
cs.Name, cs.Ready, cs.RestartCount, cs.State.Waiting.Reason, cs.State.Waiting.Message)
336+
case cs.State.Terminated != nil:
337+
t.Logf(" container=%s ready=%v restarts=%d state=Terminated reason=%s exitCode=%d",
338+
cs.Name, cs.Ready, cs.RestartCount, cs.State.Terminated.Reason, cs.State.Terminated.ExitCode)
339+
}
340+
}
341+
}
342+
}
343+
344+
var events corev1.EventList
345+
if err := f.K8sClient.List(ctx, &events, client.InNamespace(namespace)); err != nil {
346+
t.Logf("Failed to list events in %s: %v", namespace, err)
347+
} else {
348+
t.Logf("Events in namespace %s: %d", namespace, len(events.Items))
349+
for _, e := range events.Items {
350+
t.Logf(" Event: involvedObject=%s/%s reason=%s message=%s type=%s count=%d",
351+
e.InvolvedObject.Kind, e.InvolvedObject.Name, e.Reason, e.Message, e.Type, e.Count)
352+
}
353+
}
354+
}
Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
package e2e
2+
3+
import (
4+
"fmt"
5+
"strconv"
6+
"testing"
7+
"time"
8+
9+
monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
10+
"github.com/prometheus/common/model"
11+
"gotest.tools/v3/assert"
12+
appsv1 "k8s.io/api/apps/v1"
13+
"k8s.io/apimachinery/pkg/api/errors"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/util/intstr"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
17+
18+
uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
19+
"github.com/rhobs/observability-operator/test/e2e/framework"
20+
)
21+
22+
const (
	// healthAnalyzerDeploymentName is the deployment the operator creates when
	// the Monitoring UIPlugin has ClusterHealthAnalyzer enabled; the test
	// waits for it to become ready.
	healthAnalyzerDeploymentName = "health-analyzer"
	// prometheusRuleNamespace is where the test PrometheusRule is created —
	// presumably so the platform Prometheus loads it; see the rule's labels.
	prometheusRuleNamespace = "openshift-monitoring"
)
26+
27+
func clusterHealthAnalyzer(t *testing.T) {
28+
f.SkipIfClusterVersionBelow(t, "4.19")
29+
30+
testStart := time.Now()
31+
t.Logf("[TIMING] Test started at %s", testStart.Format(time.RFC3339))
32+
33+
err := monv1.AddToScheme(f.K8sClient.Scheme())
34+
assert.NilError(t, err, "failed to add monv1 to scheme")
35+
36+
stepStart := time.Now()
37+
plugin := newMonitoringUIPlugin(t)
38+
err = f.K8sClient.Create(t.Context(), plugin)
39+
assert.NilError(t, err, "failed to create monitoring UIPlugin")
40+
t.Logf("[TIMING] UIPlugin creation took %s", time.Since(stepStart))
41+
42+
t.Cleanup(func() {
43+
if t.Failed() {
44+
dumpClusterHealthAnalyzerDebug(t, plugin.Name)
45+
}
46+
})
47+
48+
stepStart = time.Now()
49+
t.Log("Waiting for health-analyzer deployment to become ready...")
50+
haDeployment := appsv1.Deployment{}
51+
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
52+
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
53+
t.Logf("[TIMING] health-analyzer deployment ready after %s (elapsed since test start: %s)", time.Since(stepStart), time.Since(testStart))
54+
55+
suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
56+
ruleName := "e2e-health-analyzer-" + suffix
57+
alertName := "E2EHealthAnalyzer" + suffix
58+
59+
stepStart = time.Now()
60+
rule := newAlwaysFiringRule(t, ruleName, alertName)
61+
err = f.K8sClient.Create(t.Context(), rule)
62+
assert.NilError(t, err, "failed to create PrometheusRule")
63+
t.Logf("[TIMING] PrometheusRule creation took %s", time.Since(stepStart))
64+
65+
stepStart = time.Now()
66+
t.Log("Waiting for alert to fire in Prometheus...")
67+
alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
68+
err = f.AssertPromQLResultWithOptions(t, alertQuery,
69+
func(v model.Value) error {
70+
vec, ok := v.(model.Vector)
71+
if !ok || len(vec) == 0 {
72+
return fmt.Errorf("expected firing alert, got: %v", v)
73+
}
74+
return nil
75+
},
76+
framework.WithPollInterval(30*time.Second),
77+
framework.WithTimeout(10*time.Minute),
78+
)
79+
assert.NilError(t, err, "alert %s never fired", alertName)
80+
t.Logf("[TIMING] Alert firing took %s (elapsed since test start: %s)", time.Since(stepStart), time.Since(testStart))
81+
82+
stepStart = time.Now()
83+
t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
84+
incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s",src_severity="warning"}`, alertName)
85+
err = f.AssertPromQLResultWithOptions(t, incidentQuery,
86+
func(v model.Value) error {
87+
vec, ok := v.(model.Vector)
88+
if !ok || len(vec) == 0 {
89+
return fmt.Errorf("expected incident metric, got: %v", v)
90+
}
91+
return nil
92+
},
93+
framework.WithPollInterval(30*time.Second),
94+
framework.WithTimeout(15*time.Minute),
95+
)
96+
assert.NilError(t, err, "incident metric for %s never appeared", alertName)
97+
t.Logf("[TIMING] Incident metric appeared after %s (elapsed since test start: %s)", time.Since(stepStart), time.Since(testStart))
98+
99+
t.Logf("[TIMING] Total test duration: %s", time.Since(testStart))
100+
}
101+
102+
func newMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
103+
plugin := &uiv1.UIPlugin{
104+
ObjectMeta: metav1.ObjectMeta{
105+
Name: "monitoring",
106+
},
107+
Spec: uiv1.UIPluginSpec{
108+
Type: uiv1.TypeMonitoring,
109+
Monitoring: &uiv1.MonitoringConfig{
110+
ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
111+
Enabled: true,
112+
},
113+
},
114+
},
115+
}
116+
117+
existing := &uiv1.UIPlugin{}
118+
err := f.K8sClient.Get(t.Context(), client.ObjectKey{Name: plugin.Name}, existing)
119+
if err == nil {
120+
t.Log("UIPlugin 'monitoring' already exists, deleting before recreation...")
121+
if err := f.K8sClient.Delete(t.Context(), existing); err != nil {
122+
t.Fatalf("failed to delete existing UIPlugin: %v", err)
123+
}
124+
waitForUIPluginDeletion(existing)
125+
} else if !errors.IsNotFound(err) {
126+
t.Fatalf("failed to check for existing UIPlugin: %v", err)
127+
}
128+
129+
f.CleanUp(t, func() {
130+
if err := f.K8sClient.Delete(t.Context(), plugin); err != nil && !errors.IsNotFound(err) {
131+
t.Logf("warning: failed to delete UIPlugin during cleanup: %v", err)
132+
}
133+
waitForUIPluginDeletion(plugin)
134+
})
135+
return plugin
136+
}
137+
138+
func newAlwaysFiringRule(t *testing.T, ruleName, alertName string) *monv1.PrometheusRule {
139+
rule := &monv1.PrometheusRule{
140+
ObjectMeta: metav1.ObjectMeta{
141+
Name: ruleName,
142+
Namespace: prometheusRuleNamespace,
143+
Labels: map[string]string{
144+
"app.kubernetes.io/name": "kube-prometheus",
145+
"app.kubernetes.io/part-of": "openshift-monitoring",
146+
"prometheus": "k8s",
147+
"role": "alert-rules",
148+
},
149+
},
150+
Spec: monv1.PrometheusRuleSpec{
151+
Groups: []monv1.RuleGroup{{
152+
Name: "health-analyzer-test-" + ruleName,
153+
Rules: []monv1.Rule{{
154+
Alert: alertName,
155+
Expr: intstr.FromString(`vector(1)`),
156+
Labels: map[string]string{"severity": "warning"},
157+
Annotations: map[string]string{
158+
"summary": "E2E static test alert for cluster health analyzer.",
159+
},
160+
}},
161+
}},
162+
},
163+
}
164+
f.CleanUp(t, func() {
165+
if err := f.K8sClient.Delete(t.Context(), rule); err != nil && !errors.IsNotFound(err) {
166+
t.Logf("warning: failed to delete PrometheusRule during cleanup: %v", err)
167+
}
168+
})
169+
return rule
170+
}
171+
172+
func dumpClusterHealthAnalyzerDebug(t *testing.T, pluginName string) {
173+
t.Helper()
174+
ctx := t.Context()
175+
176+
// UIPlugin-specific diagnostics
177+
var plugin uiv1.UIPlugin
178+
if err := f.K8sClient.Get(ctx, client.ObjectKey{Name: pluginName}, &plugin); err != nil {
179+
t.Logf("Failed to get UIPlugin %q: %v", pluginName, err)
180+
} else {
181+
t.Logf("UIPlugin %q generation=%d, resourceVersion=%s", pluginName, plugin.Generation, plugin.ResourceVersion)
182+
t.Logf("UIPlugin spec.type=%s", plugin.Spec.Type)
183+
if plugin.Spec.Monitoring != nil {
184+
if plugin.Spec.Monitoring.ClusterHealthAnalyzer != nil {
185+
t.Logf("UIPlugin spec.monitoring.clusterHealthAnalyzer.enabled=%v", plugin.Spec.Monitoring.ClusterHealthAnalyzer.Enabled)
186+
}
187+
if plugin.Spec.Monitoring.Incidents != nil {
188+
t.Logf("UIPlugin spec.monitoring.incidents.enabled=%v", plugin.Spec.Monitoring.Incidents.Enabled)
189+
}
190+
}
191+
if len(plugin.Status.Conditions) == 0 {
192+
t.Log("UIPlugin has no status conditions")
193+
}
194+
for _, c := range plugin.Status.Conditions {
195+
t.Logf("UIPlugin condition: type=%s status=%s reason=%s message=%s", c.Type, c.Status, c.Reason, c.Message)
196+
}
197+
}
198+
199+
var plugins uiv1.UIPluginList
200+
if err := f.K8sClient.List(ctx, &plugins); err != nil {
201+
t.Logf("Failed to list UIPlugins: %v", err)
202+
} else {
203+
t.Logf("Total UIPlugins in cluster: %d", len(plugins.Items))
204+
for _, p := range plugins.Items {
205+
t.Logf(" UIPlugin: name=%s type=%s conditions=%d", p.Name, p.Spec.Type, len(p.Status.Conditions))
206+
}
207+
}
208+
209+
// Generic namespace diagnostics (deployments, pods, events)
210+
f.DumpNamespaceDebug(t, uiPluginInstallNS)
211+
}
212+

test/e2e/uiplugin_test.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ func TestUIPlugin(t *testing.T) {
3434
name: "Create dashboards UIPlugin",
3535
scenario: dashboardsUIPlugin,
3636
},
37+
{
38+
name: "Cluster health analyzer",
39+
scenario: clusterHealthAnalyzer,
40+
},
3741
}
3842

3943
for _, tc := range ts {
@@ -63,13 +67,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin {
6367
}
6468
f.CleanUp(t, func() {
6569
f.K8sClient.Delete(context.Background(), db)
66-
waitForDBUIPluginDeletion(db)
70+
waitForUIPluginDeletion(db)
6771
})
6872

6973
return db
7074
}
7175

72-
func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error {
76+
func waitForUIPluginDeletion(db *uiv1.UIPlugin) error {
7377
return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
7478
err = f.K8sClient.Get(context.Background(),
7579
client.ObjectKey{Name: db.Name},

0 commit comments

Comments
 (0)