Skip to content

Commit 6fa1c36

Browse files
TRACING-6127: feat: add span_kind filter, show p95 latency instead of avg operation duration (#1044)
* feat: add span_kind filter, show p95 latency instead of avg operation duration
* add Span Kind variable to filter by span kind (defaults to SPAN_KIND_SERVER to avoid double-counting)
* rename "Duration" to "Latency", and use P95 histogram quantile instead of average operation duration
* update error rate unit
Signed-off-by: Andreas Gerstmayr <agerstmayr@redhat.com>
* revert error rate unit
Signed-off-by: Andreas Gerstmayr <agerstmayr@redhat.com>
---------
Signed-off-by: Andreas Gerstmayr <agerstmayr@redhat.com>
1 parent 0319ab7 commit 6fa1c36

File tree

1 file changed

+20
-6
lines changed
  • pkg/controllers/uiplugin

1 file changed

+20
-6
lines changed

pkg/controllers/uiplugin/apm.go

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ func withServiceMetrics(variableMatchers string) dashboard.Option {
4040
),
4141
),
4242
),
43-
panelgroup.AddPanel("Duration",
43+
panelgroup.AddPanel("Latency",
4444
timeseries.Chart(
4545
timeseries.WithYAxis(timeseries.YAxis{
4646
Format: &common.Format{
@@ -108,7 +108,7 @@ func withOperationMetrics(variableMatchers string) dashboard.Option {
108108
},
109109
{
110110
Name: "value #3",
111-
Header: "Duration",
111+
Header: "P95 Latency",
112112
Format: &common.Format{
113113
Unit: ptr.To(string(common.MilliSecondsUnit)),
114114
DecimalPlaces: 3,
@@ -123,7 +123,7 @@ func withOperationMetrics(variableMatchers string) dashboard.Option {
123123
panel.AddQuery(
124124
query.PromQL(
125125
fmt.Sprintf(`sum(rate({__name__=~"traces_span_metrics_calls(_total)?", %s}[$__rate_interval])) by (span_name) > 0`, variableMatchers),
126-
query.SeriesNameFormat("req/s"),
126+
query.SeriesNameFormat("Request rate"),
127127
),
128128
),
129129
panel.AddQuery(
@@ -134,16 +134,16 @@ func withOperationMetrics(variableMatchers string) dashboard.Option {
134134
),
135135
panel.AddQuery(
136136
query.PromQL(
137-
fmt.Sprintf(`sum(rate({__name__=~"traces_span_metrics_duration(_milliseconds)?_sum", %s}[$__rate_interval])) by (span_name) / sum(rate({__name__=~"traces_span_metrics_duration(_milliseconds)?_count", %s}[$__rate_interval])) by (span_name) > 0`, variableMatchers, variableMatchers),
138-
query.SeriesNameFormat("Duration"),
137+
fmt.Sprintf(`histogram_quantile(.95, sum(rate({__name__=~"traces_span_metrics_duration(_milliseconds)?_bucket", %s}[$__rate_interval])) by (span_name, le)) > 0`, variableMatchers),
138+
query.SeriesNameFormat("P95 Latency"),
139139
),
140140
),
141141
),
142142
)
143143
}
144144

145145
func buildAPMDashboard() (dashboard.Builder, error) {
146-
variableMatchers := `namespace="$namespace", service="$collector", service_name="$service"`
146+
variableMatchers := `namespace="$namespace", service="$collector", service_name="$service", span_kind=~"${span_kind}"`
147147

148148
return dashboard.New("apm",
149149
dashboard.Name("Application Performance Monitoring (APM)"),
@@ -171,6 +171,20 @@ func buildAPMDashboard() (dashboard.Builder, error) {
171171
),
172172
),
173173
),
174+
dashboard.AddVariable("span_kind",
175+
listvariable.List(
176+
listvariable.DisplayName("Span Kind"),
177+
// Filter by SPAN_KIND_SERVER by default to avoid double-counting requests when both the caller and callee are instrumented,
178+
// as each side generates its own span (CLIENT and SERVER).
179+
listvariable.DefaultValue("SPAN_KIND_SERVER"),
180+
listvariable.AllowMultiple(true),
181+
listvariable.AllowAllValue(true),
182+
listvariable.CustomAllValue(".*"),
183+
labelvalues.PrometheusLabelValues("span_kind",
184+
labelvalues.Matchers(`{__name__=~"traces_span_metrics_calls(_total)?"}`),
185+
),
186+
),
187+
),
174188
withServiceMetrics(variableMatchers),
175189
withOperationMetrics(variableMatchers),
176190
)

0 commit comments

Comments
 (0)