Skip to content

Commit dac3075

Browse files
Phillip Kuznetsov and copybara-bot
authored and committed
Add node, service, pod, and namespace metadata columns to pxviews
Summary: As title says. Also add latency_sum to the two pxviews that have latency quantiles. We need these columns because during export they won't be available. If users don't use them in the non-import context, the columns will just be dropped so they shouldn't have a perf impact. Test Plan: Ran each function and they all still exported the expected data. Reviewers: jamesbartlett, michelle, nserrino, vihang Reviewed By: vihang Signed-off-by: Phillip Kuznetsov <pkuznetsov@pixielabs.ai> Differential Revision: https://phab.corp.pixielabs.ai/D11988 GitOrigin-RevId: 1061292
1 parent 2f6761a commit dac3075

1 file changed

Lines changed: 74 additions & 14 deletions

File tree

src/carnot/planner/pxl_lib/pxviews.pxl

Lines changed: 74 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -137,6 +137,9 @@ def _http_events(start_time, end_time, include_health_checks, include_ready_chec
137137
def inbound_http_summary(start_time, end_time):
138138
''' Gets a summary of requests inbound to `pod`.
139139

140+
Equivalent to a single window of `pxviews.inbound_http_latency_timeseries`
141+
if you aggregate the results of this function by pod.
142+
140143
Args:
141144
@start_time Starting time of the data to examine.
142145
@end_time Ending time of the data to examine.
@@ -154,6 +157,7 @@ def inbound_http_summary(start_time, end_time):
154157

155158
df = df.groupby(['pod_id', 'remote_addr']).agg(
156159
latency_quantiles=('latency', px.quantiles),
160+
latency_sum=('latency', px.sum),
157161
num_requests=('time_', px.count),
158162
stop_time=('time_', px.max),
159163
num_errors=('failure', px.sum),
@@ -167,9 +171,14 @@ def inbound_http_summary(start_time, end_time):
167171
df.requestor_pod = px.pod_id_to_pod_name(df.requestor_pod_id)
168172
df.requestor_service = px.pod_id_to_service_name(df.requestor_pod_id)
169173
df.time_ = df.stop_time
174+
df.pod = df.ctx['pod']
175+
df.service = df.ctx['service']
176+
df.namespace = px.pod_name_to_namespace(df.pod)
177+
df.node = df.ctx['node']
170178

171-
return df[['time_', 'pod_id', 'requestor_ip', 'requestor_pod', 'requestor_service', 'latency_quantiles',
172-
'num_errors', 'num_requests', 'req_bytes', 'resp_bytes']]
179+
return df[['time_', 'pod_id', 'pod', 'service', 'namespace', 'node',
180+
'requestor_ip', 'requestor_pod', 'requestor_service',
181+
'latency_quantiles', 'latency_sum', 'num_requests', 'num_errors', 'req_bytes', 'resp_bytes']]
173182

174183

175184
def http_graph(start_time, end_time):
@@ -182,7 +191,6 @@ def http_graph(start_time, end_time):
182191
@start_time Starting time of the data to examine.
183192
@end_time Ending time of the data to examine.
184193
'''
185-
# TODO(philkuz) fix pxviews can't use False because it's not in scope.
186194
df = _http_events(start_time,
187195
end_time,
188196
include_health_checks=False,
@@ -191,6 +199,7 @@ def http_graph(start_time, end_time):
191199
df.pod_id = df.ctx['pod_id']
192200
df = df.groupby(['pod_id', 'remote_addr', 'trace_role']).agg(
193201
latency_quantiles=('latency', px.quantiles),
202+
latency_sum=('latency', px.sum),
194203
total_request_count=('latency', px.count)
195204
num_requests=('time_', px.count),
196205
stop_time=('time_', px.max),
@@ -201,6 +210,7 @@ def http_graph(start_time, end_time):
201210

202211
df.traced_pod = df.ctx['pod']
203212
df.traced_ip = px.pod_name_to_pod_ip(df.traced_pod)
213+
df.traced_pod_id = df.pod_id
204214
df.traced_service = df.ctx['service']
205215

206216
# Get the traced and remote pod/service/IP information.
@@ -214,6 +224,10 @@ def http_graph(start_time, end_time):
214224
df.traced_service,
215225
px.service_id_to_service_name(px.ip_to_service_id(df.remote_ip)))
216226

227+
df.remote_pod_id = px.select(df.is_remote_addr_localhost,
228+
df.traced_pod_id,
229+
px.ip_to_pod_id(df.remote_ip))
230+
217231
# Assign requestor and responder based on the trace_role.
218232
df.is_server_side_tracing = df.trace_role == 2
219233
df.responder_ip = px.select(df.is_server_side_tracing, df.traced_ip, df.remote_ip)
@@ -222,6 +236,9 @@ def http_graph(start_time, end_time):
222236
df.responder_pod = px.select(df.is_server_side_tracing, df.traced_pod, df.remote_pod)
223237
df.requestor_pod = px.select(df.is_server_side_tracing, df.remote_pod, df.traced_pod)
224238

239+
df.responder_pod_id = px.select(df.is_server_side_tracing, df.traced_pod_id, df.remote_pod_id)
240+
df.requestor_pod_id = px.select(df.is_server_side_tracing, df.remote_pod_id, df.traced_pod_id)
241+
225242
df.responder_service = px.select(df.is_server_side_tracing, df.traced_service, df.remote_service)
226243
df.requestor_service = px.select(df.is_server_side_tracing, df.remote_service, df.traced_service)
227244

@@ -230,6 +247,8 @@ def http_graph(start_time, end_time):
230247
df.latency_p99 = px.DurationNanos(px.floor(px.pluck_float64(df.latency_quantiles, 'p99')))
231248

232249
df = df.groupby([
250+
'responder_pod_id',
251+
'requestor_pod_id',
233252
'responder_pod',
234253
'requestor_pod',
235254
'responder_ip',
@@ -240,6 +259,7 @@ def http_graph(start_time, end_time):
240259
# TODO(philkuz) build a combine_quantiles udf that does this properly.
241260
# latency_quantiles=('latency_quantiles', px.combine_quantiles),
242261
latency_quantiles=('latency_quantiles', px.any),
262+
latency_sum=('latency_sum', px.sum),
243263
latency_p50=('latency_p50', px.mean),
244264
latency_p90=('latency_p90', px.mean),
245265
latency_p99=('latency_p99', px.mean),
@@ -250,11 +270,21 @@ def http_graph(start_time, end_time):
250270
stop_time=('stop_time', px.max),
251271
)
252272

273+
253274
df.time_ = df.stop_time
254-
return df[['time_', 'responder_pod', 'requestor_pod',
275+
df.responder_node = px.pod_id_to_node_name(df.responder_pod_id)
276+
df.requestor_node = px.pod_id_to_node_name(df.requestor_pod_id)
277+
278+
df.responder_namespace = px.pod_name_to_namespace(df.responder_pod)
279+
df.requestor_namespace = px.pod_name_to_namespace(df.requestor_pod)
280+
df.window_start_time = start_time
281+
df.window_end_time = end_time
282+
return df[['time_', 'responder_pod', 'requestor_pod', 'responder_namespace', 'requestor_namespace',
283+
'responder_node', 'requestor_node',
255284
'responder_ip', 'requestor_ip','responder_service', 'requestor_service',
256-
'latency_p50', 'latency_p90', 'latency_p99', 'latency_quantiles', 'num_requests',
257-
'num_errors', 'req_bytes', 'resp_bytes']]
285+
'latency_p50', 'latency_p90', 'latency_p99', 'latency_quantiles', 'latency_sum', 'num_requests',
286+
'num_errors', 'req_bytes', 'resp_bytes',
287+
'window_start_time', 'window_end_time']]
258288

259289

260290
def container_process_timeseries(start_time, end_time, window_ns):
@@ -273,7 +303,7 @@ def container_process_timeseries(start_time, end_time, window_ns):
273303

274304
# First calculate CPU usage by process (UPID) in each k8s_object
275305
# over all windows.
276-
df = df.groupby(['upid', 'container','pod_id', 'timestamp']).agg(
306+
df = df.groupby(['upid', 'container', 'pod_id', 'timestamp']).agg(
277307
rss=('rss_bytes', px.mean),
278308
vsize=('vsize_bytes', px.mean),
279309
# The fields below are counters, so we take the min and the max to subtract them.
@@ -315,9 +345,15 @@ def container_process_timeseries(start_time, end_time, window_ns):
315345
# Finally, calculate total (kernel + user time) percentage used over window.
316346
df.cpu_usage = px.Percent((df.cpu_ktime_ns + df.cpu_utime_ns) / window_ns)
317347
df.time_ = df.timestamp
318-
return df[['time_', 'pod_id', 'container', 'cpu_usage', 'actual_disk_read_throughput',
319-
'actual_disk_write_throughput', 'total_disk_read_throughput',
320-
'total_disk_write_throughput', 'rss', 'vsize']]
348+
df.pod = df.ctx['pod']
349+
df.service = df.ctx['service']
350+
df.namespace = px.pod_name_to_namespace(df.pod)
351+
df.node = df.ctx['node']
352+
353+
return df[['time_', 'pod_id', 'container', 'pod', 'service', 'namespace', 'node',
354+
'cpu_usage', 'actual_disk_read_throughput',
355+
'actual_disk_write_throughput', 'total_disk_read_throughput',
356+
'total_disk_write_throughput', 'rss', 'vsize']]
321357

322358

323359
def pod_network_timeseries(start_time, end_time, window_ns):
@@ -362,7 +398,12 @@ def pod_network_timeseries(start_time, end_time, window_ns):
362398
df.tx_errors_per_ns = (df.tx_errors_end - df.tx_errors_start) / window_ns
363399

364400
df.time_ = df.timestamp
365-
return df[['time_', 'pod_id', 'rx_bytes_per_ns', 'tx_bytes_per_ns',
401+
df.pod = df.ctx['pod']
402+
df.service = df.ctx['service']
403+
df.namespace = df.ctx['namespace']
404+
df.node = df.ctx['node']
405+
return df[['time_', 'pod_id', 'pod', 'service', 'namespace', 'node',
406+
'rx_bytes_per_ns', 'tx_bytes_per_ns',
366407
'rx_drops_per_ns', 'tx_drops_per_ns', 'rx_errors_per_ns', 'tx_errors_per_ns']]
367408

368409

@@ -395,14 +436,22 @@ def inbound_http_throughput_timeseries(start_time, end_time, window_ns):
395436
num_errors=('failure', px.sum),
396437
)
397438
df.time_ = df.timestamp
439+
df.pod = df.ctx['pod']
440+
df.service = df.ctx['service']
441+
df.namespace = df.ctx['namespace']
442+
df.node = df.ctx['node']
398443

399-
return df[['time_', 'pod_id', 'container', 'num_errors', 'num_requests']]
444+
return df[['time_', 'pod_id', 'pod', 'service', 'namespace', 'node',
445+
'container', 'num_requests', 'num_errors']]
400446

401447

402448
def inbound_http_latency_timeseries(start_time, end_time, window_ns):
403449
''' Compute the inbound HTTP request latency timeseries for each pod
404450
in the cluster.
405451

452+
Equivalent to `pxviews.inbound_http_summary()` extended over multiple
453+
time windows.
454+
406455
Args:
407456
@start_time Starting time of the data to examine.
408457
@end_time Ending time of the data to examine.
@@ -428,8 +477,13 @@ def inbound_http_latency_timeseries(start_time, end_time, window_ns):
428477
resp_bytes=('resp_body_size', px.sum),
429478
)
430479
df.time_ = df.timestamp
480+
df.pod = df.ctx['pod']
481+
df.service = df.ctx['service']
482+
df.namespace = df.ctx['namespace']
483+
df.node = df.ctx['node']
431484

432-
return df[['time_', 'pod_id', 'latency_quantiles', 'num_requests', 'num_errors', 'latency_sum', 'req_bytes', 'resp_bytes']]
485+
return df[['time_', 'pod_id', 'pod', 'service', 'namespace', 'node',
486+
'latency_quantiles', 'num_requests', 'num_errors', 'latency_sum', 'req_bytes', 'resp_bytes']]
433487

434488

435489
def stacktraces(start_time, end_time):
@@ -533,10 +587,16 @@ def connection_throughput_stats(start_time, end_time):
533587
df.pod_id = px.select(df.pod_id != '', df.pod_id, df.pod_id_x)
534588
df.remote_addr = px.select(df.remote_addr != '', df.remote_addr, df.remote_addr_x)
535589

590+
df.pod = df.ctx['pod']
591+
df.namespace = df.ctx['namespace']
592+
df.service = df.ctx['service']
593+
df.node = df.ctx['node']
594+
536595
df.inbound_conn_throughput = df.rx_server + df.tx_server
537596
df.outbound_conn_throughput = df.tx_client + df.rx_client
538597

539-
return df[['time_', 'remote_addr', 'pod_id', 'inbound_conn_throughput', 'outbound_conn_throughput']]
598+
return df[['time_', 'remote_addr', 'node', 'pod_id', 'pod', 'namespace', 'service',
599+
'inbound_conn_throughput', 'outbound_conn_throughput']]
540600

541601

542602
def processes(start_time, end_time):

0 commit comments

Comments (0)