
Commit b8c2817

Phillip Kuznetsov authored and copybaranaut committed

Add summary views for each of the timeseries functions in pxviews

Summary: Added `pod_network_summary`, `container_process_summary`, and `inbound_http_throughput_summary` to the pxviews library. The timeseries versions are not simple to export because they span many windows; a single window is easy to export. Each of these is essentially the single-window version of the corresponding timeseries script.

Test Plan: Ran each script and ensured the results are as expected.

Reviewers: jamesbartlett, michelle, vihang
Reviewed By: vihang
Signed-off-by: Phillip Kuznetsov <pkuznetsov@pixielabs.ai>
Differential Revision: https://phab.corp.pixielabs.ai/D11989
GitOrigin-RevId: 39a502d
1 parent dac3075 commit b8c2817

1 file changed: src/carnot/planner/pxl_lib/pxviews.pxl

Lines changed: 129 additions & 0 deletions
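Before the diff, a usage sketch of the new single-window views. Assumptions (not part of this commit): `pxviews` is importable the way the docstrings reference it, and times are passed as int64 nanoseconds, since the summary functions compute `end_time - start_time` directly.

    import px
    import pxviews

    # Unlike the *_timeseries variants, no window_ns argument is needed:
    # the whole [start_time, end_time] range is treated as one window.
    end_time = px.now()
    start_time = end_time - 5 * 60 * 1000 * 1000 * 1000  # 5 minutes in ns

    px.display(pxviews.container_process_summary(start_time, end_time), 'containers')
    px.display(pxviews.pod_network_summary(start_time, end_time), 'network')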
@@ -291,6 +291,9 @@ def container_process_timeseries(start_time, end_time, window_ns):
     ''' Compute the timeseries of CPU, memory, and IO usage for each container
     sourced from the `process_stats` table.
 
+    Equivalent to `pxviews.container_process_summary()` extended over multiple
+    windows.
+
     Args:
     @start_time Starting time of the data to examine.
     @end_time Ending time of the data to examine.
@@ -356,12 +359,87 @@ def container_process_timeseries(start_time, end_time, window_ns):
                'total_disk_write_throughput', 'rss', 'vsize']]
 
 
+def container_process_summary(start_time, end_time):
+    ''' Compute the summary of CPU, memory, and IO usage for each container
+    sourced from the `process_stats` table.
+
+    Equivalent to a single window from `pxviews.container_process_timeseries()`.
+
+    Args:
+    @start_time Starting time of the data to examine.
+    @end_time Ending time of the data to examine.
+    '''
+    df = px.DataFrame(table='process_stats', start_time=start_time, end_time=end_time)
+    df.container = df.ctx['container_name']
+    df.pod_id = df.ctx['pod_id']
+
+    # First calculate CPU usage by process (UPID) in each k8s_object
+    # over all windows.
+    df = df.groupby(['upid', 'container', 'pod_id']).agg(
+        rss=('rss_bytes', px.mean),
+        vsize=('vsize_bytes', px.mean),
+        # The fields below are counters, so we take the min and the max to subtract them.
+        cpu_utime_ns_max=('cpu_utime_ns', px.max),
+        cpu_utime_ns_min=('cpu_utime_ns', px.min),
+        cpu_ktime_ns_max=('cpu_ktime_ns', px.max),
+        cpu_ktime_ns_min=('cpu_ktime_ns', px.min),
+        read_bytes_max=('read_bytes', px.max),
+        read_bytes_min=('read_bytes', px.min),
+        write_bytes_max=('write_bytes', px.max),
+        write_bytes_min=('write_bytes', px.min),
+        rchar_bytes_max=('rchar_bytes', px.max),
+        rchar_bytes_min=('rchar_bytes', px.min),
+        wchar_bytes_max=('wchar_bytes', px.max),
+        wchar_bytes_min=('wchar_bytes', px.min),
+        time_=('time_', px.max),
+    )
+
+    # Next calculate cpu usage and memory stats per window.
+    df.cpu_utime_ns = df.cpu_utime_ns_max - df.cpu_utime_ns_min
+    df.cpu_ktime_ns = df.cpu_ktime_ns_max - df.cpu_ktime_ns_min
+
+    window = end_time - start_time
+    df.actual_disk_read_throughput = (df.read_bytes_max - df.read_bytes_min) / window
+    df.actual_disk_write_throughput = (df.write_bytes_max - df.write_bytes_min) / window
+    df.total_disk_read_throughput = (df.rchar_bytes_max - df.rchar_bytes_min) / window
+    df.total_disk_write_throughput = (df.wchar_bytes_max - df.wchar_bytes_min) / window
+
+
+    # Then aggregate per container.
+    df = df.groupby(['pod_id', 'container']).agg(
+        cpu_ktime_ns=('cpu_ktime_ns', px.sum),
+        cpu_utime_ns=('cpu_utime_ns', px.sum),
+        actual_disk_read_throughput=('actual_disk_read_throughput', px.sum),
+        actual_disk_write_throughput=('actual_disk_write_throughput', px.sum),
+        total_disk_read_throughput=('total_disk_read_throughput', px.sum),
+        total_disk_write_throughput=('total_disk_write_throughput', px.sum),
+        rss=('rss', px.sum),
+        vsize=('vsize', px.sum),
+        time_=('time_', px.max),
+    )
+
+    # Finally, calculate total (kernel + user time) percentage used over window.
+    df.cpu_usage = px.Percent((df.cpu_ktime_ns + df.cpu_utime_ns) / window)
+    df.pod = df.ctx['pod']
+    df.service = df.ctx['service']
+    df.namespace = px.pod_name_to_namespace(df.pod)
+    df.node = df.ctx['node']
+
+    return df[['time_', 'pod_id', 'container', 'pod', 'service', 'namespace', 'node',
+               'cpu_usage', 'actual_disk_read_throughput',
+               'actual_disk_write_throughput', 'total_disk_read_throughput',
+               'total_disk_write_throughput', 'rss', 'vsize']]
+
+
 def pod_network_timeseries(start_time, end_time, window_ns):
     ''' Compute the timeseries of network events for each pod.
 
     Returns timeseries data summarizing the bytes, drops, and errors for
     the incoming (rx) and outgoing(tx) network connections for each pod.
 
+    This is equivalent to `pxviews.pod_network_summary()` but over several
+    time windows.
+
     Args:
     @start_time Starting time of the data to examine.
     @end_time Ending time of the data to examine.
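A worked check of the percentage math in `container_process_summary` above (numbers are illustrative, not from the commit): if a container's processes accumulate 1.5e9 ns of user time and 0.5e9 ns of kernel time over a 10 s window (1.0e10 ns), then:

    # cpu_usage = px.Percent((cpu_ktime_ns + cpu_utime_ns) / window)
    #           = (0.5e9 + 1.5e9) / 1.0e10
    #           = 0.2, i.e. the container used 20% of one core over the window.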
@@ -407,6 +485,57 @@ def pod_network_timeseries(start_time, end_time, window_ns):
                'rx_drops_per_ns', 'tx_drops_per_ns', 'rx_errors_per_ns', 'tx_errors_per_ns']]
 
 
+def pod_network_summary(start_time, end_time):
+    ''' Compute the summary of network events for each pod.
+
+    Returns data summarizing the bytes, drops, and errors for
+    the incoming (rx) and outgoing(tx) network connections for each pod.
+
+    Equivalent to a single window of `pxviews.pod_network_timeseries()`.
+
+    Args:
+    @start_time Starting time of the data to examine.
+    @end_time Ending time of the data to examine.
+    '''
+    df = px.DataFrame(table='network_stats', start_time=start_time, end_time=end_time)
+
+    # First calculate network usage by node over all windows.
+    # Data is sharded by Pod in network_stats.
+    df = df.groupby(['pod_id']).agg(
+        rx_bytes_end=('rx_bytes', px.max),
+        rx_bytes_start=('rx_bytes', px.min),
+        tx_bytes_end=('tx_bytes', px.max),
+        tx_bytes_start=('tx_bytes', px.min),
+        tx_errors_end=('tx_errors', px.max),
+        tx_errors_start=('tx_errors', px.min),
+        rx_errors_end=('rx_errors', px.max),
+        rx_errors_start=('rx_errors', px.min),
+        tx_drops_end=('tx_drops', px.max),
+        tx_drops_start=('tx_drops', px.min),
+        rx_drops_end=('rx_drops', px.max),
+        rx_drops_start=('rx_drops', px.min),
+        time_=('time_', px.max),
+    )
+
+    window = end_time - start_time
+    # Calculate the network statistics rate over the window.
+    # We subtract the counter value at the beginning ('_start')
+    # from the value at the end ('_end').
+    df.rx_bytes_per_ns = (df.rx_bytes_end - df.rx_bytes_start) / window
+    df.tx_bytes_per_ns = (df.tx_bytes_end - df.tx_bytes_start) / window
+    df.rx_drops_per_ns = (df.rx_drops_end - df.rx_drops_start) / window
+    df.tx_drops_per_ns = (df.tx_drops_end - df.tx_drops_start) / window
+    df.rx_errors_per_ns = (df.rx_errors_end - df.rx_errors_start) / window
+    df.tx_errors_per_ns = (df.tx_errors_end - df.tx_errors_start) / window
+    df.pod = df.ctx['pod']
+    df.service = df.ctx['service']
+    df.namespace = df.ctx['namespace']
+    df.node = df.ctx['node']
+    return df[['time_', 'pod_id', 'pod', 'service', 'namespace', 'node',
+               'rx_bytes_per_ns', 'tx_bytes_per_ns',
+               'rx_drops_per_ns', 'tx_drops_per_ns', 'rx_errors_per_ns', 'tx_errors_per_ns']]
+
+
 def inbound_http_throughput_timeseries(start_time, end_time, window_ns):
     ''' Compute the timeseries statistics of inbound HTTP requests for each container
     in the cluster.
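The `_end`/`_start` pairs in `pod_network_summary` above bracket monotonically increasing counters, so each difference divided by the window length is an average rate in units per nanosecond. An illustrative calculation (numbers assumed, not from the commit): if a pod's `rx_bytes` counter grows from 2.0e6 to 8.0e6 over a 60 s window (6.0e10 ns), then:

    # rx_bytes_per_ns = (rx_bytes_end - rx_bytes_start) / window
    #                 = (8.0e6 - 2.0e6) / 6.0e10
    #                 = 1.0e-4 bytes/ns, i.e. roughly 100 KB/s.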
