
Commit b8c2817

Phillip Kuznetsov authored and copybaranaut committed

Add summary views for each of the timeseries functions in pxviews

Summary: Added `pod_network_summary`, `container_process_summary`, and `inbound_http_throughput_summary` to the pxviews library. The timeseries versions are not simple to export because they span many windows; a single window is easy to export. Each of these is essentially the single-window version of the corresponding timeseries script.

Test Plan: Ran each script and ensured the results are as expected.

Reviewers: jamesbartlett, michelle, vihang
Reviewed By: vihang
Signed-off-by: Phillip Kuznetsov <pkuznetsov@pixielabs.ai>
Differential Revision: https://phab.corp.pixielabs.ai/D11989
GitOrigin-RevId: 39a502d
1 parent dac3075 commit b8c2817

1 file changed: src/carnot/planner/pxl_lib/pxviews.pxl

Lines changed: 129 additions & 0 deletions
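Before the diff, a usage sketch of the new single-window views. Assumptions (not part of this commit): `pxviews` is importable the way the docstrings reference it, and times are passed as int64 nanoseconds, since the summary functions compute `end_time - start_time` directly.

    import px
    import pxviews

    # Unlike the *_timeseries variants, no window_ns argument is needed:
    # the whole [start_time, end_time] range is treated as one window.
    end_time = px.now()
    start_time = end_time - 5 * 60 * 1000 * 1000 * 1000  # 5 minutes in ns

    px.display(pxviews.container_process_summary(start_time, end_time), 'containers')
    px.display(pxviews.pod_network_summary(start_time, end_time), 'network')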
@@ -291,6 +291,9 @@ def container_process_timeseries(start_time, end_time, window_ns):
     ''' Compute the timeseries of CPU, memory, and IO usage for each container
     sourced from the `process_stats` table.
 
+    Equivalent to `pxviews.container_process_summary()` extended over multiple
+    windows.
+
     Args:
     @start_time Starting time of the data to examine.
     @end_time Ending time of the data to examine.
@@ -356,12 +359,87 @@ def container_process_timeseries(start_time, end_time, window_ns):
                'total_disk_write_throughput', 'rss', 'vsize']]
 
 
+def container_process_summary(start_time, end_time):
+    ''' Compute the summary of CPU, memory, and IO usage for each container
+    sourced from the `process_stats` table.
+
+    Equivalent to a single window from `pxviews.container_process_timeseries()`.
+
+    Args:
+    @start_time Starting time of the data to examine.
+    @end_time Ending time of the data to examine.
+    '''
+    df = px.DataFrame(table='process_stats', start_time=start_time, end_time=end_time)
+    df.container = df.ctx['container_name']
+    df.pod_id = df.ctx['pod_id']
+
+    # First calculate CPU usage by process (UPID) in each k8s_object
+    # over all windows.
+    df = df.groupby(['upid', 'container', 'pod_id']).agg(
+        rss=('rss_bytes', px.mean),
+        vsize=('vsize_bytes', px.mean),
+        # The fields below are counters, so we take the min and the max to subtract them.
+        cpu_utime_ns_max=('cpu_utime_ns', px.max),
+        cpu_utime_ns_min=('cpu_utime_ns', px.min),
+        cpu_ktime_ns_max=('cpu_ktime_ns', px.max),
+        cpu_ktime_ns_min=('cpu_ktime_ns', px.min),
+        read_bytes_max=('read_bytes', px.max),
+        read_bytes_min=('read_bytes', px.min),
+        write_bytes_max=('write_bytes', px.max),
+        write_bytes_min=('write_bytes', px.min),
+        rchar_bytes_max=('rchar_bytes', px.max),
+        rchar_bytes_min=('rchar_bytes', px.min),
+        wchar_bytes_max=('wchar_bytes', px.max),
+        wchar_bytes_min=('wchar_bytes', px.min),
+        time_=('time_', px.max),
+    )
+
+    # Next calculate cpu usage and memory stats per window.
+    df.cpu_utime_ns = df.cpu_utime_ns_max - df.cpu_utime_ns_min
+    df.cpu_ktime_ns = df.cpu_ktime_ns_max - df.cpu_ktime_ns_min
+
+    window = end_time - start_time
+    df.actual_disk_read_throughput = (df.read_bytes_max - df.read_bytes_min) / window
+    df.actual_disk_write_throughput = (df.write_bytes_max - df.write_bytes_min) / window
+    df.total_disk_read_throughput = (df.rchar_bytes_max - df.rchar_bytes_min) / window
+    df.total_disk_write_throughput = (df.wchar_bytes_max - df.wchar_bytes_min) / window
+
+
+    # Then aggregate per container.
+    df = df.groupby(['pod_id', 'container']).agg(
+        cpu_ktime_ns=('cpu_ktime_ns', px.sum),
+        cpu_utime_ns=('cpu_utime_ns', px.sum),
+        actual_disk_read_throughput=('actual_disk_read_throughput', px.sum),
+        actual_disk_write_throughput=('actual_disk_write_throughput', px.sum),
+        total_disk_read_throughput=('total_disk_read_throughput', px.sum),
+        total_disk_write_throughput=('total_disk_write_throughput', px.sum),
+        rss=('rss', px.sum),
+        vsize=('vsize', px.sum),
+        time_=('time_', px.max),
+    )
+
+    # Finally, calculate total (kernel + user time) percentage used over window.
+    df.cpu_usage = px.Percent((df.cpu_ktime_ns + df.cpu_utime_ns) / window)
+    df.pod = df.ctx['pod']
+    df.service = df.ctx['service']
+    df.namespace = px.pod_name_to_namespace(df.pod)
+    df.node = df.ctx['node']
+
+    return df[['time_', 'pod_id', 'container', 'pod', 'service', 'namespace', 'node',
+               'cpu_usage', 'actual_disk_read_throughput',
+               'actual_disk_write_throughput', 'total_disk_read_throughput',
+               'total_disk_write_throughput', 'rss', 'vsize']]
+
+
 def pod_network_timeseries(start_time, end_time, window_ns):
     ''' Compute the timeseries of network events for each pod.
 
     Returns timeseries data summarizing the bytes, drops, and errors for
     the incoming (rx) and outgoing(tx) network connections for each pod.
 
+    This is equivalent to `pxviews.pod_network_summary()` but over several
+    time windows.
+
     Args:
     @start_time Starting time of the data to examine.
     @end_time Ending time of the data to examine.
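A worked check of the percentage math in `container_process_summary` above (numbers are illustrative, not from the commit): if a container's processes accumulate 1.5e9 ns of user time and 0.5e9 ns of kernel time over a 10 s window (1.0e10 ns), then:

    # cpu_usage = px.Percent((cpu_ktime_ns + cpu_utime_ns) / window)
    #           = (0.5e9 + 1.5e9) / 1.0e10
    #           = 0.2, i.e. the container used 20% of one core over the window.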
@@ -407,6 +485,57 @@ def pod_network_timeseries(start_time, end_time, window_ns):
                'rx_drops_per_ns', 'tx_drops_per_ns', 'rx_errors_per_ns', 'tx_errors_per_ns']]
 
 
+def pod_network_summary(start_time, end_time):
+    ''' Compute the summary of network events for each pod.
+
+    Returns data summarizing the bytes, drops, and errors for
+    the incoming (rx) and outgoing(tx) network connections for each pod.
+
+    Equivalent to a single window of `pxviews.pod_network_timeseries()`.
+
+    Args:
+    @start_time Starting time of the data to examine.
+    @end_time Ending time of the data to examine.
+    '''
+    df = px.DataFrame(table='network_stats', start_time=start_time, end_time=end_time)
+
+    # First calculate network usage by node over all windows.
+    # Data is sharded by Pod in network_stats.
+    df = df.groupby(['pod_id']).agg(
+        rx_bytes_end=('rx_bytes', px.max),
+        rx_bytes_start=('rx_bytes', px.min),
+        tx_bytes_end=('tx_bytes', px.max),
+        tx_bytes_start=('tx_bytes', px.min),
+        tx_errors_end=('tx_errors', px.max),
+        tx_errors_start=('tx_errors', px.min),
+        rx_errors_end=('rx_errors', px.max),
+        rx_errors_start=('rx_errors', px.min),
+        tx_drops_end=('tx_drops', px.max),
+        tx_drops_start=('tx_drops', px.min),
+        rx_drops_end=('rx_drops', px.max),
+        rx_drops_start=('rx_drops', px.min),
+        time_=('time_', px.max),
+    )
+
+    window = end_time - start_time
+    # Calculate the network statistics rate over the window.
+    # We subtract the counter value at the beginning ('_start')
+    # from the value at the end ('_end').
+    df.rx_bytes_per_ns = (df.rx_bytes_end - df.rx_bytes_start) / window
+    df.tx_bytes_per_ns = (df.tx_bytes_end - df.tx_bytes_start) / window
+    df.rx_drops_per_ns = (df.rx_drops_end - df.rx_drops_start) / window
+    df.tx_drops_per_ns = (df.tx_drops_end - df.tx_drops_start) / window
+    df.rx_errors_per_ns = (df.rx_errors_end - df.rx_errors_start) / window
+    df.tx_errors_per_ns = (df.tx_errors_end - df.tx_errors_start) / window
+    df.pod = df.ctx['pod']
+    df.service = df.ctx['service']
+    df.namespace = df.ctx['namespace']
+    df.node = df.ctx['node']
+    return df[['time_', 'pod_id', 'pod', 'service', 'namespace', 'node',
+               'rx_bytes_per_ns', 'tx_bytes_per_ns',
+               'rx_drops_per_ns', 'tx_drops_per_ns', 'rx_errors_per_ns', 'tx_errors_per_ns']]
+
+
 def inbound_http_throughput_timeseries(start_time, end_time, window_ns):
     ''' Compute the timeseries statistics of inbound HTTP requests for each container
     in the cluster.
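The `_end`/`_start` pairs in `pod_network_summary` above bracket monotonically increasing counters, so each difference divided by the window length is an average rate in units per nanosecond. An illustrative calculation (numbers assumed, not from the commit): if a pod's `rx_bytes` counter grows from 2.0e6 to 8.0e6 over a 60 s window (6.0e10 ns), then:

    # rx_bytes_per_ns = (rx_bytes_end - rx_bytes_start) / window
    #                 = (8.0e6 - 2.0e6) / 6.0e10
    #                 = 1.0e-4 bytes/ns, i.e. roughly 100 KB/s.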
