-
Notifications
You must be signed in to change notification settings - Fork 939
Expand file tree
/
Copy pathanalyze_fct.py
More file actions
150 lines (125 loc) · 7.59 KB
/
analyze_fct.py
File metadata and controls
150 lines (125 loc) · 7.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
import numpy as np
import csv
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
##################################
# Setup
#
print("Flow Completion Time Analysis Tool v0.01")
# Create analysis folder if it doesn't exist
analysis_folder_path = 'analysis'
if not os.path.exists(analysis_folder_path):
os.makedirs(analysis_folder_path)
##################################
# Analyze flow completion
#
def analyze_flow_completion(csv_file_path):
print(f"Analyzing flow completion times from: {csv_file_path}")
# Read the CSV file using pandas
df = pd.read_csv(csv_file_path)
# Extract relevant columns
flow_ids = df['flow_id'].values
source_ids = df['sender'].values
target_ids = df['receiver'].values
flow_sizes = df['flow_size'].values
durations = df['duration'].values # These are already in seconds
progress = df['progress'].values
# Determine completion status (considering flows with ≥99.9% progress as completed)
completed = progress >= 99.9
print("Calculating statistics...")
statistics = {
'general_num_flows': len(flow_ids),
'general_num_unique_sources': len(set(source_ids)),
'general_num_unique_targets': len(set(target_ids)),
'general_flow_size_bytes_mean': np.mean(flow_sizes),
'general_flow_size_bytes_std': np.std(flow_sizes)
}
# Define flow size ranges
range_low = [-1, -1, -1, 100000, 2434900, 1000000, 10000000]
range_high = [-1, 100000, 2434900, -1, -1, -1, -1]
range_name = ["all", "less_100KB", "less_2.4349MB", "geq_100KB", "geq_2.4349MB", "geq_1MB", "geq_10MB"]
range_completed_duration = [[], [], [], [], [], [], []]
range_num_finished_flows = [0, 0, 0, 0, 0, 0, 0]
range_num_unfinished_flows = [0, 0, 0, 0, 0, 0, 0]
range_low_eq = [0, 0, 0, 1, 1, 1, 1]
range_high_eq = [0, 0, 0, 1, 1, 1, 1]
# Go over all flows
for i in range(0, len(flow_ids)):
# Range-specific
for j in range(0, len(range_name)):
if (
(range_low[j] == -1 or (range_low_eq[j] == 0 and flow_sizes[i] > range_low[j]) or (range_low_eq[j] == 1 and flow_sizes[i] >= range_low[j])) and
(range_high[j] == -1 or (range_high_eq[j] == 0 and flow_sizes[i] < range_high[j]) or (range_high_eq[j] == 1 and flow_sizes[i] <= range_high[j]))
):
if completed[i]:
range_num_finished_flows[j] += 1
range_completed_duration[j].append(durations[i] * 1000000000) # Convert seconds to nanoseconds
else:
range_num_unfinished_flows[j] += 1
# Ranges statistics
for j in range(0, len(range_name)):
# Number of finished flows
statistics[range_name[j] + '_num_flows'] = range_num_finished_flows[j] + range_num_unfinished_flows[j]
print(range_name[j] + '_num_flows', range_num_finished_flows[j] + range_num_unfinished_flows[j])
statistics[range_name[j] + '_num_finished_flows'] = range_num_finished_flows[j]
print(range_name[j] + '_num_finished_flows', range_num_finished_flows[j])
statistics[range_name[j] + '_num_unfinished_flows'] = range_num_unfinished_flows[j]
print(range_name[j] + '_num_unfinished_flows', range_num_unfinished_flows[j])
total = (range_num_finished_flows[j] + range_num_unfinished_flows[j])
if range_num_finished_flows[j] != 0:
statistics[range_name[j] + '_flows_completed_fraction'] = float(range_num_finished_flows[j]) / float(total)
# Duration is stored in nanoseconds in the statistics
statistics[range_name[j] + '_mean_fct_ns'] = np.mean(range_completed_duration[j])
print(range_name[j] + '_mean_fct_ns', np.mean(range_completed_duration[j]))
statistics[range_name[j] + '_median_fct_ns'] = np.median(range_completed_duration[j])
statistics[range_name[j] + '_99th_fct_ns'] = np.percentile(range_completed_duration[j], 99)
statistics[range_name[j] + '_99.9th_fct_ns'] = np.percentile(range_completed_duration[j], 99.9)
# Convert to milliseconds for display purposes
statistics[range_name[j] + '_mean_fct_ms'] = statistics[range_name[j] + '_mean_fct_ns'] / 1000000
statistics[range_name[j] + '_median_fct_ms'] = statistics[range_name[j] + '_median_fct_ns'] / 1000000
statistics[range_name[j] + '_99th_fct_ms'] = statistics[range_name[j] + '_99th_fct_ns'] / 1000000
statistics[range_name[j] + '_99.9th_fct_ms'] = statistics[range_name[j] + '_99.9th_fct_ns'] / 1000000
else:
statistics[range_name[j] + '_flows_completed_fraction'] = 0
# Add the original duration values for comparing with other analysis
print("\nOriginal duration values (seconds) for comparison:")
print(f"All flows - Mean duration: {np.mean(durations):.4f} seconds")
print(f"Small flows (<100KB) - Mean duration: {np.mean(durations[flow_sizes <= 100000]):.4f} seconds")
print(f"Medium flows (>100KB, <2.4MB) - Mean duration: {np.mean(durations[(flow_sizes > 100000) & (flow_sizes < 2434900)]):.4f} seconds")
print(f"Large flows (>1MB) - Mean duration: {np.mean(durations[flow_sizes > 1000000]):.4f} seconds")
# Create CDF plot for flow completion times
plt.figure(figsize=(10, 6))
for j in range(len(range_name)):
if range_num_finished_flows[j] > 0:
sorted_fct = np.sort(range_completed_duration[j]) / 1000000 # Convert ns to ms for readability
cdf = np.arange(1, len(sorted_fct)+1) / len(sorted_fct)
plt.plot(sorted_fct, cdf, label=range_name[j])
plt.xlabel('Flow Completion Time (ms)')
plt.ylabel('CDF')
plt.title('CDF of Flow Completion Times by Flow Size Category')
plt.grid(True, linestyle='--', linewidth=0.5)
plt.legend()
plt.savefig(f'{analysis_folder_path}/fct_cdf.png')
# Print raw results
print('Writing to result file flow_completion.statistics...')
with open(analysis_folder_path + '/flow_completion.statistics', 'w+') as outfile:
for key, value in sorted(statistics.items()):
outfile.write(str(key) + "=" + str(value) + "\n")
# Print summary statistics for comparison
print("\nSummary of Flow Completion Times:")
print("Flow Size Category | Mean (sec) | Median (sec) | 99th (sec)")
print("------------------------|-------------|--------------|------------")
for j in range(len(range_name)):
if range_num_finished_flows[j] > 0:
mean_sec = statistics[range_name[j] + '_mean_fct_ns'] / 1e9
median_sec = statistics[range_name[j] + '_median_fct_ns'] / 1e9
p99_sec = statistics[range_name[j] + '_99th_fct_ns'] / 1e9
print(f"{range_name[j].ljust(24)} | {mean_sec:11.4f} | {median_sec:12.4f} | {p99_sec:10.4f}")
if __name__ == "__main__":
# Path to the CSV file
csv_file_path = "flow_analysis.csv"
# Analyze flow completion
analyze_flow_completion(csv_file_path)