Commit

Add dynamic analysis data viz script
LLazarek authored and EtomicBomb committed Jan 9, 2025
1 parent 64d7542 commit 4cf0a29
Showing 1 changed file with 141 additions and 0 deletions.
141 changes: 141 additions & 0 deletions infrastructure/viz/dynamic.py
@@ -0,0 +1,141 @@
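# Visualize the dynamic analysis results: aggregate per-script measurements into
# per-benchmark totals and plot time, IO, and memory, both in absolute terms and
# normalized by total input size.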
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data format example: (column titles not in the file, all times are seconds)
# benchmark_script, user_time, system_time, max_unique_set_size, read_chars, write_chars, user_time_in_shell, system_time_in_shell, all_input_files, wall_time
# covid-mts/scripts/1.sh,17.32,4.08,13512704,10364829524,7538064195,0.0,0.0,covid-mts/input/in.csv,22.41
# file-enc/scripts/encrypt_files.sh,1.55,0.54,2998272,925500199,923880800,0.0,0.01,file-enc/input/pcaps/http-download101c.pcapng;file-enc/input/pcaps/ftp-download101.pcapng;file-enc/input/pcaps/tr-twohosts.pcapng;file-enc/input/pcaps/split250_00004_20160704110759.pcapng;file-enc/input/pcaps/http-pcaprnet101.pcapng;file-enc/input/pcaps/sec-suspicious101.pcapng;file-enc/input/pcaps/challenge101-8.pcapng;file-enc/input/pcaps/challenge101-3.pcapng;file-enc/input/pcaps/challenge101-6.pcapng;file-enc/input/pcaps/http-openoffice101a.pcapng;file-enc/input/pcaps/tr-winsize.pcapng;file-enc/input/pcaps/ftp-crack101.pcapng;file-enc/input/pcaps/http-misctraffic101.pcapng;file-enc/input/pcaps/http-google101.pcapng;file-enc/input/pcaps/http-wiresharkdownload101.pcapng;file-enc/input/pcaps/general101d.pcapng;file-enc/input/pcaps/http-download101.pcapng;file-enc/input/pcaps/http-chappellu101.pcapng;file-enc/input/pcaps/http-college101.pcapng;file-enc/input/pcaps/http-download101d.pcapng;file-enc/input/pcaps/net-lost-route.pcapng;file-enc/input/pcaps/general101.pcapng;file-enc/input/pcaps/http-sfgate101.pcapng;file-enc/input/pcaps;file-enc/input/pcaps/http-browse101b.pcapng;file-enc/input/pcaps/split250_00001_20160704110759.pcapng;file-enc/input/pcaps/http-download-a.pcapng,2.09
# log-analysis/scripts/nginx.sh,0.57,0.17,15114240,417362164,232406881,0.0,0.0,log-analysis/input/nginx-logs;log-analysis/input/nginx-logs/log5;log-analysis/input/nginx-logs/log7,0.75
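#
# Notes on how this script interprets the columns: read_chars and write_chars are
# treated as byte counts (plotted as "IO bytes"), max_unique_set_size is treated
# as bytes (plotted as "Memory (bytes)"), and all_input_files is a ';'-separated
# list of input file paths.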

data_path = 'target/dynamic_analysis.csv'
benchmark_mapping_path = 'target/scripts_to_benchmark.csv'
input_size_path = 'data/size-inputs.csv'
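# NOTE: these paths are relative, so the script presumably expects to be run from
# the directory containing target/ and data/ (e.g. the repository root).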

def plot_benchmark_times_split(df):
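    """Overlay per-benchmark user-time and system-time bars on a symlog y-axis (not called by main())."""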
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.barplot(x='benchmark', y='user_time', data=df, color='blue', label='User time')
    sns.barplot(x='benchmark', y='system_time', data=df, color='red', label='System time')
    plt.xticks(rotation=90)
    plt.yscale('symlog', linthresh=0.1)
    plt.legend()
    plt.show()

def plot_benchmark_times(df,
                         ticks=([0, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                                ['0', '1ms', '10ms', '100ms', '1s', '10s', '100s', '1000s']),
                         ylabel='Time (s)',
                         linthresh=0.001):
    """Plot per-benchmark time spent in commands vs. in the shell as overlaid bars on a symlog y-axis."""
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.barplot(x='benchmark', y='time_in_commands', data=df, color='blue', label='Commands')
    sns.barplot(x='benchmark', y='time_in_shell', data=df, color='green', label='Shell')
    plt.xticks(rotation=90)
    plt.yscale('symlog', linthresh=linthresh)
    plt.yticks(*ticks)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

def plot_io(df,
            ticks=([0, 100000000, 1000000000, 10000000000, 100000000000, 1000000000000],
                   ['0', '100MB', '1GB', '10GB', '100GB', '1TB']),
            ylabel='IO bytes',
            linthresh=100000000):
    """Plot per-benchmark IO volume (read + write bytes) on a symlog y-axis."""
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.barplot(x='benchmark', y='io_chars', data=df, color='green', label='IO bytes')
    plt.yscale('symlog', linthresh=linthresh)
    plt.yticks(*ticks)
    plt.ylabel(ylabel)
    plt.xticks(rotation=90)
    plt.legend()
    plt.show()

def plot_memory(df,
                ticks=([0, 1000000, 10000000, 100000000, 1000000000, 10000000000, 100000000000],
                       ['0', '1MB', '10MB', '100MB', '1GB', '10GB', '100GB']),
                ylabel='Memory (bytes)',
                linthresh=1000000):
    """Plot per-benchmark memory (summed max_unique_set_size) on a symlog y-axis."""
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.barplot(x='benchmark', y='max_unique_set_size', data=df, color='purple', label='Max unique set size')
    plt.xticks(rotation=90)
    plt.yscale('symlog', linthresh=linthresh)
    plt.yticks(*ticks)
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

def main(data_path):
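    """Load per-script measurements, aggregate them per benchmark, and plot absolute and per-input-byte metrics."""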
    df = pd.read_csv(data_path, header=None)
    df.columns = ['script', 'user_time', 'system_time', 'max_unique_set_size', 'read_chars', 'write_chars', 'user_time_in_shell', 'system_time_in_shell', 'all_input_files', 'wall_time']
    print()
    # cast every numeric column (everything except 'script' and 'all_input_files') to float
    for col in list(df.columns[1:8]) + list(df.columns[9:10]):
        df[col] = df[col].astype(float)
    df['all_input_files'] = df['all_input_files'].apply(lambda x: str(x).split(';'))

    # aggregate by benchmark
    map_df = pd.read_csv(benchmark_mapping_path, header=None)
    map_df.columns = ['script', 'benchmark']
    df = df.merge(map_df, on='script')
    # sum all times
    df = df.groupby('benchmark').agg({'user_time': 'sum',
                                      'system_time': 'sum',
                                      'max_unique_set_size': 'sum',
                                      'read_chars': 'sum',
                                      'write_chars': 'sum',
                                      'user_time_in_shell': 'sum',
                                      'system_time_in_shell': 'sum',
                                      'all_input_files': 'sum',
                                      'wall_time': 'sum'}).reset_index()

    # merge the read and write_chars
    df['io_chars'] = df['read_chars'] + df['write_chars']
    df = df.drop(columns=['read_chars', 'write_chars'])

    # calculate time in shell and time in commands
    df['time'] = df['user_time'] + df['system_time']
    df['time_in_shell'] = df['user_time_in_shell'] + df['system_time_in_shell']
    df['time_in_commands'] = df['time'] - df['time_in_shell']

    # report any benchmarks where the wall time is not approximately equal to the sum of user and system time
    for _, row in df.iterrows():
        if abs(row['wall_time'] - (row['user_time'] + row['system_time'])) > 0.1:
            print(f"Wall time for benchmark {row['benchmark']} may be suspicious: {row['wall_time']} vs (u{row['user_time']} + s{row['system_time']})")

    # normalize metrics by total input size to get per-input-byte numbers
    input_sizes = pd.read_csv(input_size_path, header=None)
    input_sizes.columns = ['input_size',  # bytes
                           'input_file']
    input_sizes['input_size'] = input_sizes['input_size'].apply(lambda x: int(x))
    input_sizes['benchmark'] = input_sizes['input_file'].apply(lambda x: str(x).split('/')[0])
    input_sizes = input_sizes.groupby('benchmark').agg({'input_size': 'sum'}).reset_index()

    df_rel_to_input = df.merge(input_sizes, on='benchmark')
    df_rel_to_input['io_chars'] = df_rel_to_input['io_chars'] / df_rel_to_input['input_size']
    df_rel_to_input['max_unique_set_size'] = df_rel_to_input['max_unique_set_size'] / df_rel_to_input['input_size']
    df_rel_to_input['time_in_shell'] = df_rel_to_input['time_in_shell'] / df_rel_to_input['input_size']
    df_rel_to_input['time_in_commands'] = df_rel_to_input['time_in_commands'] / df_rel_to_input['input_size']

    plot_benchmark_times(df)
    plot_io(df)
    plot_memory(df)

    plot_benchmark_times(df_rel_to_input, ylabel='Time per input byte',
                         ticks=([0, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001],
                                ['0', '10ns', '100ns', '1us', '10us', '100us']),
                         linthresh=0.00000001)
    plot_io(df_rel_to_input, ylabel='IO per input byte',
            ticks=([0, 1, 10, 100, 1000],
                   ['0', '1B', '10B', '100B', '1KB']),
            linthresh=1)
    plot_memory(df_rel_to_input, ylabel='Memory per input byte',
                ticks=([0, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
                       ['0', '0.001B', '0.01B', '0.1B', '1B', '10B', '100B', '1KB', '10KB']),
                linthresh=0.001)


if __name__ == '__main__':
    main(data_path)
