From 0b3d1d6cc0eea1fa6080442d7d6d3579ea580334 Mon Sep 17 00:00:00 2001 From: Lukas Lazarek Date: Tue, 14 Jan 2025 17:29:25 +0200 Subject: [PATCH] Add final syscall numbers to colossal table --- infrastructure/colossal_table.py | 32 +++++++++++++++++--------- infrastructure/data/no_of_syscalls.csv | 17 ++++++++++++++ infrastructure/data/sys_summary.csv | 14 ----------- 3 files changed, 38 insertions(+), 25 deletions(-) create mode 100644 infrastructure/data/no_of_syscalls.csv delete mode 100644 infrastructure/data/sys_summary.csv diff --git a/infrastructure/colossal_table.py b/infrastructure/colossal_table.py index 9886ecb9..12dc9995 100644 --- a/infrastructure/colossal_table.py +++ b/infrastructure/colossal_table.py @@ -13,12 +13,12 @@ data_path = root / 'infrastructure/target/dynamic_analysis.jsonl' input_size_path = root / 'infrastructure/data/size_inputs.jsonl' loc_data_path = root / 'infrastructure/target/lines_of_code.csv' -csv_file_path = root / 'infrastructure/data/sys_summary.csv' +syscall_data_path = root / 'infrastructure/data/no_of_syscalls.csv' def read_sys_results(): - csv_data = pd.read_csv(csv_file_path) - csv_data.rename(columns={'Benchmark': 'benchmark', 'Sys Calls': 'sys_calls', 'File Descriptors': 'file_descriptors'}, inplace=True) - return csv_data + df = pd.read_csv(syscall_data_path) + df.rename(columns={'Benchmark-(small)': 'benchmark', 'System Calls': 'sys_calls'}, inplace=True) + return df benchmark_category_style = { 'bio': ('Automation', 'Biology', '\\cite{Cappellini2019,puritz2019bio594}'), @@ -158,11 +158,11 @@ def prettify_bytes_number(n): if n < 1024: value, unit = n, "B" elif n < 1024 * 1024: - value, unit = n / 1024, "K" + value, unit = n / 1024, "KB" elif n < 1024 * 1024 * 1024: - value, unit = n / (1024 * 1024), "M" + value, unit = n / (1024 * 1024), "MB" else: - value, unit = n / (1024 * 1024 * 1024), "G" + value, unit = n / (1024 * 1024 * 1024), "GB" if value < 10: decimals = 2 @@ -171,9 +171,19 @@ def prettify_bytes_number(n): else: decimals = 0 - color = 'black' if unit == 'G' else 'gray' + color = 'black' if unit == 'GB' else 'gray' return f"{value:.{decimals}f}\\textcolor{{{color}}}{{{unit}}}" +def prettify_big_count(n): + if n < 1000: + return str(n) + elif n >= 1000 and n < 10**6: + return f"{(n/1000):.1f}k" + elif n >= 10**6 and n < 10**9: + return f"{n/(10**6):.1f}m" + else: + return f"{n/(10**9):.1f}g" + def make_input_description(row): if row['input_description']: desc = prettify_bytes_number(row['input_size']) # + ' of ' + row['input_description'] @@ -194,7 +204,7 @@ def main(): # fill any benchmarks missing from sys_results for benchmark in syntax_bench['benchmark']: if benchmark not in sys_results['benchmark'].values: - new_row = pd.DataFrame([{'benchmark': benchmark, 'sys_calls': '\\xxx', 'file_descriptors': '\\xxx'}]) + new_row = pd.DataFrame([{'benchmark': benchmark, 'sys_calls': '\\xxx'}]) sys_results = pd.concat([sys_results, new_row], ignore_index=True) sys_results.reset_index(drop=True, inplace=True) # replace sys_results file_descriptors numbers with those from children_num_fds in dyn_bench @@ -271,7 +281,7 @@ def main(): for _, row in big_bench.iterrows(): numscripts_shown = 0 numscripts = row['number_of_scripts'] - print(f"\\bs{{{row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {make_input_description(row)} & {row['constructs']} & {row['unique_cmds']} & {format_number(row['time_in_shell'])} & {format_number(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {row['sys_calls']} & {row['file_descriptors']} & {citation(row['benchmark'])} \\\\") + print(f"\\bs{{{row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {make_input_description(row)} & {row['constructs']} & {row['unique_cmds']} & {format_number(row['time_in_shell'])} & {format_number(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {prettify_big_count(row['sys_calls'])} & {row['file_descriptors']} & {citation(row['benchmark'])} \\\\") # now print the details of all scripts in the benchmark for _, row_script in big_script.iterrows(): if row_script['benchmark'] == row['benchmark'] and any([fnmatch.fnmatch(row_script['script'], pattern) for pattern in scripts_to_include]): @@ -294,7 +304,7 @@ def round_whole(numstr): return round_whole(f"{value:.1f}") if isinstance(value, float) else f"{int(value)}" return value # For non-numeric values - print(f"{{\\textbf{{\\centering {row['benchmark']}}}}} & & {format_value(row['number_of_scripts'])} & {format_value(row['loc'])} & {make_input_description(row):s} & {format_value(row['constructs'])} & {format_value(row['unique_cmds'])} & {format_value(row['time_in_shell'])} & {format_value(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {format_value(row['sys_calls'])} & {format_value(row['file_descriptors'])} & \\\\") + print(f"{{\\textbf{{\\centering {row['benchmark']}}}}} & & {format_value(row['number_of_scripts'])} & {format_value(row['loc'])} & {make_input_description(row):s} & {format_value(row['constructs'])} & {format_value(row['unique_cmds'])} & {format_value(row['time_in_shell'])} & {format_value(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {prettify_big_count(row['sys_calls'])} & {format_value(row['file_descriptors'])} & \\\\") print(""" \\bottomrule diff --git a/infrastructure/data/no_of_syscalls.csv b/infrastructure/data/no_of_syscalls.csv new file mode 100644 index 00000000..cfe04083 --- /dev/null +++ b/infrastructure/data/no_of_syscalls.csv @@ -0,0 +1,17 @@ +Benchmark-(small),System Calls +aurpkg,881299 +bio,132714 +covid-mts,30258 +file-enc,241927 +log-analysis,116448888 +makeself,2212019 +max-temp,1576 +media-conv,794398 +nlp,1928929 +oneliners,1675820 +riker,2579121 +sklearn,2403510 +unix50,48667 +vps-audit,362870 +vps-audit-negate,362878 +web-index,1544549 \ No newline at end of file diff --git a/infrastructure/data/sys_summary.csv b/infrastructure/data/sys_summary.csv deleted file mode 100644 index 908a0069..00000000 --- a/infrastructure/data/sys_summary.csv +++ /dev/null @@ -1,14 +0,0 @@ -Benchmark,Sys Calls,File Descriptors -aurpkg,207,0 -covid-mts,252,0 -file-enc,246,13 -log-analysis,225,13 -makeself,1237,13 -max-temp,202,0 -media-conv,225,13 -nlp,2054,15 -oneliners,677,13 -riker,254,13 -unix50,1556,0 -vps-audit,207,13 -vps-audit-negate,207,13