From 0b3d1d6cc0eea1fa6080442d7d6d3579ea580334 Mon Sep 17 00:00:00 2001
From: Lukas Lazarek <LukasALazarek@gmail.com>
Date: Tue, 14 Jan 2025 17:29:25 +0200
Subject: [PATCH] Add final syscall numbers to colossal table

---
 infrastructure/colossal_table.py       | 32 +++++++++++++++++---------
 infrastructure/data/no_of_syscalls.csv | 17 ++++++++++++++
 infrastructure/data/sys_summary.csv    | 14 -----------
 3 files changed, 38 insertions(+), 25 deletions(-)
 create mode 100644 infrastructure/data/no_of_syscalls.csv
 delete mode 100644 infrastructure/data/sys_summary.csv

diff --git a/infrastructure/colossal_table.py b/infrastructure/colossal_table.py
index 9886ecb9..12dc9995 100644
--- a/infrastructure/colossal_table.py
+++ b/infrastructure/colossal_table.py
@@ -13,12 +13,12 @@
 data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
 input_size_path = root / 'infrastructure/data/size_inputs.jsonl'
 loc_data_path = root / 'infrastructure/target/lines_of_code.csv'
-csv_file_path = root / 'infrastructure/data/sys_summary.csv'
+syscall_data_path = root / 'infrastructure/data/no_of_syscalls.csv'
 
 def read_sys_results():
-    csv_data = pd.read_csv(csv_file_path)
-    csv_data.rename(columns={'Benchmark': 'benchmark', 'Sys Calls': 'sys_calls', 'File Descriptors': 'file_descriptors'}, inplace=True)
-    return csv_data
+    syscall_counts = pd.read_csv(syscall_data_path)  # load the per-benchmark syscall CSV
+    renamed_columns = {'Benchmark-(small)': 'benchmark', 'System Calls': 'sys_calls'}
+    return syscall_counts.rename(columns=renamed_columns)  # CSV headers -> names used by the table code
 
 benchmark_category_style = {
     'bio': ('Automation', 'Biology', '\\cite{Cappellini2019,puritz2019bio594}'),
@@ -158,11 +158,11 @@ def prettify_bytes_number(n):
     if n < 1024:
         value, unit = n, "B"
     elif n < 1024 * 1024:
-        value, unit = n / 1024, "K"
+        value, unit = n / 1024, "KB"
     elif n < 1024 * 1024 * 1024:
-        value, unit = n / (1024 * 1024), "M"
+        value, unit = n / (1024 * 1024), "MB"
     else:
-        value, unit = n / (1024 * 1024 * 1024), "G"
+        value, unit = n / (1024 * 1024 * 1024), "GB"
     
     if value < 10:
         decimals = 2
@@ -171,9 +171,19 @@ def prettify_bytes_number(n):
     else:
         decimals = 0
     
-    color = 'black' if unit == 'G' else 'gray'
+    color = 'black' if unit == 'GB' else 'gray'
     return f"{value:.{decimals}f}\\textcolor{{{color}}}{{{unit}}}"
 
+def prettify_big_count(n):
+    """Abbreviate a count as '1.5k' / '2.2m' / '1.0g'; values below 1000 are printed as-is."""
+    # Benchmarks missing from the syscall CSV carry the string placeholder '\\xxx': pass it through.
+    if isinstance(n, str):
+        return n
+    for divisor, suffix in ((10**9, 'g'), (10**6, 'm'), (1000, 'k')):
+        if n >= divisor:
+            return f"{n / divisor:.1f}{suffix}"
+    return str(n)
+
 def make_input_description(row):
     if row['input_description']:
         desc = prettify_bytes_number(row['input_size']) # + ' of ' + row['input_description']
@@ -194,7 +204,7 @@ def main():
     # fill any benchmarks missing from sys_results
     for benchmark in syntax_bench['benchmark']:
         if benchmark not in sys_results['benchmark'].values:
-            new_row = pd.DataFrame([{'benchmark': benchmark, 'sys_calls': '\\xxx', 'file_descriptors': '\\xxx'}])
+            new_row = pd.DataFrame([{'benchmark': benchmark, 'sys_calls': '\\xxx'}])
             sys_results = pd.concat([sys_results, new_row], ignore_index=True)
     sys_results.reset_index(drop=True, inplace=True)
     # replace sys_results file_descriptors numbers with those from children_num_fds in dyn_bench
@@ -271,7 +281,7 @@ def main():
     for _, row in big_bench.iterrows():
         numscripts_shown = 0
         numscripts = row['number_of_scripts']
-        print(f"\\bs{{{row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {make_input_description(row)}  & {row['constructs']} & {row['unique_cmds']} & {format_number(row['time_in_shell'])} & {format_number(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {row['sys_calls']} & {row['file_descriptors']} & {citation(row['benchmark'])} \\\\")
+        print(f"\\bs{{{row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {make_input_description(row)}  & {row['constructs']} & {row['unique_cmds']} & {format_number(row['time_in_shell'])} & {format_number(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {prettify_big_count(row['sys_calls'])} & {row['file_descriptors']} & {citation(row['benchmark'])} \\\\")
         # now print the details of all scripts in the benchmark
         for _, row_script in big_script.iterrows():
             if row_script['benchmark'] == row['benchmark'] and any([fnmatch.fnmatch(row_script['script'], pattern) for pattern in scripts_to_include]):
@@ -294,7 +304,7 @@ def round_whole(numstr):
                 return round_whole(f"{value:.1f}") if isinstance(value, float) else f"{int(value)}"
             return value  # For non-numeric values
 
-        print(f"{{\\textbf{{\\centering {row['benchmark']}}}}} & & {format_value(row['number_of_scripts'])} & {format_value(row['loc'])} & {make_input_description(row):s} & {format_value(row['constructs'])} & {format_value(row['unique_cmds'])} & {format_value(row['time_in_shell'])} & {format_value(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {format_value(row['sys_calls'])} & {format_value(row['file_descriptors'])} & \\\\")
+        print(f"{{\\textbf{{\\centering {row['benchmark']}}}}} & & {format_value(row['number_of_scripts'])} & {format_value(row['loc'])} & {make_input_description(row):s} & {format_value(row['constructs'])} & {format_value(row['unique_cmds'])} & {format_value(row['time_in_shell'])} & {format_value(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {prettify_big_count(row['sys_calls'])} & {format_value(row['file_descriptors'])} & \\\\")
 
     print("""
     \\bottomrule
diff --git a/infrastructure/data/no_of_syscalls.csv b/infrastructure/data/no_of_syscalls.csv
new file mode 100644
index 00000000..cfe04083
--- /dev/null
+++ b/infrastructure/data/no_of_syscalls.csv
@@ -0,0 +1,17 @@
+Benchmark-(small),System Calls
+aurpkg,881299
+bio,132714
+covid-mts,30258
+file-enc,241927
+log-analysis,116448888
+makeself,2212019
+max-temp,1576
+media-conv,794398
+nlp,1928929
+oneliners,1675820
+riker,2579121
+sklearn,2403510
+unix50,48667
+vps-audit,362870
+vps-audit-negate,362878
+web-index,1544549
\ No newline at end of file
diff --git a/infrastructure/data/sys_summary.csv b/infrastructure/data/sys_summary.csv
deleted file mode 100644
index 908a0069..00000000
--- a/infrastructure/data/sys_summary.csv
+++ /dev/null
@@ -1,14 +0,0 @@
-Benchmark,Sys Calls,File Descriptors
-aurpkg,207,0
-covid-mts,252,0
-file-enc,246,13
-log-analysis,225,13
-makeself,1237,13
-max-temp,202,0
-media-conv,225,13
-nlp,2054,15
-oneliners,677,13
-riker,254,13
-unix50,1556,0
-vps-audit,207,13
-vps-audit-negate,207,13