Artificially merge vps-audit benchmarks for presentation

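TODO: actually merge the vps-audit-negate benchmark into vps-audit; for now the two are only combined at load time via a rename map.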
binpash · Jan 14, 2025 · 5e3cc4e · 5e3cc4e
1 parent fd639b3
commit 5e3cc4e
Showing 3 changed files with 19 additions and 14 deletions.
diff --git a/infrastructure/all_scripts.py b/infrastructure/all_scripts.py
@@ -6,11 +6,19 @@
 
 from project_root import get_project_root
 
+# TODO: delete this map once vps-audit-negate is actually merged into vps-audit
+benchmark_rename_map = {
+    'vps-audit-negate': 'vps-audit'
+}
+
 def get_all_scripts(
     scripts_file: Path = get_project_root() / 'infrastructure/data/script-globs.json'
 ) -> list[Path]:
     scripts = scripts_file.read_text()
     benchmark_data: dict[str, dict[str, any]] = json.loads(scripts)
+    # TODO: delete this loop once vps-audit-negate is actually merged into vps-audit
+    for old, new in benchmark_rename_map.items():
+        benchmark_data[new]["scripts"] = benchmark_data[new]["scripts"] + benchmark_data.pop(old)["scripts"]
     return {
         benchmark_name: [
             script
@@ -23,3 +31,4 @@ def get_all_scripts(
 if __name__ == "__main__":
     for bench in get_all_scripts().keys():
         print(bench)
+    print(get_all_scripts())
diff --git a/infrastructure/colossal_table.py b/infrastructure/colossal_table.py
@@ -6,7 +6,7 @@
 import viz.dynamic as dyn
 import sys
 
-from all_scripts import get_all_scripts
+from all_scripts import get_all_scripts, benchmark_rename_map
 from project_root import get_project_root
 
 root = get_project_root()
@@ -111,19 +111,10 @@ def citation(benchmark):
     # aurpkg is just 1
     # bio is just 1
     'web-index/scripts/ngrams.sh',
-    # vps-audit is just 1
+    'vps-audit/scripts/vps-audit.sh',
     'makeself/makeself/test/lsmtest/lsmtest.sh'
 ]
 
-def benchmark_name(benchmark):
-    benchmark_name_map = {
-            'vps-audit-negate': 'vps-audit-n'
-    }
-    if benchmark in benchmark_name_map:
-        return benchmark_name_map[benchmark]
-    else:
-        return benchmark
-
 def script_name(script):
     script_name_map = {
             "encrypt_files.sh": "encrypt.sh",
@@ -154,7 +145,8 @@ def count_constructs(series):
 def read_loc_data():
     loc_data = pd.read_csv(loc_data_path, header=None)
     loc_data.columns = ['script', 'loc']
-    loc_data['benchmark'] = loc_data['script'].apply(lambda x: x.split('/')[0])
+    map_df = stx.get_map_df()
+    loc_data = loc_data.merge(map_df, on='script')
     loc_data_bench = loc_data.groupby('benchmark').agg({
         'loc': 'sum',
         'script': 'count'
@@ -279,7 +271,7 @@ def main():
     for _, row in big_bench.iterrows():
         numscripts_shown = 0
         numscripts = row['number_of_scripts']
-        print(f"\\bs{{{benchmark_name(row['benchmark'])}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {make_input_description(row)}  & {row['constructs']} & {row['unique_cmds']} & {format_number(row['time_in_shell'])} & {format_number(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {row['sys_calls']} & {row['file_descriptors']} & {citation(row['benchmark'])} \\\\")
+        print(f"\\bs{{{row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {make_input_description(row)}  & {row['constructs']} & {row['unique_cmds']} & {format_number(row['time_in_shell'])} & {format_number(row['time_in_commands'])} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} & {row['sys_calls']} & {row['file_descriptors']} & {citation(row['benchmark'])} \\\\")
         # now print the details of all scripts in the benchmark
         for _, row_script in big_script.iterrows():
             if row_script['benchmark'] == row['benchmark'] and any([fnmatch.fnmatch(row_script['script'], pattern) for pattern in scripts_to_include]):
@@ -309,6 +301,10 @@ def round_whole(numstr):
   \\end{tabular}
 """)
 
+    print('time', file=sys.stderr)
+    print(agg_order, file=sys.stderr)
+    print([format_value(v) for v in big_bench['time'].agg(agg_order).values], file=sys.stderr)
+
 
 if __name__ == '__main__':
     main()
diff --git a/infrastructure/viz/syntax.py b/infrastructure/viz/syntax.py
@@ -141,7 +141,7 @@ def node_heatmap(df, outdir=None):
 
     plt.figure(figsize=(5.5, 6))
     sns.heatmap(heatmap_data, 
-                cmap='Greys',
+                cmap='Reds',
                 annot=annot_data, 
                 fmt='', 
                 cbar_kws={'label': 'Occurrences (* denotes more than 5)',