|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +import fnmatch |
| 5 | +import viz_syntax as stx |
| 6 | +import viz_dynamic as dyn |
| 7 | + |
| 8 | +from all_scripts import get_all_scripts |
| 9 | +from project_root import get_project_root |
| 10 | + |
# Input artifacts consumed by this script, all joined onto the value returned
# by get_project_root().
root = get_project_root()
loc_data_path = root / "infrastructure/target/lines_of_code.csv"
input_size_path = root / "infrastructure/data/size_inputs.jsonl"
data_path = root / "infrastructure/target/dynamic_analysis.jsonl"
| 15 | + |
# Maps a benchmark name to a (domain, style, citation) triple.  The domain and
# style labels are abbreviated by short_category(); the third element is a
# LaTeX \cite command, or an empty string when there is none.
benchmark_category_style = {
    # Benchmarks that have not been classified yet share one placeholder.
    **dict.fromkeys(
        (
            "bio",
            "vps-audit",
            "vps-audit-negate",
            "aurpkg",
            "makeself",
            "infrastructure/standards/100-files",
            "infrastructure/standards/read-write",
            "infrastructure/standards/shell-memory",
            "infrastructure/standards/sleep",
            "infrastructure/standards/time-in-shell-subprocess",
            "infrastructure/standards/user-time",
            "infrastructure/standards/user-time-in-shell",
            "infrastructure/standards/write-only",
        ),
        ("XXX", "XXX", "XXX"),
    ),
    # Classified benchmarks.
    "covid-mts": ("Data analysis", "Data extraction", "\\cite{covid-mts-source}"),
    "file-enc": ("Cryptography", "Automation", "\\cite{cito2020empirical}"),
    "log-analysis": (
        "System admin.",
        "Data extraction",
        "\\cite{spinellis2017extending, raghavan2020posh}",
    ),
    "max-temp": ("Data analysis", "Data extraction", "\\cite{hadoop-guide-2009}"),
    "media-conv": (
        "Misc.",
        "Automation",
        "\\cite{spinellis2017extending, raghavan2020posh}",
    ),
    "nlp": ("Machine learning", "Text processing", "\\cite{unix-for-poets-church}"),
    "oneliners": (
        "Misc.",
        "Text processing",
        "\\cite{bentley-pearl-cacm-1985, bentley-pearl-cacm-1986, unix-cacm-1974, wicked-cool-shell-scripts}",
    ),
    "riker": ("Development", "Build scripts", ""),
    "sklearn": ("Machine learning", "Automation", ""),
    "uniq-ips": ("System admin.", "Automation", ""),
    "unix50": ("Misc.", "Text processing", "\\cite{bhandari2020solutions}"),
    "web-index": ("Development", "Text processing", "\\cite{pash2021}"),
}
| 43 | + |
def short_category(benchmark):
    """Return the abbreviated ``domain/style`` label for *benchmark*.

    The domain and style labels are looked up in ``benchmark_category_style``
    and each is reduced to the upper-cased first character of its words, so
    ``('System admin.', 'Data extraction', ...)`` becomes ``'SA/DE'``.

    Args:
        benchmark: A key of ``benchmark_category_style``.

    Returns:
        The two abbreviations joined by ``'/'``.

    Raises:
        KeyError: If *benchmark* has no entry in ``benchmark_category_style``.
    """
    domain, style, _citation = benchmark_category_style[benchmark]

    def initials(label):
        # split() without a separator discards empty tokens, so repeated,
        # leading or trailing whitespace (or an empty label) cannot cause an
        # IndexError on word[0].
        return ''.join(word[0].upper() for word in label.split())

    return initials(domain) + '/' + initials(style)
| 49 | + |
# Maps a benchmark name to a short human-readable description of its inputs;
# main() prints it after the input size in the "Inputs" column.
# Every key must appear exactly once: in a dict display a repeated key silently
# replaces the earlier value, which previously caused the 'XXX' placeholders to
# override the real descriptions of 'bio' and 'aurpkg'.
benchmark_input_description = {
    'aurpkg': 'package files',
    'bio': 'biological data files',
    'covid-mts': 'transit data',
    'file-enc': 'pcap files',
    'log-analysis': 'log files',
    'max-temp': 'temperature data',
    'media-conv': 'media files',
    'nlp': 'text files',
    'oneliners': 'text files',
    'riker': 'source code files',
    'sklearn': 'CSV files',
    'uniq-ips': 'text files',
    'unix50': 'text files',
    'web-index': 'HTML files',
    'vps-audit': 'system status',
    'vps-audit-negate': 'system status',
    # Entries below are still placeholders.
    'makeself': 'XXX',
    'infrastructure/standards/100-files': 'XXX',
    'infrastructure/standards/read-write': 'XXX',
    'infrastructure/standards/shell-memory': 'XXX',
    'infrastructure/standards/sleep': 'XXX',
    'infrastructure/standards/time-in-shell-subprocess': 'XXX',
    'infrastructure/standards/user-time': 'XXX',
    'infrastructure/standards/user-time-in-shell': 'XXX',
    'infrastructure/standards/write-only': 'XXX',
}
| 79 | + |
# fnmatch-style patterns selecting the scripts that get their own row under
# their benchmark in the generated table.
# Not listed because they consist of a single script: max-temp, sklearn,
# aurpkg, bio, vps-audit.  makeself is still undecided.
scripts_to_include = [
    "covid-mts/scripts/1.sh",
    "file-enc/scripts/encrypt_files.sh",
    "log-analysis/scripts/nginx.sh",
    "media-conv/scripts/img_convert.sh",
    "nlp/scripts/bigrams.sh",
    "oneliners/*",
    "unix50/scripts/1.sh",
    "riker/scripts/redis/build.sh",
    "web-index/scripts/ngrams.sh",
]
| 97 | + |
| 98 | + |
def count_unique_cmds(series):
    """Count the distinct entries of *series* that contain ``'command('``."""
    distinct_commands = set()
    for entry in series:
        if 'command(' in entry:
            distinct_commands.add(entry)
    return len(distinct_commands)
| 101 | + |
def count_constructs(series):
    """Return how many distinct values *series* contains."""
    distinct = set()
    distinct.update(series)
    return len(distinct)
| 104 | + |
def read_loc_data():
    """Load the lines-of-code CSV at ``loc_data_path``.

    The file is read without a header row and its columns are named
    ``script`` and ``loc``.  A ``benchmark`` column is derived from the first
    ``/``-separated component of each script path.

    Returns:
        A ``(per_script, per_benchmark)`` pair of DataFrames.  ``per_script``
        has the columns ``script``, ``loc`` and ``benchmark``;
        ``per_benchmark`` has one row per benchmark with the summed ``loc``
        and the script count in ``number_of_scripts``.
    """
    per_script = pd.read_csv(loc_data_path, header=None)
    per_script.columns = ['script', 'loc']
    per_script['benchmark'] = per_script['script'].apply(
        lambda script_path: script_path.split('/')[0]
    )

    per_benchmark = (
        per_script.groupby('benchmark')
        .agg({'loc': 'sum', 'script': 'count'})
        .reset_index()
        .rename(columns={'script': 'number_of_scripts'})
    )
    return per_script, per_benchmark
| 115 | + |
def prettify_bytes_number(n):
    """Format the byte count *n* as a LaTeX snippet with a colored unit.

    The value is scaled by powers of 1024 to B, KB, MB or GB and printed with
    two decimals below 10, one decimal below 100 and none otherwise.  The unit
    is wrapped in ``\\textcolor``: black for GB, gray for every other unit.
    """
    kib = 1024
    mib = kib * kib
    gib = mib * kib

    # Try the units from smallest to largest; the first upper bound that n
    # stays below decides the unit.  Anything that passes no test ends up in GB.
    for upper_bound, divisor, unit in ((kib, None, "B"), (mib, kib, "KB"), (gib, mib, "MB")):
        if n < upper_bound:
            value = n if divisor is None else n / divisor
            break
    else:
        value, unit = n / gib, "GB"

    decimals = 2 if value < 10 else (1 if value < 100 else 0)
    color = {"GB": "black"}.get(unit, "gray")
    return f"{value:.{decimals}f} \\textcolor{{{color}}}{{{unit}}}"
| 135 | + |
# Characters with a special meaning in LaTeX text mode and their escaped form.
_LATEX_SPECIALS = str.maketrans({
    '_': '\\_',
    '&': '\\&',
    '%': '\\%',
    '$': '\\$',
    '#': '\\#',
})


def _latex_escape(text):
    """Return *text* with LaTeX text-mode special characters backslash-escaped."""
    return str(text).translate(_LATEX_SPECIALS)


def _is_shown(script):
    """Return True when *script* matches one of the ``scripts_to_include`` patterns."""
    return any(fnmatch.fnmatch(script, pattern) for pattern in scripts_to_include)


def main():
    """Print the benchmark overview table as a LaTeX ``tabular`` on stdout.

    Syntax, dynamic-analysis and lines-of-code data are loaded, merged per
    benchmark and per script, and emitted as one bold summary row per
    benchmark followed by a row for each of its scripts that matches
    ``scripts_to_include``.  When a benchmark has more than one script and not
    all of them are shown, a ``\\ldots`` row is appended.

    Benchmark and script names are LaTeX-escaped before printing, because
    names such as ``encrypt_files.sh`` contain ``_``, which is not valid in
    LaTeX text mode.

    Raises:
        KeyError: If a benchmark in the dynamic data has no entry in
            ``benchmark_input_description`` or ``benchmark_category_style``.
    """
    # NOTE(review): the boolean passed to stx.read_data presumably selects a
    # restricted vs. complete node set -- confirm in viz_syntax.
    syntax_script, syntax_bench = stx.read_data(True)
    syntax_script_all_cmds, syntax_bench_all_cmds = stx.read_data(False)
    dyn_script, dyn_bench = dyn.read_data()
    loc_data_script, loc_data_bench = read_loc_data()

    syntax_script_all_cmds['unique_cmds'] = syntax_script_all_cmds['nodes'].apply(count_unique_cmds)
    syntax_bench_all_cmds['unique_cmds'] = syntax_bench_all_cmds['nodes'].apply(count_unique_cmds)
    syntax_script['constructs'] = syntax_script['nodes'].apply(count_constructs)
    syntax_bench['constructs'] = syntax_bench['nodes'].apply(count_constructs)

    # Debugging aid: uncomment to list scripts missing from one of the inputs.
    # all_scripts = set(syntax_script['script'].unique())

    # missing_in_dyn = all_scripts - set(dyn_script['script'].unique())
    # missing_in_loc_data = all_scripts - set(loc_data_script['script'].unique())
    # missing_in_cmds = all_scripts - set(syntax_script_all_cmds['script'].unique())

    # print("Missing in dyn_script:", missing_in_dyn)
    # print("Missing in loc_data_script:", missing_in_loc_data)
    # print("Missing in syntax_script_all_cmds:", missing_in_cmds)

    # A plain lookup (rather than Series.map) so an undescribed benchmark
    # fails loudly with a KeyError instead of silently becoming NaN.
    dyn_bench['input_description'] = dyn_bench['benchmark'].apply(lambda x: benchmark_input_description[x])

    big_bench = syntax_bench.merge(dyn_bench, on='benchmark')\
        .merge(loc_data_bench, on='benchmark')\
        .merge(syntax_bench_all_cmds[['benchmark', 'unique_cmds']], on='benchmark')

    big_script = syntax_script.merge(dyn_script, on='script')\
        .merge(loc_data_script, on='script')\
        .merge(syntax_script_all_cmds[['script', 'unique_cmds']], on='script')

    # NOTE: the header declares 13 columns, but the data rows below fill only
    # the first 11; the two "System" columns (# s/c, # fd) are left empty.
    print("""
    \\def\\idw{5em}
\\begin{tabular}{l|lrr|rr|l|rrrr|lr}
    \\toprule
\\multirow{2}{*}{Benchmark/Script} & \\multicolumn{3}{c|}{Surface} & \\multicolumn{2}{c|}{Syntax} & \\multicolumn{1}{c|}{Inputs} & \\multicolumn{4}{c|}{Dynamic} & \\multicolumn{2}{c}{System} \\\\
    & Dom & \\#.sh & LOC & \\# Cons & \\# Cmd & & T.sh & T.cmd & Mem & I/O & \\# s/c & \\# fd \\\\
    \\midrule
""")
    # Columns of every data row: benchmark/script, short category, number of
    # scripts, LOC, number of constructs, number of unique commands, input
    # description, time in shell, time in commands, max memory, I/O.
    for _, row in big_bench.iterrows():
        benchmark = row['benchmark']
        numscripts = row['number_of_scripts']
        inputs = prettify_bytes_number(row['input_size']) + ' of ' + row['input_description']

        print("\\rule{0pt}{5ex}")
        print(
            f"\\textbf{{\\tt {_latex_escape(benchmark)}}} & {short_category(benchmark)}"
            f" & {numscripts} & {row['loc']} & {row['constructs']} & {row['unique_cmds']}"
            f" & \\multirow{{2}}{{*}}{{\\parbox{{\\idw}}{{{inputs}}}}}"
            f" & {row['time_in_shell']:.2f} & {row['time_in_commands']:.2f}"
            f" & {prettify_bytes_number(row['max_unique_set_size'])}"
            f" & {prettify_bytes_number(row['io_chars'])} \\\\"
        )

        # Detail rows: only this benchmark's scripts are visited, and the
        # benchmark, category, script-count and input cells are left blank.
        numscripts_shown = 0
        bench_scripts = big_script[big_script['benchmark'] == benchmark]
        for _, row_script in bench_scripts.iterrows():
            if not _is_shown(row_script['script']):
                continue
            script_name = _latex_escape(row_script['script'].split('/')[-1])
            print(
                f"\\hspace{{0.5em}} {script_name} & & & {row_script['loc']}"
                f" & {row_script['constructs']} & {row_script['unique_cmds']} &"
                f" & {row_script['time_in_shell']:.2f} & {row_script['time_in_commands']:.2f}"
                f" & {prettify_bytes_number(row_script['max_unique_set_size'])}"
                f" & {prettify_bytes_number(row_script['io_chars'])} \\\\"
            )
            numscripts_shown += 1

        # Signal that some of the benchmark's scripts were omitted.
        if numscripts_shown < numscripts and numscripts > 1:
            print("\\hspace{0.5em} \\ldots & & & & & & & & & & \\\\")
    print("""
    \\bottomrule
    \\end{tabular}
""")
| 195 | + |
| 196 | + |
# Generate the table only when the file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments