kbensh
diff --git a/aurpkg/run.sh
Lines changed: 5 additions & 0 deletions b/aurpkg/run.sh
Lines changed: 5 additions & 0 deletions
diff --git a/bio/deps.sh
Lines changed: 3 additions & 1 deletion b/bio/deps.sh
Lines changed: 3 additions & 1 deletion
diff --git a/bio/input.sh
Lines changed: 2 additions & 0 deletions b/bio/input.sh
Lines changed: 2 additions & 0 deletions
diff --git a/bio/run.sh
Lines changed: 8 additions & 1 deletion b/bio/run.sh
Lines changed: 8 additions & 1 deletion
diff --git a/covid-mts/run.sh
Lines changed: 6 additions & 0 deletions b/covid-mts/run.sh
Lines changed: 6 additions & 0 deletions
diff --git a/file-enc/deps.sh
Lines changed: 2 additions & 0 deletions b/file-enc/deps.sh
Lines changed: 2 additions & 0 deletions
diff --git a/file-enc/run.sh
Lines changed: 7 additions & 2 deletions b/file-enc/run.sh
Lines changed: 7 additions & 2 deletions
diff --git a/infrastructure/Makefile
Lines changed: 12 additions & 3 deletions b/infrastructure/Makefile
Lines changed: 12 additions & 3 deletions
diff --git a/infrastructure/all_scripts.py
Lines changed: 4 additions & 0 deletions b/infrastructure/all_scripts.py
Lines changed: 4 additions & 0 deletions
diff --git a/infrastructure/collect_dynamic_logs.sh
Lines changed: 19 additions & 0 deletions b/infrastructure/collect_dynamic_logs.sh
Lines changed: 19 additions & 0 deletions
diff --git a/infrastructure/colossal_table.py
Lines changed: 198 additions & 0 deletions b/infrastructure/colossal_table.py
Lines changed: 198 additions & 0 deletions
diff --git a/infrastructure/count_nodes_in_scripts.py
Lines changed: 1 addition & 0 deletions b/infrastructure/count_nodes_in_scripts.py
Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,11 @@ mkdir -p ${OUT}
 
 script="./scripts/pacaur.sh"
 
+BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
+export BENCHMARK_CATEGORY="aurpkg"
+export BENCHMARK_SCRIPT="$(realpath "$script")"
+export BENCHMARK_INPUT_FILE="$(realpath "$IN")"
+
 # Switch to user "user" to avoid permission issues
 
 echo "$script"
 
@@ -1,3 +1,4 @@
+#!/bin/bash
 # install dependencies
 required_version="1.7"
 
@@ -43,4 +44,5 @@ else
         echo "Failed to install the correct version of Samtools."
         exit 1
     fi
-fi
+fi
+
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 IN=inputs
 IN_NAME=input.txt
 
 
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 # create bam files with regions
 ################### 1KG SAMPLES
 IN=inputs
@@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
     IN_NAME=input_small.txt
 fi
 
+export BENCHMARK_CATEGORY="bio"
 BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
 
-"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
+script_file=./scripts/bio.sh 
+export BENCHMARK_SCRIPT="$(realpath "$script_file")"
+export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"
+
+$BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"
@@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
 mkdir -p "$output_scoped"
 
 BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
+export BENCHMARK_CATEGORY="covid-mts"
+export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"
 
+export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
 $BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
+export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
 $BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
+export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
 $BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
+export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
 $BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"
 
@@ -1,3 +1,5 @@
+#!/bin/bash
+
 sudo apt-get update
 
 pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'
 
@@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
     suffix=".small"
 fi
 
+export BENCHMARK_CATEGORY="file-enc"
+export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
 BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
-$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
-$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix
+
+export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
+$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
+export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
+$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"
@@ -1,7 +1,10 @@
-STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
+STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl
 
 static: $(STATIC_OUTPUTS)
 
+target/dynamic_analysis.jsonl: dynamic_analysis.py target/collect_dynamic_logs.touch target/collect_standard_dynamic_logs.touch
+	python3 $< | sort > $@
+
 target/scripts_to_benchmark.csv: scripts_to_benchmark.py
 	python3 $< | sort > $@
 
@@ -24,6 +27,12 @@ target/shellmetrics.sh:
 target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
 	python3 get_cyclomatic.py > $@
 
-dynamic:
+# Stamp files: each rule runs a log-collection step and then touches its
+# target so the step is not repeated while the stamp exists. Neither rule
+# lists prerequisites, so an existing stamp is never considered out of date.
+# NOTE(review): this recipe invokes collect_dynamic_logs.py, while the same
+# change set adds infrastructure/collect_dynamic_logs.sh (which also touches
+# this stamp itself) -- confirm which script is intended.
+target/collect_dynamic_logs.touch: 
+	python3 collect_dynamic_logs.py 
+	touch $@
+
+# Same stamp pattern for the run driven by standard_run.py.
+target/collect_standard_dynamic_logs.touch: 
+	python3 standard_run.py
+	touch $@
 
-.PHONY: static dynamic clean-static static-test
+.PHONY: static clean-static static-test
@@ -19,3 +19,7 @@ def get_all_scripts(
         ]
         for benchmark_name, benchmark_data in benchmark_data.items()
     }
+
+if __name__ == "__main__":
+    for bench in get_all_scripts().keys():
+        print(bench)
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Collect dynamic-analysis logs for every benchmark.
+#
+# For each benchmark name printed by infrastructure/all_scripts.py this
+# script, in three separate passes, runs the benchmark's deps.sh, then its
+# input.sh, then infrastructure/run_dynamic.py with the benchmark name as
+# argument. Finally it touches the stamp file
+# infrastructure/target/collect_dynamic_logs.touch.
+
+# Abort when we are not inside a git checkout: with an empty REPO_TOP every
+# path below would resolve against the filesystem root.
+REPO_TOP="$(git rev-parse --show-toplevel)" || exit 1
+
+# One benchmark name per line, sorted for a stable processing order.
+benches=$(python3 "$REPO_TOP/infrastructure/all_scripts.py" | sort)
+
+# $benches is deliberately left unquoted in the loop headers: word splitting
+# yields one benchmark name per iteration. The paths built from it are
+# quoted so a checkout directory containing spaces still works.
+for bench in $benches; do
+    bash "$REPO_TOP/$bench/deps.sh"
+done
+
+for bench in $benches; do
+    bash "$REPO_TOP/$bench/input.sh"
+done
+
+for bench in $benches; do
+    python3 "$REPO_TOP/infrastructure/run_dynamic.py" "$bench"
+done
+
+# Make sure the stamp directory exists; touch cannot create it.
+mkdir -p "$REPO_TOP/infrastructure/target"
+touch "$REPO_TOP/infrastructure/target/collect_dynamic_logs.touch"
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import fnmatch
+import viz_syntax as stx
+import viz_dynamic as dyn
+
+from all_scripts import get_all_scripts
+from project_root import get_project_root
+
+root = get_project_root()
+data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
+input_size_path = root / 'infrastructure/data/size_inputs.jsonl'
+loc_data_path = root / 'infrastructure/target/lines_of_code.csv'
+
+# Per-benchmark classification, keyed by benchmark name.
+# Each value is a 3-tuple. short_category() reads the first two elements as
+# the domain and the style; the third element is either a LaTeX \cite{...}
+# string or empty.
+# NOTE(review): the 'XXX' entries look like unfilled placeholders -- confirm
+# before the table is published (short_category() renders them as 'X/X').
+benchmark_category_style = {
+    'bio': ('XXX', 'XXX', 'XXX'),
+    'vps-audit': ('XXX', 'XXX', 'XXX'),
+    'vps-audit-negate': ('XXX', 'XXX', 'XXX'),
+    'aurpkg': ('XXX', 'XXX', 'XXX'),
+    'makeself': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/100-files': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/read-write': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/shell-memory': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/sleep': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/time-in-shell-subprocess': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/user-time': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/user-time-in-shell': ('XXX', 'XXX', 'XXX'),
+    'infrastructure/standards/write-only': ('XXX', 'XXX', 'XXX'),
+    'covid-mts': ('Data analysis', 'Data extraction', '\\cite{covid-mts-source}'),
+    'file-enc': ('Cryptography', 'Automation', '\\cite{cito2020empirical}'),
+    'log-analysis': ('System admin.', 'Data extraction', '\\cite{spinellis2017extending, raghavan2020posh}'),
+    'max-temp': ('Data analysis', 'Data extraction', '\\cite{hadoop-guide-2009}'),
+    'media-conv': ('Misc.', 'Automation', '\\cite{spinellis2017extending, raghavan2020posh}'),
+    'nlp': ('Machine learning', 'Text processing', '\\cite{unix-for-poets-church}'),
+    'oneliners': ('Misc.', 'Text processing', '\\cite{bentley-pearl-cacm-1985, bentley-pearl-cacm-1986, unix-cacm-1974, wicked-cool-shell-scripts}'),
+    'riker': ('Development', 'Build scripts', ''),
+    'sklearn': ('Machine learning', 'Automation', ''),
+    'uniq-ips': ('System admin.', 'Automation', ''),
+    'unix50': ('Misc.', 'Text processing', '\\cite{bhandari2020solutions}'),
+    'web-index': ('Development', 'Text processing', '\\cite{pash2021}')
+}
+
+def short_category(benchmark):
+    """Return the abbreviated 'DOMAIN/STYLE' tag for a benchmark.
+
+    The domain and style labels are looked up in benchmark_category_style
+    and each is reduced to the upper-cased initials of its words, e.g.
+    ('Data analysis', 'Data extraction') -> 'DA/DE'.
+
+    Raises KeyError if `benchmark` has no entry in benchmark_category_style.
+    """
+    domain, style, _ = benchmark_category_style[benchmark]
+
+    def shorten(label):
+        # split() without an argument drops empty words, so doubled or
+        # trailing spaces in a label cannot trigger an IndexError on word[0].
+        return ''.join(word[0].upper() for word in label.split())
+
+    return shorten(domain) + '/' + shorten(style)
+
+# Short description of each benchmark's input, keyed by benchmark name.
+# main() appends it to the input size in the "Inputs" column.
+# Every key appears exactly once: a repeated key in a dict literal silently
+# keeps only the last value, which previously replaced the 'bio' and 'aurpkg'
+# descriptions with the placeholder 'XXX'.
+benchmark_input_description = {
+    'aurpkg': 'package files',
+    'bio': 'biological data files',
+    'covid-mts': 'transit data',
+    'file-enc': 'pcap files',
+    'log-analysis': 'log files',
+    'max-temp': 'temperature data',
+    'media-conv': 'media files',
+    'nlp': 'text files',
+    'oneliners': 'text files',
+    'riker': 'source code files',
+    'sklearn': 'CSV files',
+    'uniq-ips': 'text files',
+    'unix50': 'text files',
+    'web-index': 'HTML files',
+    'vps-audit': 'system status',
+    'vps-audit-negate': 'system status',
+    'makeself': 'XXX',
+    'infrastructure/standards/100-files': 'XXX',
+    'infrastructure/standards/read-write': 'XXX',
+    'infrastructure/standards/shell-memory': 'XXX',
+    'infrastructure/standards/sleep': 'XXX',
+    'infrastructure/standards/time-in-shell-subprocess': 'XXX',
+    'infrastructure/standards/user-time': 'XXX',
+    'infrastructure/standards/user-time-in-shell': 'XXX',
+    'infrastructure/standards/write-only': 'XXX',
+}
+
+# fnmatch patterns selecting which individual scripts get their own row in
+# the table; main() matches each pattern against the script's path.
+# Benchmarks with more scripts than matched rows get a trailing "..." row.
+scripts_to_include = [
+    'covid-mts/scripts/1.sh',
+    'file-enc/scripts/encrypt_files.sh',
+    'log-analysis/scripts/nginx.sh',
+    'media-conv/scripts/img_convert.sh',
+    'nlp/scripts/bigrams.sh',
+    'oneliners/*',
+    'unix50/scripts/1.sh',
+    'riker/scripts/redis/build.sh',
+    # max-temp is just 1
+    # sklearn is just 1
+    # aurpkg is just 1
+    # bio is just 1
+    # makeself??
+    'web-index/scripts/ngrams.sh',
+    # vps-audit is just 1
+]
+
+
+def count_unique_cmds(series):
+    """Count the distinct entries of `series` that contain the text 'command('."""
+    distinct_cmds = set()
+    for entry in series:
+        if 'command(' in entry:
+            distinct_cmds.add(entry)
+    return len(distinct_cmds)
+
+def count_constructs(series):
+    """Count the distinct entries of `series`."""
+    distinct = {item for item in series}
+    return len(distinct)
+
+def read_loc_data():
+    """Load the lines-of-code CSV at loc_data_path.
+
+    The file is read without a header row and its columns are named
+    'script' and 'loc'.
+
+    Returns a pair (per_script, per_benchmark):
+      * per_script: the rows as read, plus a 'benchmark' column holding the
+        text before the first '/' of 'script';
+      * per_benchmark: one row per benchmark with the summed 'loc' and the
+        row count as 'number_of_scripts'.
+    """
+    loc_data = pd.read_csv(loc_data_path, header=None)
+    loc_data.columns = ['script', 'loc']
+    # The benchmark name is the first path component of the script path.
+    loc_data['benchmark'] = loc_data['script'].apply(lambda x: x.split('/')[0])
+    loc_data_bench = loc_data.groupby('benchmark').agg({
+        'loc': 'sum',
+        'script': 'count'
+    }).reset_index()
+    loc_data_bench.rename(columns={'script': 'number_of_scripts'}, inplace=True)
+    return loc_data, loc_data_bench
+
+def prettify_bytes_number(n):
+    """Format a byte count as a LaTeX snippet with a coloured unit.
+
+    The value is scaled to B, KB, MB or GB using 1024-based steps and printed
+    with 2 decimals below 10, 1 decimal below 100 and none otherwise. The
+    unit is wrapped in \\textcolor: black for GB, gray for every other unit.
+    """
+    for unit, scale in (("B", 1), ("KB", 1024), ("MB", 1024 ** 2)):
+        if n < scale * 1024:
+            # Plain bytes are kept as passed in; larger units are scaled down.
+            value = n if scale == 1 else n / scale
+            break
+    else:
+        # Anything that did not fit a smaller unit is reported in GB.
+        unit, value = "GB", n / (1024 ** 3)
+
+    decimals = 2 if value < 10 else (1 if value < 100 else 0)
+    color = 'gray' if unit != 'GB' else 'black'
+    return f"{value:.{decimals}f} \\textcolor{{{color}}}{{{unit}}}"
+
+def main():
+    """Print a LaTeX tabular summarising every benchmark to stdout.
+
+    Syntax, dynamic and lines-of-code data are merged per benchmark and per
+    script. Each benchmark gets a bold summary row, followed by one row for
+    each of its scripts whose path matches a pattern in scripts_to_include,
+    and a final "\\ldots" row when not all of its scripts were shown.
+    """
+    # NOTE(review): the meaning of the boolean passed to stx.read_data is not
+    # visible here; the False variant is the one used for command counts.
+    syntax_script, syntax_bench = stx.read_data(True)
+    syntax_script_all_cmds, syntax_bench_all_cmds = stx.read_data(False)
+    dyn_script,    dyn_bench = dyn.read_data()
+    loc_data_script, loc_data_bench = read_loc_data()
+
+    # Derive the "# Cmd" and "# Cons" columns from each row's node list.
+    syntax_script_all_cmds['unique_cmds'] = syntax_script_all_cmds['nodes'].apply(count_unique_cmds)
+    syntax_bench_all_cmds['unique_cmds'] = syntax_bench_all_cmds['nodes'].apply(count_unique_cmds)
+    syntax_script['constructs'] = syntax_script['nodes'].apply(count_constructs)
+    syntax_bench['constructs'] = syntax_bench['nodes'].apply(count_constructs)
+    
+    # all_scripts = set(syntax_script['script'].unique())
+    
+    # missing_in_dyn = all_scripts - set(dyn_script['script'].unique())
+    # missing_in_loc_data = all_scripts - set(loc_data_script['script'].unique())
+    # missing_in_cmds = all_scripts - set(syntax_script_all_cmds['script'].unique())
+    
+    # print("Missing in dyn_script:", missing_in_dyn)
+    # print("Missing in loc_data_script:", missing_in_loc_data)
+    # print("Missing in syntax_script_all_cmds:", missing_in_cmds)
+
+    # Raises KeyError for a benchmark without an entry in
+    # benchmark_input_description.
+    dyn_bench['input_description'] = dyn_bench['benchmark'].apply(lambda x: benchmark_input_description[x])
+
+    # merge() defaults to an inner join, so a benchmark or script missing
+    # from any one of the frames is dropped from the table.
+    big_bench = syntax_bench.merge(dyn_bench, on='benchmark')\
+        .merge(loc_data_bench, on='benchmark')\
+        .merge(syntax_bench_all_cmds[['benchmark', 'unique_cmds']], on='benchmark')
+    
+    big_script = syntax_script.merge(dyn_script, on='script')\
+        .merge(loc_data_script, on='script')\
+        .merge(syntax_script_all_cmds[['script', 'unique_cmds']], on='script')
+    
+
+    # Table preamble. The column spec declares 13 columns, while the rows
+    # printed below fill 11 cells: the two "System" columns stay empty.
+    print("""
+          \\def\\idw{5em}
+\\begin{tabular}{l|lrr|rr|l|rrrr|lr}
+    \\toprule
+\\multirow{2}{*}{Benchmark/Script} & \\multicolumn{3}{c|}{Surface} & \\multicolumn{2}{c|}{Syntax} & \\multicolumn{1}{c|}{Inputs} & \\multicolumn{4}{c|}{Dynamic} & \\multicolumn{2}{c}{System} \\\\
+                                  & Dom     & \\#.sh     & LOC    & \\# Cons       & \\# Cmd      &                             & T.sh  & T.cmd  & Mem   & I/O & \\# s/c       & \\# fd       \\\\
+    \\midrule
+""")
+    # generate a big latex table with the following columns:
+    # benchmark, short_category, number of scripts, LOC, number of constructs, number of unique commands, input description, time in shell, time in commands, max memory, IO
+    for _, row in big_bench.iterrows():
+        # Number of per-script rows actually printed for this benchmark.
+        numscripts_shown = 0
+        numscripts = row['number_of_scripts']
+        # Extra vertical space before each benchmark's summary row.
+        print("\\rule{0pt}{5ex}")
+        print(f"\\textbf{{\\tt {row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {row['constructs']} & {row['unique_cmds']} & \\multirow{{2}}{{*}}{{\\parbox{{\\idw}}{{{prettify_bytes_number(row['input_size']) + ' of ' + row['input_description']}}}}} & {row['time_in_shell']:.2f} & {row['time_in_commands']:.2f} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} \\\\")
+        # now print the details of all scripts in the benchmark
+        for _, row_script in big_script.iterrows():
+            if row_script['benchmark'] == row['benchmark'] and any([fnmatch.fnmatch(row_script['script'], pattern) for pattern in scripts_to_include]):
+                # all columns except leave blank benchmark, category, number of scripts, input description
+                print(f"\\hspace{{0.5em}} {row_script['script'].split('/')[-1]} & & & {row_script['loc']} & {row_script['constructs']} & {row_script['unique_cmds']} & & {row_script['time_in_shell']:.2f} & {row_script['time_in_commands']:.2f} & {prettify_bytes_number(row_script['max_unique_set_size'])} & {prettify_bytes_number(row_script['io_chars'])} \\\\")
+                numscripts_shown += 1
+        # Signal that some of this benchmark's scripts were left out.
+        if numscripts_shown < numscripts and numscripts > 1:
+            print(f"\\hspace{{0.5em}} \\ldots & & & & & & & & & & \\\\")
+    print("""
+    \\bottomrule
+  \\end{tabular}
+""")
+    
+
+if __name__ == '__main__':
+    main()
@@ -13,6 +13,7 @@
 root = get_project_root()
 for benchmark_name, scripts in get_all_scripts().items():
     for script in scripts:
+        print(script)
         asts = parse_shell_script(script)
         count = Counter()
         count_nodes(asts, count)
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+#!/bin/bash`
	`2`	`+`
`1`	`3`	`IN=inputs`
`2`	`4`	`IN_NAME=input.txt`
`3`	`5`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+#!/bin/bash`
	`2`	`+`
`1`	`3`	`sudo apt-get update`
`2`	`4`
`3`	`5`	`pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'`
Original file line number	Diff line number	Diff line change
`@@ -19,3 +19,7 @@ def get_all_scripts(`
`19`	`19`	`]`
`20`	`20`	`for benchmark_name, benchmark_data in benchmark_data.items()`
`21`	`21`	`}`
	`22`	`+`
	`23`	`+if __name__ == "__main__":`
	`24`	`+ for bench in get_all_scripts().keys():`
	`25`	`+ print(bench)`