Skip to content

Commit 54c6f01

Browse files
committed
dynamic analysis 2
1 parent 4cf0a29 commit 54c6f01

File tree

271 files changed

+8606
-297
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

271 files changed

+8606
-297
lines changed

aurpkg/run.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ mkdir -p ${OUT}
77

88
script="./scripts/pacaur.sh"
99

10+
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
11+
export BENCHMARK_CATEGORY="aurpkg"
12+
export BENCHMARK_SCRIPT="$(realpath "$script")"
13+
export BENCHMARK_INPUT_FILE="$(realpath "$IN")"
14+
1015
# Switch to user "user" to avoid permission issues
1116

1217
echo "$script"

bio/deps.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#!/bin/bash
12
# install dependencies
23
required_version="1.7"
34

@@ -43,4 +44,5 @@ else
4344
echo "Failed to install the correct version of Samtools."
4445
exit 1
4546
fi
46-
fi
47+
fi
48+

bio/input.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/bin/bash
2+
13
IN=inputs
24
IN_NAME=input.txt
35

bio/run.sh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/bin/bash
2+
13
# create bam files with regions
24
################### 1KG SAMPLES
35
IN=inputs
@@ -8,6 +10,11 @@ if [[ "$@" == *"--small"* ]]; then
810
IN_NAME=input_small.txt
911
fi
1012

13+
export BENCHMARK_CATEGORY="bio"
1114
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
1215

13-
"$BENCHMARK_SHELL" ./scripts/bio.sh "$IN" "$IN_NAME" "$OUT"
16+
script_file=./scripts/bio.sh
17+
export BENCHMARK_SCRIPT="$(realpath "$script_file")"
18+
export BENCHMARK_INPUT_FILE="$(realpath "$IN_NAME")"
19+
20+
$BENCHMARK_SHELL "$script_file" "$IN" "$IN_NAME" "$OUT"

covid-mts/run.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,15 @@ output_scoped="$outputs_dir/outputs$suffix"
1616
mkdir -p "$output_scoped"
1717

1818
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
19+
export BENCHMARK_CATEGORY="covid-mts"
20+
export BENCHMARK_INPUT_FILE="$(realpath "$input_file")"
1921

22+
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/1.sh")"
2023
$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
24+
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/2.sh")"
2125
$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
26+
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/3.sh")"
2227
$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
28+
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/4.sh")"
2329
$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"
2430

file-enc/deps.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#!/bin/bash
2+
13
sudo apt-get update
24

35
pkgs='ffmpeg unrtf imagemagick libarchive-tools libncurses5-dev libncursesw5-dev zstd liblzma-dev libbz2-dev zip unzip nodejs tcpdump'

file-enc/run.sh

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ if [[ "$1" == "--small" ]]; then
1717
suffix=".small"
1818
fi
1919

20+
export BENCHMARK_CATEGORY="file-enc"
21+
export BENCHMARK_INPUT_FILE="$(realpath "$input_pcaps")"
2022
BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
21-
$BENCHMARK_SHELL $scripts_dir/compress_files.sh $input_pcaps $results_dir/compress_files$suffix
22-
$BENCHMARK_SHELL $scripts_dir/encrypt_files.sh $input_pcaps $results_dir/encrypt_files$suffix
23+
24+
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/compress_files.sh")"
25+
$BENCHMARK_SHELL "$scripts_dir/compress_files.sh" "$input_pcaps" "$results_dir/compress_files$suffix"
26+
export BENCHMARK_SCRIPT="$(realpath "$scripts_dir/encrypt_files.sh")"
27+
$BENCHMARK_SHELL "$scripts_dir/encrypt_files.sh" "$input_pcaps" "$results_dir/encrypt_files$suffix"

infrastructure/Makefile

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1-
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh
1+
STATIC_OUTPUTS = target/lines_of_code.csv target/nodes_in_scripts.csv target/scripts_to_benchmark.csv target/cyclomatic.csv target/shellmetrics.sh target/dynamic_analysis.jsonl
22

33
static: $(STATIC_OUTPUTS)
44

5+
target/dynamic_analysis.jsonl: dynamic_analysis.py target/collect_dynamic_logs.touch target/collect_standard_dynamic_logs.touch
6+
python3 $< | sort > $@
7+
58
target/scripts_to_benchmark.csv: scripts_to_benchmark.py
69
python3 $< | sort > $@
710

@@ -24,6 +27,12 @@ target/shellmetrics.sh:
2427
target/cyclomatic.csv: get_cyclomatic.py target/shellmetrics.sh
2528
python3 get_cyclomatic.py > $@
2629

27-
dynamic:
30+
target/collect_dynamic_logs.touch:
31+
python3 collect_dynamic_logs.py
32+
touch $@
33+
34+
target/collect_standard_dynamic_logs.touch:
35+
python3 standard_run.py
36+
touch $@
2837

29-
.PHONY: static dynamic clean-static static-test
38+
.PHONY: static clean-static static-test

infrastructure/all_scripts.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,7 @@ def get_all_scripts(
1919
]
2020
for benchmark_name, benchmark_data in benchmark_data.items()
2121
}
22+
23+
if __name__ == "__main__":
    # Print one benchmark name per line; iterating the mapping yields its keys.
    for bench in get_all_scripts():
        print(bench)
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# For every benchmark listed by infrastructure/all_scripts.py: run its deps.sh,
# then its input.sh, then infrastructure/run_dynamic.py on it. Finally touch
# the collect_dynamic_logs.touch stamp under infrastructure/target.

# Without pipefail a failing all_scripts.py would be hidden behind `sort`.
set -o pipefail

REPO_TOP="$(git rev-parse --show-toplevel)"
if [ -z "$REPO_TOP" ]; then
    # An empty REPO_TOP would turn every path below into /<bench>/...
    echo "error: could not determine the repository root" >&2
    exit 1
fi

benches=$(python3 "$REPO_TOP/infrastructure/all_scripts.py" | sort) || {
    # Do not fall through to the stamp file if the benchmark list is unknown.
    echo "error: could not list benchmarks" >&2
    exit 1
}

# $benches is left unquoted on purpose: it holds one benchmark name per word.
for bench in $benches; do
    bash "$REPO_TOP/$bench/deps.sh"
done

for bench in $benches; do
    bash "$REPO_TOP/$bench/input.sh"
done

for bench in $benches; do
    python3 "$REPO_TOP/infrastructure/run_dynamic.py" "$bench"
done

# touch fails if the directory is missing, so create it first.
mkdir -p "$REPO_TOP/infrastructure/target"
touch "$REPO_TOP/infrastructure/target/collect_dynamic_logs.touch"

infrastructure/colossal_table.py

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
#!/usr/bin/env python3
2+
3+
import pandas as pd
4+
import fnmatch
5+
import viz_syntax as stx
6+
import viz_dynamic as dyn
7+
8+
from all_scripts import get_all_scripts
9+
from project_root import get_project_root
10+
11+
root = get_project_root()
12+
data_path = root / 'infrastructure/target/dynamic_analysis.jsonl'
13+
input_size_path = root / 'infrastructure/data/size_inputs.jsonl'
14+
loc_data_path = root / 'infrastructure/target/lines_of_code.csv'
15+
16+
benchmark_category_style = {
17+
'bio': ('XXX', 'XXX', 'XXX'),
18+
'vps-audit': ('XXX', 'XXX', 'XXX'),
19+
'vps-audit-negate': ('XXX', 'XXX', 'XXX'),
20+
'aurpkg': ('XXX', 'XXX', 'XXX'),
21+
'makeself': ('XXX', 'XXX', 'XXX'),
22+
'infrastructure/standards/100-files': ('XXX', 'XXX', 'XXX'),
23+
'infrastructure/standards/read-write': ('XXX', 'XXX', 'XXX'),
24+
'infrastructure/standards/shell-memory': ('XXX', 'XXX', 'XXX'),
25+
'infrastructure/standards/sleep': ('XXX', 'XXX', 'XXX'),
26+
'infrastructure/standards/time-in-shell-subprocess': ('XXX', 'XXX', 'XXX'),
27+
'infrastructure/standards/user-time': ('XXX', 'XXX', 'XXX'),
28+
'infrastructure/standards/user-time-in-shell': ('XXX', 'XXX', 'XXX'),
29+
'infrastructure/standards/write-only': ('XXX', 'XXX', 'XXX'),
30+
'covid-mts': ('Data analysis', 'Data extraction', '\\cite{covid-mts-source}'),
31+
'file-enc': ('Cryptography', 'Automation', '\\cite{cito2020empirical}'),
32+
'log-analysis': ('System admin.', 'Data extraction', '\\cite{spinellis2017extending, raghavan2020posh}'),
33+
'max-temp': ('Data analysis', 'Data extraction', '\\cite{hadoop-guide-2009}'),
34+
'media-conv': ('Misc.', 'Automation', '\\cite{spinellis2017extending, raghavan2020posh}'),
35+
'nlp': ('Machine learning', 'Text processing', '\\cite{unix-for-poets-church}'),
36+
'oneliners': ('Misc.', 'Text processing', '\\cite{bentley-pearl-cacm-1985, bentley-pearl-cacm-1986, unix-cacm-1974, wicked-cool-shell-scripts}'),
37+
'riker': ('Development', 'Build scripts', ''),
38+
'sklearn': ('Machine learning', 'Automation', ''),
39+
'uniq-ips': ('System admin.', 'Automation', ''),
40+
'unix50': ('Misc.', 'Text processing', '\\cite{bhandari2020solutions}'),
41+
'web-index': ('Development', 'Text processing', '\\cite{pash2021}')
42+
}
43+
44+
def short_category(benchmark):
45+
dom, style, _ = benchmark_category_style[benchmark]
46+
def shorten(str):
47+
return ''.join([x[0].upper() for x in str.split(' ')])
48+
return shorten(dom) + '/' + shorten(style)
49+
50+
benchmark_input_description = {
51+
'aurpkg': 'package files',
52+
'bio': 'biological data files',
53+
'covid-mts': 'transit data',
54+
'file-enc': 'pcap files',
55+
'log-analysis': 'log files',
56+
'max-temp': 'temperature data',
57+
'media-conv': 'media files',
58+
'nlp': 'text files',
59+
'oneliners': 'text files',
60+
'riker': 'source code files',
61+
'sklearn': 'CSV files',
62+
'uniq-ips': 'text files',
63+
'unix50': 'text files',
64+
'web-index': 'HTML files',
65+
'bio': 'XXX',
66+
'vps-audit': 'system status',
67+
'vps-audit-negate': 'system status',
68+
'makeself': 'XXX',
69+
'aurpkg': 'XXX',
70+
'infrastructure/standards/100-files': 'XXX',
71+
'infrastructure/standards/read-write': 'XXX',
72+
'infrastructure/standards/shell-memory': 'XXX',
73+
'infrastructure/standards/sleep': 'XXX',
74+
'infrastructure/standards/time-in-shell-subprocess': 'XXX',
75+
'infrastructure/standards/user-time': 'XXX',
76+
'infrastructure/standards/user-time-in-shell': 'XXX',
77+
'infrastructure/standards/write-only': 'XXX',
78+
}
79+
80+
# Glob patterns, matched with fnmatch against each script path in main(), that
# select which scripts get their own detail row under their benchmark.
scripts_to_include = [
81+
'covid-mts/scripts/1.sh',
82+
'file-enc/scripts/encrypt_files.sh',
83+
'log-analysis/scripts/nginx.sh',
84+
'media-conv/scripts/img_convert.sh',
85+
'nlp/scripts/bigrams.sh',
86+
'oneliners/*',
87+
'unix50/scripts/1.sh',
88+
'riker/scripts/redis/build.sh',
89+
# max-temp is just 1
90+
# sklearn is just 1
91+
# aurpkg is just 1
92+
# bio is just 1
93+
# makeself??
94+
'web-index/scripts/ngrams.sh',
95+
# vps-audit is just 1
96+
]
97+
98+
99+
def count_unique_cmds(series):
100+
return len({node for node in series if 'command(' in node})
101+
102+
def count_constructs(series):
103+
return len(set(series))
104+
105+
def read_loc_data():
106+
loc_data = pd.read_csv(loc_data_path, header=None)
107+
loc_data.columns = ['script', 'loc']
108+
loc_data['benchmark'] = loc_data['script'].apply(lambda x: x.split('/')[0])
109+
loc_data_bench = loc_data.groupby('benchmark').agg({
110+
'loc': 'sum',
111+
'script': 'count'
112+
}).reset_index()
113+
loc_data_bench.rename(columns={'script': 'number_of_scripts'}, inplace=True)
114+
return loc_data, loc_data_bench
115+
116+
def prettify_bytes_number(n):
117+
if n < 1024:
118+
value, unit = n, "B"
119+
elif n < 1024 * 1024:
120+
value, unit = n / 1024, "KB"
121+
elif n < 1024 * 1024 * 1024:
122+
value, unit = n / (1024 * 1024), "MB"
123+
else:
124+
value, unit = n / (1024 * 1024 * 1024), "GB"
125+
126+
if value < 10:
127+
decimals = 2
128+
elif value < 100:
129+
decimals = 1
130+
else:
131+
decimals = 0
132+
133+
color = 'black' if unit == 'GB' else 'gray'
134+
return f"{value:.{decimals}f} \\textcolor{{{color}}}{{{unit}}}"
135+
136+
def main():
137+
syntax_script, syntax_bench = stx.read_data(True)
138+
syntax_script_all_cmds, syntax_bench_all_cmds = stx.read_data(False)
139+
dyn_script, dyn_bench = dyn.read_data()
140+
loc_data_script, loc_data_bench = read_loc_data()
141+
142+
syntax_script_all_cmds['unique_cmds'] = syntax_script_all_cmds['nodes'].apply(count_unique_cmds)
143+
syntax_bench_all_cmds['unique_cmds'] = syntax_bench_all_cmds['nodes'].apply(count_unique_cmds)
144+
syntax_script['constructs'] = syntax_script['nodes'].apply(count_constructs)
145+
syntax_bench['constructs'] = syntax_bench['nodes'].apply(count_constructs)
146+
147+
# all_scripts = set(syntax_script['script'].unique())
148+
149+
# missing_in_dyn = all_scripts - set(dyn_script['script'].unique())
150+
# missing_in_loc_data = all_scripts - set(loc_data_script['script'].unique())
151+
# missing_in_cmds = all_scripts - set(syntax_script_all_cmds['script'].unique())
152+
153+
# print("Missing in dyn_script:", missing_in_dyn)
154+
# print("Missing in loc_data_script:", missing_in_loc_data)
155+
# print("Missing in syntax_script_all_cmds:", missing_in_cmds)
156+
157+
dyn_bench['input_description'] = dyn_bench['benchmark'].apply(lambda x: benchmark_input_description[x])
158+
159+
big_bench = syntax_bench.merge(dyn_bench, on='benchmark')\
160+
.merge(loc_data_bench, on='benchmark')\
161+
.merge(syntax_bench_all_cmds[['benchmark', 'unique_cmds']], on='benchmark')
162+
163+
big_script = syntax_script.merge(dyn_script, on='script')\
164+
.merge(loc_data_script, on='script')\
165+
.merge(syntax_script_all_cmds[['script', 'unique_cmds']], on='script')
166+
167+
168+
print("""
169+
\\def\\idw{5em}
170+
\\begin{tabular}{l|lrr|rr|l|rrrr|lr}
171+
\\toprule
172+
\\multirow{2}{*}{Benchmark/Script} & \\multicolumn{3}{c|}{Surface} & \\multicolumn{2}{c|}{Syntax} & \\multicolumn{1}{c|}{Inputs} & \\multicolumn{4}{c|}{Dynamic} & \\multicolumn{2}{c}{System} \\\\
173+
& Dom & \\#.sh & LOC & \\# Cons & \\# Cmd & & T.sh & T.cmd & Mem & I/O & \\# s/c & \\# fd \\\\
174+
\\midrule
175+
""")
176+
# generate a big latex table with the following columns:
177+
# benchmark, short_category, number of scripts, LOC, number of constructs, number of unique commands, input description, time in shell, time in commands, max memory, IO
178+
for _, row in big_bench.iterrows():
179+
numscripts_shown = 0
180+
numscripts = row['number_of_scripts']
181+
print("\\rule{0pt}{5ex}")
182+
print(f"\\textbf{{\\tt {row['benchmark']}}} & {short_category(row['benchmark'])} & {row['number_of_scripts']} & {row['loc']} & {row['constructs']} & {row['unique_cmds']} & \\multirow{{2}}{{*}}{{\\parbox{{\\idw}}{{{prettify_bytes_number(row['input_size']) + ' of ' + row['input_description']}}}}} & {row['time_in_shell']:.2f} & {row['time_in_commands']:.2f} & {prettify_bytes_number(row['max_unique_set_size'])} & {prettify_bytes_number(row['io_chars'])} \\\\")
183+
# now print the details of all scripts in the benchmark
184+
for _, row_script in big_script.iterrows():
185+
if row_script['benchmark'] == row['benchmark'] and any([fnmatch.fnmatch(row_script['script'], pattern) for pattern in scripts_to_include]):
186+
# all columns except leave blank benchmark, category, number of scripts, input description
187+
print(f"\\hspace{{0.5em}} {row_script['script'].split('/')[-1]} & & & {row_script['loc']} & {row_script['constructs']} & {row_script['unique_cmds']} & & {row_script['time_in_shell']:.2f} & {row_script['time_in_commands']:.2f} & {prettify_bytes_number(row_script['max_unique_set_size'])} & {prettify_bytes_number(row_script['io_chars'])} \\\\")
188+
numscripts_shown += 1
189+
if numscripts_shown < numscripts and numscripts > 1:
190+
print(f"\\hspace{{0.5em}} \\ldots & & & & & & & & & & \\\\")
191+
print("""
192+
\\bottomrule
193+
\\end{tabular}
194+
""")
195+
196+
197+
if __name__ == '__main__':
198+
main()

infrastructure/count_nodes_in_scripts.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
root = get_project_root()
1414
for benchmark_name, scripts in get_all_scripts().items():
1515
for script in scripts:
16+
print(script)
1617
asts = parse_shell_script(script)
1718
count = Counter()
1819
count_nodes(asts, count)

0 commit comments

Comments
 (0)