Changes in GNU parallel benchmarks and added Shark
Geoka1 committed Jan 10, 2025
1 parent 70778dc commit 5eb8c40
Showing 1,163 changed files with 18,044 additions and 145 deletions.
15 changes: 10 additions & 5 deletions infrastructure/systems/GNU-Parallel/covid-mts/run.sh
@@ -15,10 +15,15 @@ input_file="$input_dir/in$suffix.csv"
output_scoped="$outputs_dir/outputs$suffix"
mkdir -p "$output_scoped"

-BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}
+file_size=$(du -b "$input_file" | awk '{print $1}')
+nproc=$(nproc)
+chunk_size=$((file_size / nproc))

+export chunk_size

-$BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
-$BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
-$BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
-$BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"
+BENCHMARK_SHELL=${BENCHMARK_SHELL:-bash}

+time $BENCHMARK_SHELL "$scripts_dir/1.sh" "$input_file" > "$output_scoped/1.out"
+time $BENCHMARK_SHELL "$scripts_dir/2.sh" "$input_file" > "$output_scoped/2.out"
+time $BENCHMARK_SHELL "$scripts_dir/3.sh" "$input_file" > "$output_scoped/3.out"
+time $BENCHMARK_SHELL "$scripts_dir/4.sh" "$input_file" > "$output_scoped/4.out"
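The run.sh change above moves the block-size calculation out of the individual queries: the driver measures the input once with du -b, divides by the core count, and exports chunk_size so the scripts below can pass it to parallel --pipe --block. A minimal standalone sketch of that pattern, with a hypothetical input.csv and a stand-in worker rather than the benchmark's own process_chunk:

    #!/bin/bash
    # Size the --pipe blocks so each core gets roughly one block of the input.
    input=input.csv                                   # hypothetical input file
    file_size=$(du -b "$input" | awk '{print $1}')    # input size in bytes (GNU du)
    nproc=$(nproc)
    chunk_size=$((file_size / nproc))                 # assumes the input is at least nproc bytes
    export chunk_size

    work() { tr 'a-z' 'A-Z'; }                        # stand-in worker; reads one block on stdin
    export -f work

    cat "$input" | parallel --pipe --block "$chunk_size" -j "$nproc" work > out.txt

Exporting chunk_size is what lets scripts/1.sh-5.sh drop their per-script line counting and file splitting, as the following hunks show.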
27 changes: 9 additions & 18 deletions infrastructure/systems/GNU-Parallel/covid-mts/scripts/1.sh
@@ -25,27 +25,18 @@
INPUT="$1"

process_chunk() {
-sed 's/T..:..:..//' "$1" |
+sed 's/T..:..:..//'|
cut -d ',' -f 1,3
}
export -f process_chunk

-lines=$(wc -l < "$1")
nproc=$(nproc)
-chunk_size=$((lines / nproc))
+tmp_dir=$(mktemp -d)
+trap "rm -rf $tmp_dir" EXIT

+cat "$INPUT" | parallel --pipe --block "$chunk_size" -j "$nproc" process_chunk > "$tmp_dir/combined.tmp"

-split -l "$chunk_size" "$INPUT" chunk_
-ls chunk_* | parallel -j "$(nproc)" process_chunk > combined.tmp

-cat combined.tmp |
-sort -u | # global deduplication
-cut -d ',' -f 1 | # keep all dates
-sort | # preparing for uniq
-uniq -c | # count unique dates
-awk '{print $2,$1}' # print first date, then count

-# Clean up temporary files
-rm chunk_*
-rm combined.tmp
+sort -u "$tmp_dir/combined.tmp" |
+cut -d ',' -f 1 |
+sort |
+uniq -c |
+awk '{print $2,$1}'
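On a made-up record (illustrative only; the real column layout comes from the covid-mts dataset), the new per-chunk step of 1.sh strips the time of day and keeps fields 1 and 3:

    echo '2020-10-01T07:45:00,X,145' | sed 's/T..:..:..//' | cut -d ',' -f 1,3
    # -> 2020-10-01,145

The driver pipeline then deduplicates those date/field-3 pairs with sort -u — field 3 is labeled the bus ID in 3.sh's comments — and counts them per date with uniq -c, matching the comments on the removed lines.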
20 changes: 6 additions & 14 deletions infrastructure/systems/GNU-Parallel/covid-mts/scripts/2.sh
@@ -26,28 +26,20 @@
INPUT="$1"

process_chunk() {
-local chunk="$1"
-sed 's/T..:..:..//' "$chunk" |
+sed 's/T..:..:..//'|
cut -d ',' -f 3,1
}
export -f process_chunk

-lines=$(wc -l < "$1")
nproc=$(nproc)
-chunk_size=$((lines / nproc))
+tmp_dir=$(mktemp -d)
+trap "rm -rf $tmp_dir" EXIT

+cat "$INPUT" |
+parallel --pipe --block "$chunk_size" -j "$nproc" process_chunk > "$tmp_dir/combined.tmp"

-split -l "$chunk_size" "$INPUT" chunk_
-ls chunk_* | parallel -j "$(nproc)" process_chunk > combined.tmp

-# Combine and process the results sequentially
-cat combined.tmp |
-sort -u | # global deduplication
+sort -u "$tmp_dir/combined.tmp" |
cut -d ',' -f 2 |
sort |
uniq -c |
sort -k 1 -n |
awk '{print $2,$1}'

-rm chunk_*
-rm combined.tmp
32 changes: 12 additions & 20 deletions infrastructure/systems/GNU-Parallel/covid-mts/scripts/3.sh
@@ -26,28 +26,20 @@
INPUT="$1"

process_chunk() {
-local chunk="$1"
-sed 's/T\(..\):..:../,\1/' "$chunk" | # keep times only
-cut -d ',' -f 1,2,4 # keep only time, date, and bus ID
+sed 's/T\(..\):..:../,\1/' |
+cut -d ',' -f 1,2,4
}
export -f process_chunk

-lines=$(wc -l < "$1")
nproc=$(nproc)
-chunk_size=$((lines / nproc))
+tmp_dir=$(mktemp -d)
+trap "rm -rf $tmp_dir" EXIT

+cat "$INPUT" |
+parallel --pipe --block "$chunk_size" -j "$nproc" process_chunk > "$tmp_dir/combined.tmp"

-split -l "$chunk_size" "$INPUT" chunk_

-ls chunk_* | parallel -j "$(nproc)" process_chunk > combined.tmp

-cat combined.tmp |
-sort -u | # global deduplication
-cut -d ',' -f 3 | # keep only bus ID
-sort | # prepare for counting
-uniq -c | # count hours per bus
-sort -k 1 -n | # sort in numerical order
-awk '{print $2,$1}' # print bus ID, then count

-rm chunk_*
-rm combined.tmp
+sort -u "$tmp_dir/combined.tmp" |
+cut -d ',' -f 3 |
+sort |
+uniq -c |
+sort -k 1 -n |
+awk '{print $2,$1}'
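3.sh keeps the hour instead of discarding the whole timestamp: the sed turns the T-separated time into an extra comma field and cut keeps date, hour, and bus ID (field names per the comments on the removed lines). On the same made-up record as above:

    echo '2020-10-01T07:45:00,X,145' | sed 's/T\(..\):..:../,\1/' | cut -d ',' -f 1,2,4
    # -> 2020-10-01,07,145

sort -u then deduplicates the triples before uniq -c and sort -k 1 -n count and rank hours per bus.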
29 changes: 11 additions & 18 deletions infrastructure/systems/GNU-Parallel/covid-mts/scripts/4.sh
@@ -26,26 +26,19 @@ INPUT="$1"

process_chunk() {
-local chunk="$1"
-sed 's/T\(..\):..:../,\1/' "$chunk" | # keep times only
-cut -d ',' -f 1,2 # keep only time and date
+sed 's/T\(..\):..:../,\1/'|
+cut -d ',' -f 1,2
}
export -f process_chunk

-lines=$(wc -l < "$1")
nproc=$(nproc)
-chunk_size=$((lines / nproc))
+tmp_dir=$(mktemp -d)
+trap "rm -rf $tmp_dir" EXIT

+cat "$INPUT" |
+parallel --pipe --block "$chunk_size" -j "$nproc" process_chunk > "$tmp_dir/combined.tmp"

-split -l "$chunk_size" "$INPUT" chunk_

-ls chunk_* | parallel -j "$(nproc)" process_chunk > combined.tmp

-cat combined.tmp |
-sort -u | # global deduplication
-cut -d ',' -f 1 | # keep only date
-sort | # prepare for counting
-uniq -c | # count unique dates
-awk '{print $2,$1}' # print date, then count

-rm chunk_*
-rm combined.tmp
+sort -u "$tmp_dir/combined.tmp" |
+cut -d ',' -f 1 |
+sort |
+uniq -c |
+awk '{print $2,$1}'
13 changes: 5 additions & 8 deletions infrastructure/systems/GNU-Parallel/covid-mts/scripts/5.sh
@@ -31,11 +31,11 @@ process_chunk() {

export -f process_chunk

-# Split the input file into chunks
-split -l 10000 "$INPUT" chunk_
+tmp_dir=$(mktemp -d)
+trap "rm -rf $tmp_dir" EXIT

-# Process each chunk in parallel and combine results
-ls chunk_* | parallel -j "$(nproc)" process_chunk > combined.tmp
+cat "$INPUT" | parallel --pipe --block "$chunk_size" -j "$(nproc)" process_chunk > "$tmp_dir/combined.tmp"

# Aggregate results globally
awk '
@@ -61,8 +61,5 @@ END {
printf("%d\t", hours[d " " b] ? hours[d " " b] : 0);
printf("\n");
}
-}' combined.tmp > out

-# Clean up temporary files
-rm chunk_*
-rm combined.tmp
+}' "$tmp_dir/combined.tmp" > out

2 changes: 1 addition & 1 deletion infrastructure/systems/GNU-Parallel/nlp/run.sh
@@ -53,6 +53,6 @@ while IFS= read -r script; do
mkdir -p "$output_dir"

echo "$script"
-$BENCHMARK_SHELL "$script_file" "$output_dir"
+time $BENCHMARK_SHELL "$script_file" "$output_dir"
echo "$?"
done <<< "$script_names"
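One side note on the time wrappers added here and in covid-mts/run.sh: bash's time keyword writes its report to stderr, separate from whatever the scripts print or redirect, so the timings are not mixed into the script output. To collect them in a file, the call could be grouped — a sketch with a hypothetical timings.txt:

    { time $BENCHMARK_SHELL "$script_file" "$output_dir" ; } 2>> timings.txt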
3 changes: 0 additions & 3 deletions infrastructure/systems/GNU-Parallel/nlp/scripts/bigrams.sh
@@ -49,7 +49,4 @@ export -f pure_func

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" pure_func {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -51,7 +51,5 @@ export -f pure_func

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" pure_func {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -59,7 +59,4 @@ export -f pure_func

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" pure_func {} "${IN}" "${INPUT2}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -38,8 +38,5 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"

@@ -35,7 +35,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -60,7 +60,4 @@ export -f pure_func

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" pure_func {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -35,7 +35,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -34,7 +34,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -43,6 +43,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-rm -f file_list.txt

echo "done"
@@ -37,6 +37,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-rm -f file_list.txt

echo "done"
2 changes: 0 additions & 2 deletions infrastructure/systems/GNU-Parallel/nlp/scripts/sort.sh
@@ -38,6 +38,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-rm -f file_list.txt

echo "done"
@@ -36,6 +36,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-rm -f file_list.txt

echo "done"
@@ -59,7 +59,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -38,7 +38,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -37,6 +37,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-rm -f file_list.txt

echo "done"
@@ -36,6 +36,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-rm -f file_list.txt

echo "done"
@@ -62,7 +62,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -34,8 +34,5 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"

@@ -38,7 +38,4 @@ export -f process_file

ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -37,7 +37,4 @@ export -f process_file
# Use GNU Parallel to process files concurrently
ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"
@@ -36,7 +36,4 @@ export -f process_file
# Use GNU Parallel to process files concurrently
ls "${IN}" | head -n "${ENTRIES}" | parallel -j "$(nproc)" process_file {} "${IN}" "${OUT}"

-# Cleanup
-rm -f file_list.txt

echo "done"