Skip to content

Commit

Permalink
sample sort final
Browse files: browse the repository at this point in the history
  • Loading branch information
milindasf committed Jul 31, 2023
1 parent 0bc8f6f commit 89b9d11
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 11 deletions.
33 changes: 25 additions & 8 deletions miniapps/samplesort/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,32 @@ python sample_sort_cupy.py -r 10 -w 5 -n 500000000 -gpu 3 -check 0
python sample_sort_cupy.py -r 10 -w 5 -n 500000000 -gpu 4 -check 0

echo "crosspy w thread"
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 1 -check 0 -m crosspy
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 2 -check 0 -m crosspy
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 3 -check 0 -m crosspy
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 4 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 1 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 2 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 3 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 4 -check 0 -m crosspy

echo "crosspy w parla"
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 1 -check 0 -m parla
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 2 -check 0 -m parla
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 3 -check 0 -m parla
python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 4 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 1 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 2 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 3 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 4 -check 0 -m parla


echo "cupy only"
python sample_sort_cupy.py -r 10 -w 10 -n 100000000 -gpu 1 -check 0
python sample_sort_cupy.py -r 10 -w 10 -n 200000000 -gpu 2 -check 0
python sample_sort_cupy.py -r 10 -w 10 -n 300000000 -gpu 3 -check 0
python sample_sort_cupy.py -r 10 -w 10 -n 400000000 -gpu 4 -check 0

echo "crosspy w thread"
python sample_sort.py -r 10 -w 10 -n 100000000 -gpu 1 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 200000000 -gpu 2 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 300000000 -gpu 3 -check 0 -m crosspy
python sample_sort.py -r 10 -w 10 -n 400000000 -gpu 4 -check 0 -m crosspy

echo "crosspy w parla"
python sample_sort.py -r 10 -w 10 -n 100000000 -gpu 1 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 200000000 -gpu 2 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 300000000 -gpu 3 -check 0 -m parla
python sample_sort.py -r 10 -w 10 -n 400000000 -gpu 4 -check 0 -m parla
13 changes: 10 additions & 3 deletions miniapps/samplesort/sample_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from threading import Thread
from time import perf_counter as time
from multiprocessing.pool import ThreadPool as WorkerPool
import sys

class pp(enum.IntEnum):
ALL = 0
Expand Down Expand Up @@ -130,8 +131,11 @@ def t1():
with cp.cuda.Device(i):
a[i] = cp.zeros(recieve_partitions[i], dtype = sbuff.dtype)
for j in range(num_gpu):
#with cp.cuda.Stream(non_blocking=True) as stream:
a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]])
with cp.cuda.Stream(non_blocking=True) as stream:
#a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]])
dst = a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]]
src = sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]]
dst.data.copy_from_async(src.data, src.nbytes, stream=stream)

cp.cuda.runtime.deviceSynchronize()

Expand Down Expand Up @@ -161,7 +165,10 @@ def t1(i):
a[i] = cp.zeros(recieve_partitions[i])
for j in range(num_gpu):
with cp.cuda.Stream(non_blocking=True) as stream:
a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]])
#a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]])
dst = a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]]
src = sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]]
dst.data.copy_from_async(src.data, src.nbytes, stream=stream)

cp.cuda.runtime.deviceSynchronize()

Expand Down

0 comments on commit 89b9d11

Please sign in to comment.