diff --git a/miniapps/samplesort/run.sh b/miniapps/samplesort/run.sh index 93b87c71..239d6a9a 100755 --- a/miniapps/samplesort/run.sh +++ b/miniapps/samplesort/run.sh @@ -5,15 +5,32 @@ python sample_sort_cupy.py -r 10 -w 5 -n 500000000 -gpu 3 -check 0 python sample_sort_cupy.py -r 10 -w 5 -n 500000000 -gpu 4 -check 0 echo "crosspy w thread" -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 1 -check 0 -m crosspy -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 2 -check 0 -m crosspy -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 3 -check 0 -m crosspy -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 4 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 1 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 2 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 3 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 4 -check 0 -m crosspy echo "crosspy w parla" -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 1 -check 0 -m parla -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 2 -check 0 -m parla -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 3 -check 0 -m parla -python sample_sort.py -r 10 -w 5 -n 500000000 -gpu 4 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 1 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 2 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 3 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 500000000 -gpu 4 -check 0 -m parla +echo "cupy only" +python sample_sort_cupy.py -r 10 -w 10 -n 100000000 -gpu 1 -check 0 +python sample_sort_cupy.py -r 10 -w 10 -n 200000000 -gpu 2 -check 0 +python sample_sort_cupy.py -r 10 -w 10 -n 300000000 -gpu 3 -check 0 +python sample_sort_cupy.py -r 10 -w 10 -n 400000000 -gpu 4 -check 0 + +echo "crosspy w thread" +python sample_sort.py -r 10 -w 10 -n 100000000 -gpu 1 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 200000000 -gpu 2 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 300000000 -gpu 3 -check 0 -m crosspy +python sample_sort.py -r 10 -w 10 -n 400000000 -gpu 4 -check 0 -m crosspy + +echo "crosspy w parla" +python sample_sort.py -r 10 -w 10 -n 100000000 -gpu 1 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 200000000 -gpu 2 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 300000000 -gpu 3 -check 0 -m parla +python sample_sort.py -r 10 -w 10 -n 400000000 -gpu 4 -check 0 -m parla \ No newline at end of file diff --git a/miniapps/samplesort/sample_sort.py b/miniapps/samplesort/sample_sort.py index 22e85821..97759b11 100755 --- a/miniapps/samplesort/sample_sort.py +++ b/miniapps/samplesort/sample_sort.py @@ -10,6 +10,7 @@ from threading import Thread from time import perf_counter as time from multiprocessing.pool import ThreadPool as WorkerPool +import sys class pp(enum.IntEnum): ALL = 0 @@ -130,8 +131,11 @@ def t1(): with cp.cuda.Device(i): a[i] = cp.zeros(recieve_partitions[i], dtype = sbuff.dtype) for j in range(num_gpu): - #with cp.cuda.Stream(non_blocking=True) as stream: - a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]]) + with cp.cuda.Stream(non_blocking=True) as stream: + #a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]]) + dst = a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] + src = sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]] + dst.data.copy_from_async(src.data, src.nbytes, stream=stream) cp.cuda.runtime.deviceSynchronize() @@ -161,7 +165,10 @@ def t1(i): a[i] = cp.zeros(recieve_partitions[i]) for j in range(num_gpu): with cp.cuda.Stream(non_blocking=True) as stream: - a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]]) + #a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] = cp.asarray(sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]]) + dst = a[i][roffsets[i,j] : roffsets[i,j] + rcounts[i,j]] + src = sbuff.blockview[j][soffsets[j, i] : soffsets[j, i] + scounts[j, i]] + dst.data.copy_from_async(src.data, src.nbytes, stream=stream) cp.cuda.runtime.deviceSynchronize()