Skip to content

Commit

Permalink
global memory bitonic sort
Browse files Browse the repository at this point in the history
  • Loading branch information
AndrewBoessen committed Jan 21, 2025
1 parent b2cc210 commit 7bf42aa
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 2 deletions.
10 changes: 8 additions & 2 deletions makefile → Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,17 @@ CUDA_PATH = /opt/cuda
INCLUDES = -I$(CUDA_PATH)/include
LDFLAGS = -L$(CUDA_PATH)/lib64 -lcudart

all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort
all: cpu_bitonic_sort warp_bitonic_sort smem_bitonic_sort global_bitonic_sort

warp_bitonic_sort: main.o warp_bitonic_sort.o
$(CXX) $^ -o $@ $(LDFLAGS)

smem_bitonic_sort: main.o smem_bitonic_sort.o
$(CXX) $^ -o $@ $(LDFLAGS)

global_bitonic_sort: main.o global_bitonic_sort.o
$(CXX) $^ -o $@ $(LDFLAGS)

cpu_bitonic_sort: cpu_bitonic_sort.cpp
$(CXX) $^ -o $@

Expand All @@ -26,5 +29,8 @@ warp_bitonic_sort.o: warp_bitonic_sort.cu bitonic_sort.cuh
smem_bitonic_sort.o: smem_bitonic_sort.cu bitonic_sort.cuh
$(NVCC) $(NVCCFLAGS) -c $< -o $@

global_bitonic_sort.o: global_bitonic_sort.cu bitonic_sort.cuh
$(NVCC) $(NVCCFLAGS) -c $< -o $@

clean:
rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort
rm -f *.o warp_bitonic_sort smem_bitonic_sort cpu_bitonic_sort global_bitonic_sort
84 changes: 84 additions & 0 deletions global_bitonic_sort.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/**
* Global Memory Bitonic Sort
*
* This uses GPU global memory to sort long arrays of ints
*
* Author: Andrew Boessen
*/

#include "bitonic_sort.cuh"

#include <cstdio>

/**
 * Swap
 *
 * Compare-exchange step of the bitonic network, operating in place on arr.
 * The element at index x is paired with the element at index x ^ mask. Only
 * the thread holding the lower index of the pair does any work, so every pair
 * is processed exactly once.
 *
 * No bounds checking is done here: both x and x ^ mask must be valid indices
 * into arr.
 *
 * @param x index of the calling thread's element in arr
 * @param mask XOR distance to the partner element (partner = x ^ mask)
 * @param dir ordering of the pair: zero leaves the smaller value at the lower
 * index (ascending), non-zero leaves the larger value at the lower index
 * (descending)
 * @param arr array holding the elements being sorted
 *
 */
__device__ void swap(int x, int mask, int dir, int *arr) {
  // partner of x in the butterfly diagram
  const int partner = x ^ mask;

  // the thread at the upper index of the pair stays idle
  if (partner <= x) {
    return;
  }

  const int low_val = arr[x];
  const int high_val = arr[partner];

  // dir != 0: pair must end up descending; dir == 0: ascending
  const bool misordered = dir ? (low_val < high_val) : (low_val > high_val);
  if (misordered) {
    arr[x] = high_val;
    arr[partner] = low_val;
  }
}

/**
 * Global Memory Bitonic Sort
 *
 * Runs the butterfly network of bitonic sort directly on the array passed in,
 * one thread per element, using swap() for every compare-exchange. No shared
 * memory or temporary buffer is used.
 *
 * Stage i builds sorted runs of length 2^i and consists of i steps whose
 * partner distance halves from 2^(i-1) down to 1. Stages run while
 * 2^i <= blockDim.x, and steps are separated by __syncthreads(), which only
 * synchronizes threads of the same block. As a result each block sorts its own
 * blockDim.x-element segment independently; segments are NOT merged across
 * blocks. Bit i of the global index selects the direction, so after the last
 * stage even-numbered blocks are sorted ascending and odd-numbered blocks
 * descending.
 *
 * @param arr Pointer to the array of integers to be sorted (dereferenced on
 * the device)
 * @param size Total number of elements in the array
 *
 * @note blockDim.x is expected to be a power of two. Pairs with an index at or
 * beyond size are skipped so that no out-of-bounds access happens; a
 * trailing segment shorter than blockDim.x is therefore left only
 * partially ordered. Pad the array to a multiple of blockDim.x (and pass
 * the padded size) if the last segment must be fully sorted.
 *
 * @see swap() for the element comparison and swapping logic
 */
__global__ void globalBitonicSort(int *arr, int size) {
  // global index of the element handled by this thread
  int thread_id = threadIdx.x + blockIdx.x * blockDim.x;

  // make bitonic sequence and sort
  for (int i = 0; (1 << i) <= blockDim.x; i++) {
    // direction is fixed for the whole stage: bit i of the global index
    int dir = thread_id & (1 << i);

    for (int j = 1; j <= i; j++) {
      // distance between this element and its partner
      int mask = 1 << (i - j);

      // skip pairs that would touch memory past the end of the array
      if (thread_id < size && (thread_id ^ mask) < size) {
        swap(thread_id, mask, dir, arr);
      }

      // barrier stays outside the guard so every thread of the block reaches it
      __syncthreads();
    }
  }
}

/**
 * Launch the global memory bitonic sort kernel
 *
 * Launches globalBitonicSort with one thread per element in blocks of 512
 * threads. The launch is asynchronous: this function does not synchronize
 * with the device, so the caller must synchronize (or issue a blocking copy)
 * before reading the result.
 *
 * @param arr Pointer to the array to sort; it is passed straight to the
 * kernel, so it must be accessible from the device
 * @param size Total number of elements in the array; nothing is launched when
 * size <= 0
 *
 * @note Launch errors are reported on stderr. The CUDA error state is only
 * peeked at, not cleared, so the caller can still query it.
 */
void launchBitonicSort(int *arr, int size) {
  // nothing to sort, and a grid of zero blocks is an invalid configuration
  if (size <= 0) {
    return;
  }

  const int BLOCK_SIZE = 512;
  // ceil(size / BLOCK_SIZE) without the overflow of size + BLOCK_SIZE - 1
  const int NUM_BLOCKS = (size - 1) / BLOCK_SIZE + 1;
  globalBitonicSort<<<NUM_BLOCKS, BLOCK_SIZE>>>(arr, size);

  // kernel launches do not return a status; bad configurations surface here
  const cudaError_t err = cudaPeekAtLastError();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "launchBitonicSort: CUDA error after launch: %s\n",
                 cudaGetErrorString(err));
  }
}

0 comments on commit 7bf42aa

Please sign in to comment.