Edges are now defined as IDs on the host, but used as indices on the device.
Graph traversal tests are still failing; the rest pass.
Robadob committed Oct 5, 2023
1 parent 29bff3f commit cc08a8a
Showing 2 changed files with 99 additions and 77 deletions.
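
The commit message describes edges being stored with vertex IDs on the host while the device consumes dense vertex indices. A minimal host-side sketch of that ID-to-index translation, assuming an unordered_map lookup built in vertex order (illustrative names only, not the FLAMEGPU API):

// Sketch (not part of this commit): host edges hold vertex IDs; endpoints are
// rewritten to dense indices before the device uses them.
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
    using id = std::uint32_t;                          // stand-in for flamegpu::id_t
    const std::vector<id> vertex_ids = {10, 42, 7};    // sparse, user-assigned IDs
    std::unordered_map<id, unsigned int> id_to_index;  // vertex ID -> dense device index
    for (unsigned int i = 0; i < vertex_ids.size(); ++i)
        id_to_index[vertex_ids[i]] = i;
    std::vector<std::pair<id, id>> edges = {{10, 7}, {42, 10}};  // defined by ID on the host
    for (auto &e : edges) {                            // translated to indices for the device
        e.first = id_to_index.at(e.first);
        e.second = id_to_index.at(e.second);
    }
    return 0;
}
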
153 changes: 87 additions & 66 deletions src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu
@@ -235,21 +235,22 @@ id_t* CUDAEnvironmentDirectedGraphBuffers::getVertexIDBuffer(const cudaStream_t
return getVertexPropertyBuffer<id_t>(ID_VARIABLE_NAME, element_ct, stream);
}

__global__ void fillKVPairs(uint64_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count) {
__global__ void fillKVPairs(uint32_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < count) {
// To subsort by destination too, we treat the pair of uint32 as a uint64
keys[index] = reinterpret_cast<const uint64_t*>(srcdest)[index];
keys[index * 2 + 0] = idMap[srcdest[index * 2 + 0] - id_offset];
keys[index * 2 + 1] = idMap[srcdest[index * 2 + 1] - id_offset];
vals[index] = index;
}
}
__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count) {
__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < count) {
// To subsort by destination too, we treat the pair of uint32 as a uint64
// To invert we must switch the order of the contained uint32's
keys[index * 2 + 0] = srcdest[index * 2 + 1];
keys[index * 2 + 1] = srcdest[index * 2 + 0];
keys[index * 2 + 0] = idMap[srcdest[index * 2 + 1] - id_offset];
keys[index * 2 + 1] = idMap[srcdest[index * 2 + 0] - id_offset];
vals[index] = index;
}
}
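
Both fillKVPairs kernels write the two 32-bit halves of each sort key side by side so that CUB can radix-sort them as one 64-bit key, giving a primary order on one endpoint and a sub-sort on the other. A standalone check of that packing, assuming a little-endian host (true of the x86/ARM platforms CUDA targets):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // keys[2*i + 0] becomes the low word and keys[2*i + 1] the high word, so
    // sorting the 64-bit view orders primarily by the second element.
    const std::uint32_t pair[2] = {5u, 3u};
    std::uint64_t key = 0;
    std::memcpy(&key, pair, sizeof(key));
    std::printf("0x%016llx\n", static_cast<unsigned long long>(key));  // 0x0000000300000005
    return 0;
}
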
@@ -317,7 +318,7 @@ __global__ void buildIDMap(const id_t *IDsIn, unsigned int *indexOut, const unsi
}
}
}
__global__ void validateSrcDest(const id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
__global__ void validateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_index < edge_count) {
const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
@@ -342,6 +343,17 @@ __global__ void validateSrcDest(const id_t *edgeSrcDest, unsigned int *idMap, co
}
}
}
__global__ void translateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_index < edge_count) {
const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
const id_t my_dest_id = edgeSrcDest[thread_index * 2 + 0];
const unsigned int src_id = idMap[my_src_id - vertex_id_min];
const unsigned int dest_id = idMap[my_dest_id - vertex_id_min];
edgeSrcDest[thread_index * 2 + 1] = src_id;
edgeSrcDest[thread_index * 2 + 0] = dest_id;
}
}
void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter& scatter, const unsigned int streamID, const cudaStream_t stream) {
// Copy variable buffers to device
if (vertex_count) {
@@ -364,61 +376,6 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
}
if (vertex_count && edge_count && requires_rebuild) {
{ // Rebuild the CSR/VBM (edgesLeaving())
// Fill Key/Val Pairs
int blockSize; // The launch configurator returned block size
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0)); // Randomly 32
int gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
fillKVPairs<<<gridSize, blockSize, 0, stream>>>(d_keys, d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count);
gpuErrchkLaunch();
// Sort Key/Val Pairs according to src->dest
auto &cub_temp = scatter.CubTemp(streamID);
size_t temp_req = 0;
gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
// Build PBM (For vertices with edges)
gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0)); // Randomly 32
gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
findBinStart<<<gridSize, blockSize, 0, stream>>>(d_pbm, d_keys_swap, edge_count, vertex_count);
gpuErrchkLaunch();
// Build PBM (Fill vertices with no edges)
temp_req = 0;
gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
// Sort edge variables
std::vector<detail::CUDAScatter::ScatterData> sd;
for (auto &edge : edge_buffers) {
edge.second.swap();
sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
}
scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
// Swap all the swap pointers, so the junk data is in swap
std::swap(d_keys, d_keys_swap);
std::swap(d_vals, d_vals_swap);
std::swap(d_pbm, d_pbm_swap);
// Update which buffers curve points to
for (auto& e : graph_description.edgeProperties) {
auto& eb = edge_buffers.at(e.first);
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
}
eb.ready = Buffer::Device;
}
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
}
// Construct the vertex ID : index map
{
if (vertex_id_min == std::numeric_limits<unsigned int>::max() || vertex_id_max == std::numeric_limits<unsigned int>::min()) {
@@ -455,7 +412,7 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(vertex_count / static_cast<float>(BLOCK_SZ)));
buildIDMap << <BLOCK_CT, BLOCK_SZ, 0, stream >> > (static_cast<id_t*>(v_id_b.d_ptr), d_vertex_index_map, vertex_count, d_pbm_swap, vertex_id_min, vertex_id_max);
gpuErrchkLaunch()
unsigned int err_collision_range[3];
unsigned int err_collision_range[3];
gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 3 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaStreamSynchronize(stream));
if (err_collision_range[2] > 0) {
@@ -466,14 +423,14 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
THROW flamegpu::exception::UnknownInternalError("Graph contains invalid vertex IDs, %u vertices reported an ID that does not satisfy %u < ID < %u, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[1], vertex_id_min, vertex_id_max);
}
}
{ // Validate that all edge source/dest pairs correspond to valid IDs.
{ // Validate that edge source/dest pairs correspond to valid IDs
const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
gpuErrchk(cudaMemsetAsync(d_pbm_swap, 0, 4 * sizeof(unsigned int), stream)); // We will use spare pbm_swap to count errors, save allocating more memory
const unsigned int BLOCK_SZ = 512;
const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
validateSrcDest<<<BLOCK_CT, BLOCK_SZ, 0 , stream>>> (static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
gpuErrchkLaunch()
unsigned int err_collision_range[4]; // {src_notset, dest_notset, src_invalid, dest_invalid}
unsigned int err_collision_range[4]; // {src_notset, dest_notset, src_invalid, dest_invalid}
gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 4 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaStreamSynchronize(stream));
if (err_collision_range[0] > 0 || err_collision_range[1] > 0) {
@@ -483,13 +440,68 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
}
}
}
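
validateSrcDest above counts problems per category with atomicAdd into the spare d_pbm_swap buffer, which the host then copies back and inspects before throwing. A minimal standalone CUDA sketch of that device-side error-counting pattern (illustrative names and data, not FLAMEGPU code):

#include <cstdio>
#include <cuda_runtime.h>

// Each thread checks one value and bumps a per-category counter on failure.
__global__ void countErrors(const unsigned int *values, unsigned int count,
                            unsigned int lower, unsigned int upper, unsigned int *errors) {
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) {
        if (values[i] < lower)
            atomicAdd(&errors[0], 1u);  // category 0: below range
        else if (values[i] > upper)
            atomicAdd(&errors[1], 1u);  // category 1: above range
    }
}

int main() {
    const unsigned int h_values[5] = {2, 9, 4, 0, 7};
    unsigned int *d_values = nullptr, *d_errors = nullptr;
    cudaMalloc(&d_values, sizeof(h_values));
    cudaMalloc(&d_errors, 2 * sizeof(unsigned int));
    cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
    cudaMemset(d_errors, 0, 2 * sizeof(unsigned int));  // zero the counters first
    countErrors<<<1, 32>>>(d_values, 5, 1, 8, d_errors);
    unsigned int h_errors[2];
    cudaMemcpy(h_errors, d_errors, sizeof(h_errors), cudaMemcpyDeviceToHost);
    std::printf("below=%u above=%u\n", h_errors[0], h_errors[1]);  // below=1 above=1
    cudaFree(d_values);
    cudaFree(d_errors);
    return 0;
}
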
// @TODO When is best time to copy edge buffers back to host after sort?
// Fill Key/Val Pairs
int blockSize; // The launch configurator returned block size
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0)); // Randomly 32
int gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
fillKVPairs<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
gpuErrchkLaunch();
// Sort Key/Val Pairs according to src->dest
auto &cub_temp = scatter.CubTemp(streamID);
size_t temp_req = 0;
gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
// Build PBM (For vertices with edges)
gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0)); // Randomly 32
gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
findBinStart<<<gridSize, blockSize, 0, stream>>>(d_pbm, d_keys_swap, edge_count, vertex_count);
gpuErrchkLaunch();
// Build PBM (Fill vertices with no edges)
temp_req = 0;
gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
// Sort edge variables
std::vector<detail::CUDAScatter::ScatterData> sd;
for (auto &edge : edge_buffers) {
edge.second.swap();
sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
}
scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
// Swap all the swap pointers, so the junk data is in swap
std::swap(d_keys, d_keys_swap);
std::swap(d_vals, d_vals_swap);
std::swap(d_pbm, d_pbm_swap);
// Update which buffers curve points to
for (auto& e : graph_description.edgeProperties) {
auto& eb = edge_buffers.at(e.first);
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
}
eb.ready = Buffer::Device;
}
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
}
// @todo When is best time to copy edge buffers back to host after sort?
}
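
The PBM construction above first marks each source vertex's earliest edge with findBinStart, then runs an inclusive min-scan over the reversed array so that vertices with no outgoing edges inherit the next occupied bin's start. A minimal CPU sketch of the same idea (illustrative data, not FLAMEGPU code):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const unsigned int UNSET = 0xffffffffu;
    // 5 vertices, 4 edges; vertices 1 and 3 have no outgoing edges.
    std::vector<unsigned int> pbm = {0, UNSET, 1, UNSET, 3, 4};  // vertex_count + 1 entries
    unsigned int running_min = pbm.back();
    for (auto it = pbm.rbegin(); it != pbm.rend(); ++it) {
        running_min = std::min(running_min, *it);  // reverse inclusive min-scan
        *it = running_min;
    }
    for (const unsigned int v : pbm)
        std::printf("%u ", v);  // 0 1 1 3 3 4 : valid CSR offsets
    std::printf("\n");
    return 0;
}
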
{ // Rebuild the CSC/Inverted VBM (edgesJoining())
int blockSize; // The launch configurator returned block size
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0)); // Randomly 32
int gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
fillKVPairs_inverted<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count);
fillKVPairs_inverted<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
gpuErrchkLaunch();
// Sort Key/Val Pairs according to dest->src
// Cub temp has already been resized above
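
The radix sorts in both rebuild passes follow CUB's two-call convention: a first call with a null temp pointer only reports the scratch storage required, and the second call performs the sort, which is why the CSC pass can reuse the temp buffer sized above. A minimal standalone sketch of that pattern (illustrative data, not FLAMEGPU code):

#include <cstdio>
#include <cub/cub.cuh>
#include <cuda_runtime.h>

int main() {
    const int n = 4;
    const unsigned long long h_keys[n] = {3, 1, 4, 1};
    const unsigned int h_vals[n] = {0, 1, 2, 3};
    unsigned long long *d_keys_in, *d_keys_out;
    unsigned int *d_vals_in, *d_vals_out;
    cudaMalloc(&d_keys_in, n * sizeof(*d_keys_in));
    cudaMalloc(&d_keys_out, n * sizeof(*d_keys_out));
    cudaMalloc(&d_vals_in, n * sizeof(*d_vals_in));
    cudaMalloc(&d_vals_out, n * sizeof(*d_vals_out));
    cudaMemcpy(d_keys_in, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vals_in, h_vals, sizeof(h_vals), cudaMemcpyHostToDevice);
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    // First call: null temp storage, only computes temp_bytes.
    cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n);
    cudaMalloc(&d_temp, temp_bytes);
    // Second call: performs the stable key/value sort.
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n);
    unsigned int h_sorted_vals[n];
    cudaMemcpy(h_sorted_vals, d_vals_out, sizeof(h_sorted_vals), cudaMemcpyDeviceToHost);
    for (const unsigned int v : h_sorted_vals)
        std::printf("%u ", v);  // 1 3 0 2
    std::printf("\n");
    cudaFree(d_temp); cudaFree(d_keys_in); cudaFree(d_keys_out); cudaFree(d_vals_in); cudaFree(d_vals_out);
    return 0;
}
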
@@ -521,6 +533,15 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
}
}
}
{ // Translate edge source/dest pairs and validate that they correspond to valid IDs
const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
e_srcdest_b.updateHostBuffer(edge_count, stream); // Copy back to host, before we translate device IDs
const unsigned int BLOCK_SZ = 512;
const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
translateSrcDest<<<BLOCK_CT, BLOCK_SZ, 0 , stream>>> (static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
gpuErrchkLaunch()
}
requires_rebuild = false;
}
}

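After the rebuild, edge source/dest entries on the device hold dense vertex indices and d_pbm holds CSR-style offsets, so a kernel can walk a vertex's outgoing edges directly by index with no ID lookup. A self-contained CUDA sketch of that access pattern (illustrative names, not the FLAMEGPU device API):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void countEdgesLeaving(const unsigned int *pbm, unsigned int vertex_count,
                                  unsigned int *out_counts) {
    const unsigned int v = blockIdx.x * blockDim.x + threadIdx.x;
    if (v < vertex_count)
        out_counts[v] = pbm[v + 1] - pbm[v];  // edges [pbm[v], pbm[v+1]) leave vertex v
}

int main() {
    const unsigned int h_pbm[6] = {0, 1, 1, 3, 3, 4};  // 5 vertices, 4 edges
    unsigned int *d_pbm = nullptr, *d_counts = nullptr;
    cudaMalloc(&d_pbm, sizeof(h_pbm));
    cudaMalloc(&d_counts, 5 * sizeof(unsigned int));
    cudaMemcpy(d_pbm, h_pbm, sizeof(h_pbm), cudaMemcpyHostToDevice);
    countEdgesLeaving<<<1, 32>>>(d_pbm, 5, d_counts);
    unsigned int h_counts[5];
    cudaMemcpy(h_counts, d_counts, sizeof(h_counts), cudaMemcpyDeviceToHost);
    for (const unsigned int c : h_counts)
        std::printf("%u ", c);  // 1 0 2 0 1
    std::printf("\n");
    cudaFree(d_pbm);
    cudaFree(d_counts);
    return 0;
}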
