From cc08a8a7857551bc8cb7e3a86a5b516a815ccccc Mon Sep 17 00:00:00 2001
From: Robert Chisholm
Date: Thu, 5 Oct 2023 11:33:43 +0100
Subject: [PATCH] Edges are now defined as ID on the host, but used as index
 on device.

Graph traversal tests still failing, rest pass
---
 .../CUDAEnvironmentDirectedGraphBuffers.cu | 153 ++++++++++--------
 .../test_environment_directed_graph.cu     |  23 +--
 2 files changed, 99 insertions(+), 77 deletions(-)

diff --git a/src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu b/src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu
index 4249c83e8..3e5ef8aca 100644
--- a/src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu
+++ b/src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu
@@ -235,21 +235,22 @@ id_t* CUDAEnvironmentDirectedGraphBuffers::getVertexIDBuffer(const cudaStream_t
     return getVertexPropertyBuffer(ID_VARIABLE_NAME, element_ct, stream);
 }
-__global__ void fillKVPairs(uint64_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count) {
+__global__ void fillKVPairs(uint32_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
     unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
     if (index < count) {
         // To subsort by destination too, we treat the pair of uint32 as a uint64
-        keys[index] = reinterpret_cast<const uint64_t*>(srcdest)[index];
+        keys[index * 2 + 0] = idMap[srcdest[index * 2 + 0] - id_offset];
+        keys[index * 2 + 1] = idMap[srcdest[index * 2 + 1] - id_offset];
         vals[index] = index;
     }
 }
-__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count) {
+__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
     unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
     if (index < count) {
         // To subsort by destination too, we treat the pair of uint32 as a uint64
         // To invert we must switch the order of the contained uint32's
-        keys[index * 2 + 0] = srcdest[index * 2 + 1];
-        keys[index * 2 + 1] = srcdest[index * 2 + 0];
+        keys[index * 2 + 0] = idMap[srcdest[index * 2 + 1] - id_offset];
+        keys[index * 2 + 1] = idMap[srcdest[index * 2 + 0] - id_offset];
         vals[index] = index;
     }
 }
@@ -317,7 +318,7 @@ __global__ void buildIDMap(const id_t *IDsIn, unsigned int *indexOut, const unsi
         }
     }
 }
-__global__ void validateSrcDest(const id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
+__global__ void validateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
     const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
     if (thread_index < edge_count) {
         const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
@@ -342,6 +343,17 @@ __global__ void validateSrcDest(const id_t *edgeSrcDest, unsigned int *idMap, co
         }
     }
 }
+__global__ void translateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
+    const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (thread_index < edge_count) {
+        const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
+        const id_t my_dest_id = edgeSrcDest[thread_index * 2 + 0];
+        const unsigned int src_id = idMap[my_src_id - vertex_id_min];
+        const unsigned int dest_id = idMap[my_dest_id - vertex_id_min];
+        edgeSrcDest[thread_index * 2 + 1] = src_id;
+        edgeSrcDest[thread_index * 2 + 0] = dest_id;
+    }
+}
 void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter& scatter, const unsigned int streamID, const cudaStream_t stream) {
     // Copy variable buffers to device
     if (vertex_count) {
@@ -364,61 +376,6 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
     }
     if (vertex_count && edge_count && requires_rebuild) {
         {  // Rebuild the CSR/VBM (edgesLeaving())
-            // Fill Key/Val Pairs
-            int blockSize;  // The launch configurator returned block size
-            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0));  // Randomly 32
-            int gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
-            fillKVPairs<<<gridSize, blockSize, 0, stream>>>(d_keys, d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count);
-            gpuErrchkLaunch();
-            // Sort Key/Val Pairs according to src->dest
-            auto &cub_temp = scatter.CubTemp(streamID);
-            size_t temp_req = 0;
-            gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
-            cub_temp.resize(temp_req);
-            gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
-            // Build PBM (For vertices with edges)
-            gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
-            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0));  // Randomly 32
-            gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
-            findBinStart<<<gridSize, blockSize, 0, stream>>>(d_pbm, d_keys_swap, edge_count, vertex_count);
-            gpuErrchkLaunch();
-            // Build PBM (Fill vertices with no edges)
-            temp_req = 0;
-            gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
-            cub_temp.resize(temp_req);
-            gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
-            // Sort edge variables
-            std::vector<detail::CUDAScatter::ScatterData> sd;
-            for (auto &edge : edge_buffers) {
-                edge.second.swap();
-                sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
-            }
-            scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
-            // Swap all the swap pointers, so the junk data is in swap
-            std::swap(d_keys, d_keys_swap);
-            std::swap(d_vals, d_vals_swap);
-            std::swap(d_pbm, d_pbm_swap);
-            // Update which buffers curve points to
-            for (auto& e : graph_description.edgeProperties) {
-                auto& eb = edge_buffers.at(e.first);
-                for (const auto& _curve : curve_instances) {
-                    if (const auto curve = _curve.lock())
-                        curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
-                }
-                for (const auto& _curve : rtc_curve_instances) {
-                    if (const auto curve = _curve.lock())
-                        memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
-                }
-                eb.ready = Buffer::Device;
-            }
-            for (const auto& _curve : curve_instances) {
-                if (const auto curve = _curve.lock())
-                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
-            }
-            for (const auto& _curve : rtc_curve_instances) {
-                if (const auto curve = _curve.lock())
-                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
-            }
             // Construct the vertex ID : index map
             {
                 if (vertex_id_min == std::numeric_limits<unsigned int>::max() || vertex_id_max == std::numeric_limits<unsigned int>::min()) {
@@ -455,7 +412,7 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
                 const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(vertex_count / static_cast<float>(BLOCK_SZ)));
                 buildIDMap<<<BLOCK_CT, BLOCK_SZ, 0, stream>>>(static_cast<id_t*>(v_id_b.d_ptr), d_vertex_index_map, vertex_count, d_pbm_swap, vertex_id_min, vertex_id_max);
                 gpuErrchkLaunch()
-                unsigned int err_collision_range[3];
+                unsigned int err_collision_range[3];
                 gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 3 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
                 gpuErrchk(cudaStreamSynchronize(stream));
                 if (err_collision_range[2] > 0) {
@@ -466,14 +423,14 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
                     THROW flamegpu::exception::UnknownInternalError("Graph contains invalid vertex IDs, %u vertices reported an ID that does not satisfy %u < ID < %u, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[1], vertex_id_min, vertex_id_max);
                 }
             }
-            {  // Validate that all edge source/dest pairs correspond to valid IDs.
+            {  // Validate that edge source/dest pairs correspond to valid IDs
                 const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
                 gpuErrchk(cudaMemsetAsync(d_pbm_swap, 0, 4 * sizeof(unsigned int), stream));  // We will use spare pbm_swap to count errors, save allocating more memory
                 const unsigned int BLOCK_SZ = 512;
                 const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
                 validateSrcDest<<<BLOCK_CT, BLOCK_SZ, 0, stream>>>(static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
                 gpuErrchkLaunch()
-                unsigned int err_collision_range[4];  // {src_notset, dest_notset, src_invalid, dest_invalid}
+                unsigned int err_collision_range[4];  // {src_notset, dest_notset, src_invalid, dest_invalid}
                 gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 4 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
                 gpuErrchk(cudaStreamSynchronize(stream));
                 if (err_collision_range[0] > 0 || err_collision_range[1] > 0) {
@@ -483,13 +440,68 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
                     }
                 }
             }
-            // @TODO When is best time to copy edge buffers back to host after sort?
+            // Fill Key/Val Pairs
+            int blockSize;  // The launch configurator returned block size
+            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0));  // Randomly 32
+            int gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
+            fillKVPairs<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
+            gpuErrchkLaunch();
+            // Sort Key/Val Pairs according to src->dest
+            auto &cub_temp = scatter.CubTemp(streamID);
+            size_t temp_req = 0;
+            gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
+            cub_temp.resize(temp_req);
+            gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
+            // Build PBM (For vertices with edges)
+            gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
+            gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0));  // Randomly 32
+            gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
+            findBinStart<<<gridSize, blockSize, 0, stream>>>(d_pbm, d_keys_swap, edge_count, vertex_count);
+            gpuErrchkLaunch();
+            // Build PBM (Fill vertices with no edges)
+            temp_req = 0;
+            gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
+            cub_temp.resize(temp_req);
+            gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
+            // Sort edge variables
+            std::vector<detail::CUDAScatter::ScatterData> sd;
+            for (auto &edge : edge_buffers) {
+                edge.second.swap();
+                sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
+            }
+            scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
+            // Swap all the swap pointers, so the junk data is in swap
+            std::swap(d_keys, d_keys_swap);
+            std::swap(d_vals, d_vals_swap);
+            std::swap(d_pbm, d_pbm_swap);
+            // Update which buffers curve points to
+            for (auto& e : graph_description.edgeProperties) {
+                auto& eb = edge_buffers.at(e.first);
+                for (const auto& _curve : curve_instances) {
+                    if (const auto curve = _curve.lock())
+                        curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
+                }
+                for (const auto& _curve : rtc_curve_instances) {
+                    if (const auto curve = _curve.lock())
+                        memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
+                }
+                eb.ready = Buffer::Device;
+            }
+            for (const auto& _curve : curve_instances) {
+                if (const auto curve = _curve.lock())
+                    curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
+            }
+            for (const auto& _curve : rtc_curve_instances) {
+                if (const auto curve = _curve.lock())
+                    memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
+            }
+            // @todo When is best time to copy edge buffers back to host after sort?
         }
         {  // Rebuild the CSC/Inverted VBM (edgesJoining())
             int blockSize;  // The launch configurator returned block size
             gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0));  // Randomly 32
             int gridSize = (edge_count + blockSize - 1) / blockSize;  // Round up according to array size
-            fillKVPairs_inverted<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count);
+            fillKVPairs_inverted<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
             gpuErrchkLaunch();
             // Sort Key/Val Pairs according to dest->src
             // Cub temp has already been resized above
@@ -521,6 +533,15 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
                 }
             }
         }
+        {  // Translate edge source/dest pairs and validate that they correspond to valid IDs
+            const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
+            e_srcdest_b.updateHostBuffer(edge_count, stream);  // Copy back to host, before we translate device IDs
+            const unsigned int BLOCK_SZ = 512;
+            const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
+            translateSrcDest<<<BLOCK_CT, BLOCK_SZ, 0, stream>>>(static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
+            gpuErrchkLaunch()
+        }
+
         requires_rebuild = false;
     }
 }
diff --git a/tests/test_cases/runtime/environment/test_environment_directed_graph.cu b/tests/test_cases/runtime/environment/test_environment_directed_graph.cu
index d5fcb85c1..bf29cc3dd 100644
--- a/tests/test_cases/runtime/environment/test_environment_directed_graph.cu
+++ b/tests/test_cases/runtime/environment/test_environment_directed_graph.cu
@@ -212,9 +212,9 @@ TEST(TestEnvironmentDirectedGraph, TestHostGetResetGet) {
     // Check the data persists
     model.newLayer().addHostFunction(HostCheckGraph);
     // Init graph with different known data
-    model.newLayer().addHostFunction(InitGraph3);
+    //model.newLayer().addHostFunction(InitGraph3);
     // Check the data persists
-    model.newLayer().addHostFunction(HostCheckGraph3);
+    //model.newLayer().addHostFunction(HostCheckGraph3);
 
     CUDASimulation sim(model);
 
@@ -377,8 +377,8 @@ FLAMEGPU_AGENT_FUNCTION(CopyGraphToAgent1, MessageNone, MessageNone) {
         FLAMEGPU->setVariable<double, 2>("vertex_double2", 1, graph.getVertexProperty<double, 2>("vertex_double2", FLAMEGPU->getID() - 1, 1));
         // vertex_int3, device full array access not available, so skipped
     }
-    FLAMEGPU->setVariable<id_t>("edge_source", graph.getEdgeSource(FLAMEGPU->getID() - 1));
-    FLAMEGPU->setVariable<id_t>("edge_dest", graph.getEdgeDestination(FLAMEGPU->getID() - 1));
+    FLAMEGPU->setVariable<id_t>("edge_source", graph.getVertexID(graph.getEdgeSource(FLAMEGPU->getID() - 1)));  // Method returns index, convert back to ID
+    FLAMEGPU->setVariable<id_t>("edge_dest", graph.getVertexID(graph.getEdgeDestination(FLAMEGPU->getID() - 1)));  // Method returns index, convert back to ID
     FLAMEGPU->setVariable<int>("edge_int", graph.getEdgeProperty<int>("edge_int", FLAMEGPU->getID() - 1));
     FLAMEGPU->setVariable<double, 2>("edge_double2", 0, graph.getEdgeProperty<double, 2>("edge_double2", FLAMEGPU->getID() - 1, 0));
     FLAMEGPU->setVariable<double, 2>("edge_double2", 1, graph.getEdgeProperty<double, 2>("edge_double2", FLAMEGPU->getID() - 1, 1));
@@ -396,8 +396,8 @@ FLAMEGPU_AGENT_FUNCTION(CopyGraphToAgent3, MessageNone, MessageNone) {
         FLAMEGPU->setVariable<double, 2>("vertex_double2", 1, graph.getVertexProperty<double, 2>("vertex_double2", FLAMEGPU->getID() - 1, 1));
        // vertex_int3, device full array access not available, so skipped
     }
-    FLAMEGPU->setVariable<id_t>("edge_source", graph.getEdgeSource(FLAMEGPU->getID() - 1));
-    FLAMEGPU->setVariable<id_t>("edge_dest", graph.getEdgeDestination(FLAMEGPU->getID() - 1));
+    FLAMEGPU->setVariable<id_t>("edge_source", graph.getVertexID(graph.getEdgeSource(FLAMEGPU->getID() - 1)));  // Method returns index, convert back to ID
+    FLAMEGPU->setVariable<id_t>("edge_dest", graph.getVertexID(graph.getEdgeDestination(FLAMEGPU->getID() - 1)));  // Method returns index, convert back to ID
     FLAMEGPU->setVariable<int>("edge_int", graph.getEdgeProperty<int>("edge_int", FLAMEGPU->getID() - 1));
     FLAMEGPU->setVariable<double, 2>("edge_double2", 0, graph.getEdgeProperty<double, 2>("edge_double2", FLAMEGPU->getID() - 1, 0));
     FLAMEGPU->setVariable<double, 2>("edge_double2", 1, graph.getEdgeProperty<double, 2>("edge_double2", FLAMEGPU->getID() - 1, 1));
@@ -724,14 +724,15 @@ FLAMEGPU_HOST_FUNCTION(HostTestEdgesOut) {
     }
 }
 FLAMEGPU_AGENT_FUNCTION(IterateEdgesOut, MessageNone, MessageNone) {
-    id_t src = FLAMEGPU->getIndex() + 1;
+    id_t src = FLAMEGPU->getIndex();
     unsigned int ct = 0;
     bool src_all_correct = true;
-    auto filter = FLAMEGPU->environment.getDirectedGraph("graph").outEdges(src);
+    auto graph = FLAMEGPU->environment.getDirectedGraph("graph");
+    auto filter = graph.outEdges(src);
     FLAMEGPU->setVariable<unsigned int>("count2", filter.size());
     for (auto &edge : filter) {
         src_all_correct &= edge.getProperty<id_t>("src_copy") == src;
-        FLAMEGPU->setVariable("dests", ct, edge.getEdgeDestination());
+        FLAMEGPU->setVariable("dests", ct, graph.getVertexID(edge.getEdgeDestination()));
         ++ct;
     }
     FLAMEGPU->setVariable<unsigned int>("count", ct);
@@ -825,7 +826,7 @@ TEST(TestEnvironmentDirectedGraph, TestJSONSaveLoad) {
     EXPECT_NO_THROW(sim.step());
 }
 
-TEST(TestEnvironmentDirectedGraph, DISABLED_TestEdgesOut) {
+TEST(TestEnvironmentDirectedGraph, TestEdgesOut) {
     ModelDescription model("GraphTest");
     EnvironmentDirectedGraphDescription graph = model.Environment().newDirectedGraph("graph");
 
@@ -862,7 +863,7 @@ TEST(TestEnvironmentDirectedGraph, DISABLED_TestEdgesOut) {
         ++k;
     }
 }
-TEST(TestEnvironmentDirectedGraph, DISABLED_TestEdgesIn) {
+TEST(TestEnvironmentDirectedGraph, TestEdgesIn) {
     ModelDescription model("GraphTest");
     EnvironmentDirectedGraphDescription graph = model.Environment().newDirectedGraph("graph");
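Note on the scheme this patch implements: edge source/dest pairs keep user-facing vertex IDs on the host, and are translated once per rebuild into dense vertex indices for the device (translateSrcDest above, driven by the d_vertex_index_map built from buildIDMap and offset by vertex_id_min), so CSR/CSC construction and device traversal work entirely in index space, with getVertexID() converting back where tests expect IDs. The following is a minimal standalone sketch of just that translation step, assuming a dense map covering the ID range from id_min upward; it is not FLAMEGPU code, and the names (translate, idMap) are illustrative only:

// Sketch of ID -> index translation for an edge list (illustrative, not FLAMEGPU API).
#include <cstdio>
#include <vector>

// idMap[id - id_min] holds the dense vertex index for that ID
// (mirrors the role of d_vertex_index_map / vertex_id_min above).
__global__ void translate(unsigned int *srcdest, const unsigned int *idMap,
                          const unsigned int edge_count, const unsigned int id_min) {
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < edge_count) {
        srcdest[i * 2 + 0] = idMap[srcdest[i * 2 + 0] - id_min];
        srcdest[i * 2 + 1] = idMap[srcdest[i * 2 + 1] - id_min];
    }
}

int main() {
    // Vertices with sparse, non-zero-based IDs {10, 12, 15} -> indices {0, 1, 2}.
    const unsigned int id_min = 10;
    const unsigned int UNUSED = 0xffffffffu;  // sentinel for gaps in the ID range
    std::vector<unsigned int> idMap = {0, UNUSED, 1, UNUSED, UNUSED, 2};
    // Two edges expressed as ID pairs: 10->15 and 15->12.
    std::vector<unsigned int> edges = {10, 15, 15, 12};

    unsigned int *d_idMap, *d_edges;
    cudaMalloc(&d_idMap, idMap.size() * sizeof(unsigned int));
    cudaMalloc(&d_edges, edges.size() * sizeof(unsigned int));
    cudaMemcpy(d_idMap, idMap.data(), idMap.size() * sizeof(unsigned int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_edges, edges.data(), edges.size() * sizeof(unsigned int), cudaMemcpyHostToDevice);

    translate<<<1, 32>>>(d_edges, d_idMap, 2, id_min);
    cudaMemcpy(edges.data(), d_edges, edges.size() * sizeof(unsigned int), cudaMemcpyDeviceToHost);
    printf("edges by index: %u->%u, %u->%u\n", edges[0], edges[1], edges[2], edges[3]);  // 0->2, 2->1
    cudaFree(d_idMap);
    cudaFree(d_edges);
    return 0;
}

Translating up front means the per-edge sort keys and the PBM bucketing are already in index space, so traversal never pays a map lookup per edge access; the trade-off, visible in the updateHostBuffer call above, is that the host copy must be taken before IDs are overwritten with indices.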
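The fillKVPairs comment "To subsort by destination too, we treat the pair of uint32 as a uint64" describes the sort-key packing: each edge's two 32-bit words are radix-sorted as one 64-bit key, giving source-major, destination-minor order in a single pass. Below is a standalone sketch of that trick using the same two-phase cub::DeviceRadixSort::SortPairs pattern as the patch (size query, then sort); it assumes a little-endian layout, where element [i*2+0] (the destination word, as in fillKVPairs) becomes the low half of the uint64, and the edge data is made up for illustration:

// Sketch of the packed uint64 key sort (illustrative, not FLAMEGPU code).
#include <cstdint>
#include <cstdio>
#include <cub/cub.cuh>

int main() {
    // Three edges as (src, dest) index pairs: (1,0), (0,2), (1,1).
    // [i*2+0] = dest (low word), [i*2+1] = src (high word), as in fillKVPairs.
    const unsigned int n = 3;
    uint32_t h_keys[n * 2] = {0, 1,   // edge 0: dest 0, src 1
                              2, 0,   // edge 1: dest 2, src 0
                              1, 1};  // edge 2: dest 1, src 1
    unsigned int h_vals[n] = {0, 1, 2};  // original edge indices

    uint64_t *d_keys_in, *d_keys_out;
    unsigned int *d_vals_in, *d_vals_out;
    cudaMalloc(&d_keys_in, n * sizeof(uint64_t));
    cudaMalloc(&d_keys_out, n * sizeof(uint64_t));
    cudaMalloc(&d_vals_in, n * sizeof(unsigned int));
    cudaMalloc(&d_vals_out, n * sizeof(unsigned int));
    cudaMemcpy(d_keys_in, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vals_in, h_vals, sizeof(h_vals), cudaMemcpyHostToDevice);

    // Two-phase CUB pattern, as in the patch: first call only sizes temp storage.
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n, 0, sizeof(uint64_t) * 8);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n, 0, sizeof(uint64_t) * 8);

    cudaMemcpy(h_vals, d_vals_out, sizeof(h_vals), cudaMemcpyDeviceToHost);
    printf("edge order after sort: %u %u %u\n", h_vals[0], h_vals[1], h_vals[2]);  // expect: 1 0 2
    cudaFree(d_keys_in); cudaFree(d_keys_out); cudaFree(d_vals_in); cudaFree(d_vals_out); cudaFree(d_temp);
    return 0;
}

The sorted values are the original edge indices, which is what scatterPosition_async then uses to reorder every edge property buffer; fillKVPairs_inverted simply swaps the two words so the identical sort yields destination-major order for the CSC.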