Edges are now defined as IDs on the host, but used as indices on the device.
Graph traversal tests are still failing; the rest pass.
Robadob committed Oct 5, 2023
1 parent 29bff3f commit cc08a8a
Showing 2 changed files with 99 additions and 77 deletions.
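
The commit message describes edges being stored with vertex IDs on the host while the device consumes dense vertex indices. A minimal host-side sketch of that ID-to-index translation, assuming an unordered_map lookup built in vertex order (illustrative names only, not the FLAMEGPU API):

// Sketch (not part of this commit): host edges hold vertex IDs; endpoints are
// rewritten to dense indices before the device uses them.
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
    using id = std::uint32_t;                          // stand-in for flamegpu::id_t
    const std::vector<id> vertex_ids = {10, 42, 7};    // sparse, user-assigned IDs
    std::unordered_map<id, unsigned int> id_to_index;  // vertex ID -> dense device index
    for (unsigned int i = 0; i < vertex_ids.size(); ++i)
        id_to_index[vertex_ids[i]] = i;
    std::vector<std::pair<id, id>> edges = {{10, 7}, {42, 10}};  // defined by ID on the host
    for (auto &e : edges) {                            // translated to indices for the device
        e.first = id_to_index.at(e.first);
        e.second = id_to_index.at(e.second);
    }
    return 0;
}
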
153 changes: 87 additions & 66 deletions src/flamegpu/simulation/detail/CUDAEnvironmentDirectedGraphBuffers.cu
@@ -235,21 +235,22 @@ id_t* CUDAEnvironmentDirectedGraphBuffers::getVertexIDBuffer(const cudaStream_t
return getVertexPropertyBuffer<id_t>(ID_VARIABLE_NAME, element_ct, stream);
}

__global__ void fillKVPairs(uint64_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count) {
__global__ void fillKVPairs(uint32_t *keys, uint32_t *vals, const unsigned int *srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < count) {
// To subsort by destination too, we treat the pair of uint32 as a uint64
keys[index] = reinterpret_cast<const uint64_t*>(srcdest)[index];
keys[index * 2 + 0] = idMap[srcdest[index * 2 + 0] - id_offset];
keys[index * 2 + 1] = idMap[srcdest[index * 2 + 1] - id_offset];
vals[index] = index;
}
}
__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count) {
__global__ void fillKVPairs_inverted(uint32_t* keys, uint32_t* vals, const unsigned int* srcdest, unsigned int count, const unsigned int *idMap, const unsigned int id_offset) {
unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;
if (index < count) {
// To subsort by destination too, we treat the pair of uint32 as a uint64
// To invert we must switch the order of the contained uint32's
keys[index * 2 + 0] = srcdest[index * 2 + 1];
keys[index * 2 + 1] = srcdest[index * 2 + 0];
keys[index * 2 + 0] = idMap[srcdest[index * 2 + 1] - id_offset];
keys[index * 2 + 1] = idMap[srcdest[index * 2 + 0] - id_offset];
vals[index] = index;
}
}
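
Both fillKVPairs kernels write the two 32-bit halves of each sort key side by side so that CUB can radix-sort them as one 64-bit key, giving a primary order on one endpoint and a sub-sort on the other. A standalone check of that packing, assuming a little-endian host (true of the x86/ARM platforms CUDA targets):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    // keys[2*i + 0] becomes the low word and keys[2*i + 1] the high word, so
    // sorting the 64-bit view orders primarily by the second element.
    const std::uint32_t pair[2] = {5u, 3u};
    std::uint64_t key = 0;
    std::memcpy(&key, pair, sizeof(key));
    std::printf("0x%016llx\n", static_cast<unsigned long long>(key));  // 0x0000000300000005
    return 0;
}
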
@@ -317,7 +318,7 @@ __global__ void buildIDMap(const id_t *IDsIn, unsigned int *indexOut, const unsi
}
}
}
__global__ void validateSrcDest(const id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
__global__ void validateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_index < edge_count) {
const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
@@ -342,6 +343,17 @@ __global__ void validateSrcDest(const id_t *edgeSrcDest, unsigned int *idMap, co
}
}
}
__global__ void translateSrcDest(id_t *edgeSrcDest, unsigned int *idMap, const unsigned int edge_count, unsigned int *errors, unsigned int vertex_id_min, unsigned int vertex_id_max) {
const unsigned int thread_index = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_index < edge_count) {
const id_t my_src_id = edgeSrcDest[thread_index * 2 + 1];
const id_t my_dest_id = edgeSrcDest[thread_index * 2 + 0];
const unsigned int src_id = idMap[my_src_id - vertex_id_min];
const unsigned int dest_id = idMap[my_dest_id - vertex_id_min];
edgeSrcDest[thread_index * 2 + 1] = src_id;
edgeSrcDest[thread_index * 2 + 0] = dest_id;
}
}
void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter& scatter, const unsigned int streamID, const cudaStream_t stream) {
// Copy variable buffers to device
if (vertex_count) {
@@ -364,61 +376,6 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
}
if (vertex_count && edge_count && requires_rebuild) {
{ // Rebuild the CSR/VBM (edgesLeaving())
// Fill Key/Val Pairs
int blockSize; // The launch configurator returned block size
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0)); // Randomly 32
int gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
fillKVPairs<<<gridSize, blockSize, 0, stream>>>(d_keys, d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count);
gpuErrchkLaunch();
// Sort Key/Val Pairs according to src->dest
auto &cub_temp = scatter.CubTemp(streamID);
size_t temp_req = 0;
gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
// Build PBM (For vertices with edges)
gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0)); // Randomly 32
gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
findBinStart<<<gridSize, blockSize, 0, stream>>>(d_pbm, d_keys_swap, edge_count, vertex_count);
gpuErrchkLaunch();
// Build PBM (Fill vertices with no edges)
temp_req = 0;
gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
// Sort edge variables
std::vector<detail::CUDAScatter::ScatterData> sd;
for (auto &edge : edge_buffers) {
edge.second.swap();
sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
}
scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
// Swap all the swap pointers, so the junk data is in swap
std::swap(d_keys, d_keys_swap);
std::swap(d_vals, d_vals_swap);
std::swap(d_pbm, d_pbm_swap);
// Update which buffers curve points to
for (auto& e : graph_description.edgeProperties) {
auto& eb = edge_buffers.at(e.first);
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
}
eb.ready = Buffer::Device;
}
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
}
// Construct the vertex ID : index map
{
if (vertex_id_min == std::numeric_limits<unsigned int>::max() || vertex_id_max == std::numeric_limits<unsigned int>::min()) {
@@ -455,7 +412,7 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(vertex_count / static_cast<float>(BLOCK_SZ)));
buildIDMap << <BLOCK_CT, BLOCK_SZ, 0, stream >> > (static_cast<id_t*>(v_id_b.d_ptr), d_vertex_index_map, vertex_count, d_pbm_swap, vertex_id_min, vertex_id_max);
gpuErrchkLaunch()
unsigned int err_collision_range[3];
unsigned int err_collision_range[3];
gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 3 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaStreamSynchronize(stream));
if (err_collision_range[2] > 0) {
@@ -466,14 +423,14 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
THROW flamegpu::exception::UnknownInternalError("Graph contains invalid vertex IDs, %u vertices reported an ID that does not satisfy %u < ID < %u, in CUDAEnvironmentDirectedGraphBuffers::syncDevice_async()", err_collision_range[1], vertex_id_min, vertex_id_max);
}
}
{ // Validate that all edge source/dest pairs correspond to valid IDs.
{ // Validate that edge source/dest pairs correspond to valid IDs
const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
gpuErrchk(cudaMemsetAsync(d_pbm_swap, 0, 4 * sizeof(unsigned int), stream)); // We will use spare pbm_swap to count errors, save allocating more memory
const unsigned int BLOCK_SZ = 512;
const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
validateSrcDest<<<BLOCK_CT, BLOCK_SZ, 0 , stream>>> (static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
gpuErrchkLaunch()
unsigned int err_collision_range[4]; // {src_notset, dest_notset, src_invalid, dest_invalid}
unsigned int err_collision_range[4]; // {src_notset, dest_notset, src_invalid, dest_invalid}
gpuErrchk(cudaMemcpyAsync(err_collision_range, d_pbm_swap, 4 * sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
gpuErrchk(cudaStreamSynchronize(stream));
if (err_collision_range[0] > 0 || err_collision_range[1] > 0) {
@@ -483,13 +440,68 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
}
}
}
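
validateSrcDest above counts problems per category with atomicAdd into the spare d_pbm_swap buffer, which the host then copies back and inspects before throwing. A minimal standalone CUDA sketch of that device-side error-counting pattern (illustrative names and data, not FLAMEGPU code):

#include <cstdio>
#include <cuda_runtime.h>

// Each thread checks one value and bumps a per-category counter on failure.
__global__ void countErrors(const unsigned int *values, unsigned int count,
                            unsigned int lower, unsigned int upper, unsigned int *errors) {
    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < count) {
        if (values[i] < lower)
            atomicAdd(&errors[0], 1u);  // category 0: below range
        else if (values[i] > upper)
            atomicAdd(&errors[1], 1u);  // category 1: above range
    }
}

int main() {
    const unsigned int h_values[5] = {2, 9, 4, 0, 7};
    unsigned int *d_values = nullptr, *d_errors = nullptr;
    cudaMalloc(&d_values, sizeof(h_values));
    cudaMalloc(&d_errors, 2 * sizeof(unsigned int));
    cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
    cudaMemset(d_errors, 0, 2 * sizeof(unsigned int));  // zero the counters first
    countErrors<<<1, 32>>>(d_values, 5, 1, 8, d_errors);
    unsigned int h_errors[2];
    cudaMemcpy(h_errors, d_errors, sizeof(h_errors), cudaMemcpyDeviceToHost);
    std::printf("below=%u above=%u\n", h_errors[0], h_errors[1]);  // below=1 above=1
    cudaFree(d_values);
    cudaFree(d_errors);
    return 0;
}
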
// @TODO When is best time to copy edge buffers back to host after sort?
// Fill Key/Val Pairs
int blockSize; // The launch configurator returned block size
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0)); // Randomly 32
int gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
fillKVPairs<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
gpuErrchkLaunch();
// Sort Key/Val Pairs according to src->dest
auto &cub_temp = scatter.CubTemp(streamID);
size_t temp_req = 0;
gpuErrchk(cub::DeviceRadixSort::SortPairs(nullptr, temp_req, d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceRadixSort::SortPairs(cub_temp.getPtr(), cub_temp.getSize(), d_keys, d_keys_swap, d_vals, d_vals_swap, edge_count, 0, sizeof(uint64_t) * 8, stream));
// Build PBM (For vertices with edges)
gpuErrchk(cudaMemset(d_pbm, 0xffffffff, (vertex_count + 1) * sizeof(unsigned int)));
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, findBinStart, 32, 0)); // Randomly 32
gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
findBinStart<<<gridSize, blockSize, 0, stream>>>(d_pbm, d_keys_swap, edge_count, vertex_count);
gpuErrchkLaunch();
// Build PBM (Fill vertices with no edges)
temp_req = 0;
gpuErrchk(cub::DeviceScan::InclusiveScan(nullptr, temp_req, ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
cub_temp.resize(temp_req);
gpuErrchk(cub::DeviceScan::InclusiveScan(cub_temp.getPtr(), cub_temp.getSize(), ReverseIterator(d_pbm + vertex_count), ReverseIterator(d_pbm_swap + vertex_count), CustomMin(), vertex_count + 1, stream));
// Sort edge variables
std::vector<detail::CUDAScatter::ScatterData> sd;
for (auto &edge : edge_buffers) {
edge.second.swap();
sd.push_back(detail::CUDAScatter::ScatterData{edge.second.element_size, reinterpret_cast<char*>(edge.second.d_ptr_swap), reinterpret_cast<char*>(edge.second.d_ptr)});
}
scatter.scatterPosition_async(streamID, stream, d_vals_swap, sd, edge_count);
// Swap all the swap pointers, so the junk data is in swap
std::swap(d_keys, d_keys_swap);
std::swap(d_vals, d_vals_swap);
std::swap(d_pbm, d_pbm_swap);
// Update which buffers curve points to
for (auto& e : graph_description.edgeProperties) {
auto& eb = edge_buffers.at(e.first);
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphEdgeProperty(graph_description.name, e.first, eb.d_ptr, edge_count);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphEdgePropertyCachePtr(graph_description.name, e.first), &eb.d_ptr, sizeof(void*));
}
eb.ready = Buffer::Device;
}
for (const auto& _curve : curve_instances) {
if (const auto curve = _curve.lock())
curve->setEnvironmentDirectedGraphVertexProperty(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME, d_pbm, 1);
}
for (const auto& _curve : rtc_curve_instances) {
if (const auto curve = _curve.lock())
memcpy(curve->getEnvironmentDirectedGraphVertexPropertyCachePtr(graph_description.name, GRAPH_VERTEX_PBM_VARIABLE_NAME), &d_pbm, sizeof(void*));
}
// @todo When is best time to copy edge buffers back to host after sort?
}
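
The PBM construction above first marks each source vertex's earliest edge with findBinStart, then runs an inclusive min-scan over the reversed array so that vertices with no outgoing edges inherit the next occupied bin's start. A minimal CPU sketch of the same idea (illustrative data, not FLAMEGPU code):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const unsigned int UNSET = 0xffffffffu;
    // 5 vertices, 4 edges; vertices 1 and 3 have no outgoing edges.
    std::vector<unsigned int> pbm = {0, UNSET, 1, UNSET, 3, 4};  // vertex_count + 1 entries
    unsigned int running_min = pbm.back();
    for (auto it = pbm.rbegin(); it != pbm.rend(); ++it) {
        running_min = std::min(running_min, *it);  // reverse inclusive min-scan
        *it = running_min;
    }
    for (const unsigned int v : pbm)
        std::printf("%u ", v);  // 0 1 1 3 3 4 : valid CSR offsets
    std::printf("\n");
    return 0;
}
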
{ // Rebuild the CSC/Inverted VBM (edgesJoining())
int blockSize; // The launch configurator returned block size
gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blockSize, fillKVPairs, 32, 0)); // Randomly 32
int gridSize = (edge_count + blockSize - 1) / blockSize; // Round up according to array size
fillKVPairs_inverted<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count);
fillKVPairs_inverted<<<gridSize, blockSize, 0, stream>>>(reinterpret_cast<uint32_t*>(d_keys), d_vals, static_cast<unsigned int*>(edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME).d_ptr), edge_count, d_vertex_index_map, vertex_id_min);
gpuErrchkLaunch();
// Sort Key/Val Pairs according to dest->src
// Cub temp has already been resized above
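
The radix sorts in both rebuild passes follow CUB's two-call convention: a first call with a null temp pointer only reports the scratch storage required, and the second call performs the sort, which is why the CSC pass can reuse the temp buffer sized above. A minimal standalone sketch of that pattern (illustrative data, not FLAMEGPU code):

#include <cstdio>
#include <cub/cub.cuh>
#include <cuda_runtime.h>

int main() {
    const int n = 4;
    const unsigned long long h_keys[n] = {3, 1, 4, 1};
    const unsigned int h_vals[n] = {0, 1, 2, 3};
    unsigned long long *d_keys_in, *d_keys_out;
    unsigned int *d_vals_in, *d_vals_out;
    cudaMalloc(&d_keys_in, n * sizeof(*d_keys_in));
    cudaMalloc(&d_keys_out, n * sizeof(*d_keys_out));
    cudaMalloc(&d_vals_in, n * sizeof(*d_vals_in));
    cudaMalloc(&d_vals_out, n * sizeof(*d_vals_out));
    cudaMemcpy(d_keys_in, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
    cudaMemcpy(d_vals_in, h_vals, sizeof(h_vals), cudaMemcpyHostToDevice);
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    // First call: null temp storage, only computes temp_bytes.
    cub::DeviceRadixSort::SortPairs(nullptr, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n);
    cudaMalloc(&d_temp, temp_bytes);
    // Second call: performs the stable key/value sort.
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, n);
    unsigned int h_sorted_vals[n];
    cudaMemcpy(h_sorted_vals, d_vals_out, sizeof(h_sorted_vals), cudaMemcpyDeviceToHost);
    for (const unsigned int v : h_sorted_vals)
        std::printf("%u ", v);  // 1 3 0 2
    std::printf("\n");
    cudaFree(d_temp); cudaFree(d_keys_in); cudaFree(d_keys_out); cudaFree(d_vals_in); cudaFree(d_vals_out);
    return 0;
}
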
@@ -521,6 +533,15 @@ void CUDAEnvironmentDirectedGraphBuffers::syncDevice_async(detail::CUDAScatter&
}
}
}
{ // Translate edge source/dest pairs and validate that they correspond to valid IDs
const auto& e_srcdest_b = edge_buffers.at(GRAPH_SOURCE_DEST_VARIABLE_NAME);
e_srcdest_b.updateHostBuffer(edge_count, stream); // Copy back to host, before we translate device IDs
const unsigned int BLOCK_SZ = 512;
const unsigned int BLOCK_CT = static_cast<unsigned int>(ceil(edge_count / static_cast<float>(BLOCK_SZ)));
translateSrcDest<<<BLOCK_CT, BLOCK_SZ, 0 , stream>>> (static_cast<id_t*>(e_srcdest_b.d_ptr), d_vertex_index_map, edge_count, d_pbm_swap, vertex_id_min, vertex_id_max);
gpuErrchkLaunch()
}
requires_rebuild = false;
}
}

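After the rebuild, edge source/dest entries on the device hold dense vertex indices and d_pbm holds CSR-style offsets, so a kernel can walk a vertex's outgoing edges directly by index with no ID lookup. A self-contained CUDA sketch of that access pattern (illustrative names, not the FLAMEGPU device API):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void countEdgesLeaving(const unsigned int *pbm, unsigned int vertex_count,
                                  unsigned int *out_counts) {
    const unsigned int v = blockIdx.x * blockDim.x + threadIdx.x;
    if (v < vertex_count)
        out_counts[v] = pbm[v + 1] - pbm[v];  // edges [pbm[v], pbm[v+1]) leave vertex v
}

int main() {
    const unsigned int h_pbm[6] = {0, 1, 1, 3, 3, 4};  // 5 vertices, 4 edges
    unsigned int *d_pbm = nullptr, *d_counts = nullptr;
    cudaMalloc(&d_pbm, sizeof(h_pbm));
    cudaMalloc(&d_counts, 5 * sizeof(unsigned int));
    cudaMemcpy(d_pbm, h_pbm, sizeof(h_pbm), cudaMemcpyHostToDevice);
    countEdgesLeaving<<<1, 32>>>(d_pbm, 5, d_counts);
    unsigned int h_counts[5];
    cudaMemcpy(h_counts, d_counts, sizeof(h_counts), cudaMemcpyDeviceToHost);
    for (const unsigned int c : h_counts)
        std::printf("%u ", c);  // 1 0 2 0 1
    std::printf("\n");
    cudaFree(d_pbm);
    cudaFree(d_counts);
    return 0;
}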
