mst.cu

#include "mst.hpp"

#include <cuda/std/limits>

#include <cuda/atomic>

#include <cooperative_groups.h>

namespace cg = cooperative_groups;
namespace custd = cuda::std;

// Tile width used when partitioning a thread block into warps.
#define WARPSIZE 32
// Threads per block. Every kernel in this file derives its global index as
// threadIdx.x + blockIdx.x * BLOCKSIZE, so launches must use exactly this
// block size.
#define BLOCKSIZE 512

// Largest representable 64-bit value. Stored in the per-vertex `minv` array
// to mean "no candidate edge recorded"; any packed (weight, edge) key that is
// smaller replaces it through fetch_min.
static inline constexpr uint64_t MINV_MAX =
    custd::numeric_limits<uint64_t>::max();

namespace kernel {
// One candidate edge held in a worklist. Four 32-bit fields give a 16-byte
// record, matching the requested 16-byte alignment.
struct alignas(16) WorkList {
  // First endpoint. initializeWL stores the source vertex here;
  // gatherLightestNode overwrites it with that vertex's current root.
  uint32_t u;
  // Second endpoint. initializeWL stores the neighbor here;
  // gatherLightestNode overwrites it with that vertex's current root.
  uint32_t v;
  // Edge weight as read from the weight array.
  uint32_t w;
  // Position of the edge in the adjacency/weight arrays. Also used as the
  // low half of the packed minv key and as the index into inMST.
  uint32_t e;
};

namespace DS {
// Walks the parent links starting at `node` until it reaches a vertex that is
// its own parent, and returns that vertex. The forest is only read here; no
// links are rewritten along the way.
static inline __device__ uint32_t
root(const uint32_t node, const uint32_t *const __restrict__ parent) {
  uint32_t current = node;

  for (uint32_t up = parent[current]; up != current; up = parent[current]) {
    current = up;
  }

  return current;
}

// Merges the sets containing `u` and `v` by hooking the larger id under the
// smaller one.
//
// Each attempt does a device-scope compare-and-swap on parent[larger],
// expecting the larger id to still be its own parent. If another thread has
// already re-parented it, the value found there becomes the new partner for
// the smaller id and the attempt is repeated.
static inline __device__ void join(const uint32_t u, const uint32_t v,
                                   uint32_t *const __restrict__ parent) {
  uint32_t lower = u;
  uint32_t other = v;

  for (;;) {
    const uint32_t higher = max(lower, other);
    lower = min(lower, other);

    // On failure the CAS writes the value it actually found into `expected`.
    uint32_t expected = higher;
    cuda::atomic_ref<uint32_t, cuda::thread_scope_device> slot{parent[higher]};
    if (slot.compare_exchange_strong(expected, lower)) {
      return;
    }

    other = expected;
  }
}
} // namespace DS

// Resets the per-vertex state: each vertex becomes its own parent and its
// minv slot is set to MINV_MAX.
//
// Launch with BLOCKSIZE threads per block, one thread per vertex; threads
// whose index is >= `nodes` do nothing.
static __global__ void initializeVertices(const uint32_t nodes,
                                          uint32_t *const __restrict__ parent,
                                          uint64_t *const __restrict__ minv) {
  const auto vertex = blockIdx.x * BLOCKSIZE + threadIdx.x;
  if (vertex >= nodes) {
    return;
  }

  minv[vertex] = MINV_MAX;
  parent[vertex] = vertex;
}

// Builds the initial worklist from the adjacency arrays.
//
// For every vertex `node`, the entries F[N[node]] .. F[N[node + 1] - 1] are
// scanned, and each entry whose neighbor id is greater than `node` is
// appended to `wl` as {node, neighbor, W[i], i}. Slots are claimed with an
// atomic increment of *wlSize, so the order of entries is not deterministic.
//
// Vertices with fewer than 4 adjacency entries are handled by their own
// thread. Vertices with 4 or more are handled one at a time by the whole
// warp, each lane taking every warp.num_threads()-th entry.
//
// Launch with BLOCKSIZE threads per block, one thread per vertex. Threads
// past `nodes` keep degree 0 but still take part in the warp-wide phase, so
// no thread may return early.
//
// The caller must zero *wlSize beforehand and size `wl` for every appended
// entry; no capacity check is done here.
static __global__ void initializeWL(const uint32_t nodes,
                                    uint32_t *const __restrict__ wlSize,
                                    WorkList *const __restrict__ wl,
                                    const uint32_t *const __restrict__ N,
                                    const uint32_t *const __restrict__ F,
                                    const uint32_t *const __restrict__ W) {
  const auto node = threadIdx.x + blockIdx.x * BLOCKSIZE;

  uint32_t adjBegin = 0;
  uint32_t adjEnd = 0;
  uint32_t degree = 0;

  cuda::atomic_ref<uint32_t, cuda::thread_scope_device> counter{*wlSize};

  if (node < nodes) {
    adjBegin = N[node];
    adjEnd = N[node + 1];
    degree = adjEnd - adjBegin;

    if (degree < 4) {
      for (uint32_t i = adjBegin; i < adjEnd; i++) {
        const auto neighbor = F[i];
        // Record each edge from its lower-numbered endpoint only.
        if (neighbor > node) {
          const auto weight = W[i];
          const auto k = counter.fetch_add(1);
          wl[k] = WorkList{node, neighbor, weight, i};
        }
      }
    }
  }

  auto warp = cg::tiled_partition<WARPSIZE>(cg::this_thread_block());
  const auto lane = warp.thread_rank();

  auto ballot = warp.ballot(degree >= 4);
  while (ballot != 0) {
    // Pick the lowest lane that still owns an unprocessed high-degree vertex
    // and clear its bit.
    const int who = __ffs(ballot) - 1;
    ballot &= ballot - 1;

    // Broadcast the owner's vertex and adjacency range into fresh locals.
    // The lane's own node/adjBegin/adjEnd must stay intact: a later iteration
    // will broadcast them when this lane becomes the owner.
    const auto ownerNode = warp.shfl(node, who);
    const auto ownerBegin = warp.shfl(adjBegin, who);
    const auto ownerEnd = warp.shfl(adjEnd, who);

    for (uint32_t i = ownerBegin + lane; i < ownerEnd;
         i += warp.num_threads()) {
      const auto neighbor = F[i];
      if (neighbor > ownerNode) {
        const auto weight = W[i];
        const auto k = counter.fetch_add(1);
        wl[k] = WorkList{ownerNode, neighbor, weight, i};
      }
    }
  }
}

// Filters the active worklist and records the lightest edge per component.
//
// Each thread takes one entry of `activeWL`, replaces both endpoints with
// their current roots in `parent`, and drops the entry if the roots are
// equal. Surviving entries are appended to `secondaryWL` (slot claimed with
// an atomic increment of *secondaryWLSize), and the packed key
// (weight << 32 | edge index) is min-reduced into minv[] of both roots.
//
// Launch with BLOCKSIZE threads per block, one thread per worklist entry.
// The caller must zero *secondaryWLSize beforehand.
static __global__ void
gatherLightestNode(const uint32_t activeWLSize,
                   uint32_t *const __restrict__ secondaryWLSize,
                   const WorkList *const __restrict__ activeWL,
                   WorkList *const __restrict__ secondaryWL,
                   const uint32_t *const __restrict__ parent,
                   uint64_t *const __restrict__ minv) {
  // Unsigned, like the size it is compared against: a signed index would go
  // negative from 2^31 upwards and subscript the worklist backwards.
  const auto idx = threadIdx.x + blockIdx.x * BLOCKSIZE;
  if (idx >= activeWLSize) {
    return;
  }

  auto item = activeWL[idx];
  const auto u = DS::root(item.u, parent);
  const auto v = DS::root(item.v, parent);
  if (u == v) {
    // Both endpoints already share a root: the edge is dropped.
    return;
  }

  item.u = u;
  item.v = v;

  const auto k =
      cuda::atomic_ref<uint32_t, cuda::thread_scope_device>{*secondaryWLSize}
          .fetch_add(1);
  secondaryWL[k] = item;

  // Weight in the high half so keys order by weight first, then by edge
  // index.
  const auto value =
      (static_cast<uint64_t>(item.w) << 32) | static_cast<uint64_t>(item.e);

  // The plain reads below are only a cheap filter. minv[] only decreases in
  // this kernel, so a stale read can at worst trigger a redundant fetch_min;
  // the atomic is what decides the final value.
  if (minv[u] > value)
    cuda::atomic_ref<uint64_t, cuda::thread_scope_device>{minv[u]}
        .fetch_min(value);

  if (minv[v] > value)
    cuda::atomic_ref<uint64_t, cuda::thread_scope_device>{minv[v]}
        .fetch_min(value);
}

// Selects the edges chosen in this round and merges their endpoints.
//
// Each thread rebuilds the packed key (weight << 32 | edge index) of its
// worklist entry. If the key equals the minimum recorded for either endpoint
// in `minv`, the two endpoints are joined in `parent` and inMST[edge index]
// is set to true.
//
// Launch with BLOCKSIZE threads per block, one thread per worklist entry.
static __global__ void insideMST(const uint32_t wlSize,
                                 const WorkList *const __restrict__ wl,
                                 uint32_t *const __restrict__ parent,
                                 uint64_t *const __restrict__ minv,
                                 bool *const __restrict__ inMST) {
  const auto idx = blockIdx.x * BLOCKSIZE + threadIdx.x;
  if (idx >= wlSize) {
    return;
  }

  const WorkList edge = wl[idx];
  const uint64_t key =
      (static_cast<uint64_t>(edge.w) << 32) | static_cast<uint64_t>(edge.e);

  // Not the lightest edge of either endpoint: nothing to do.
  if (minv[edge.u] != key && minv[edge.v] != key) {
    return;
  }

  DS::join(edge.u, edge.v, parent);
  inMST[edge.e] = true;
}

// Resets the minv slots of both endpoints of every worklist entry to
// MINV_MAX.
//
// Launch with BLOCKSIZE threads per block, one thread per worklist entry.
static __global__ void resetMINV(const uint32_t wlSize,
                                 WorkList *const __restrict__ wl,
                                 uint64_t *const __restrict__ minv) {
  const auto idx = blockIdx.x * BLOCKSIZE + threadIdx.x;
  if (idx >= wlSize) {
    return;
  }

  const WorkList edge = wl[idx];
  minv[edge.v] = MINV_MAX;
  minv[edge.u] = MINV_MAX;
}
} // namespace kernel

// Computes, on the GPU, which edges of `g` were selected for the spanning
// tree/forest.
//
// g.N, g.F and g.W are handed to initializeWL as the offset, neighbor and
// weight arrays; g.V and g.E are used as the vertex count and the number of
// adjacency entries.
//
// Returns a managed (cudaMallocManaged) array of g.E flags where flag i is
// true iff adjacency entry i was selected. This function does not free the
// array. Returns nullptr when g.E is 0.
//
// NOTE(review): both worklists are sized g.E / 2. That is enough only if at
// most half of the adjacency entries satisfy neighbor > node, presumably
// because every undirected edge is stored in both directions. Confirm
// against the graph loader.
// NOTE(review): CUDA return codes are not checked anywhere in this function.
bool *MST(const CSRGraph &g) {
  bool *inMST = nullptr;
  if (g.E == 0) {
    // cudaMallocManaged rejects zero-byte requests, so there is nothing to
    // allocate and no edge that could be selected.
    return inMST;
  }

  const auto inMSTBytes = sizeof(bool) * g.E;
  cudaMallocManaged(&inMST, inMSTBytes);

  uint32_t *parentD;
  uint64_t *minvD;

  uint32_t *activeWlSizeD;
  kernel::WorkList *activeWLD;
  kernel::WorkList *secondaryWLD;

  cudaStream_t stream0;
  cudaStream_t stream1;
  cudaStreamCreateWithFlags(&stream0, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking);

  // The kernels only ever store `true` into inMST, so every flag must start
  // out false. Queued on stream0, ahead of every insideMST launch there.
  cudaMemsetAsync(inMST, 0, inMSTBytes, stream0);

  // stream0: per-vertex state (union-find parents and lightest-edge keys).
  cudaMallocAsync(&parentD, sizeof(uint32_t) * g.V, stream0);
  cudaMallocAsync(&minvD, sizeof(uint64_t) * g.V, stream0);

  const uint32_t initBlocks = (g.V + BLOCKSIZE - 1) / BLOCKSIZE;
  kernel::initializeVertices<<<initBlocks, BLOCKSIZE, 0, stream0>>>(
      g.V, parentD, minvD);

  // stream1: worklist storage and the initial worklist, built concurrently
  // with the vertex initialization above.
  cudaMallocAsync(&activeWlSizeD, sizeof(uint32_t), stream1);
  cudaMemsetAsync(activeWlSizeD, 0, sizeof(uint32_t), stream1);

  cudaMallocAsync(&activeWLD, sizeof(kernel::WorkList) * (g.E / 2), stream1);
  cudaMallocAsync(&secondaryWLD, sizeof(kernel::WorkList) * (g.E / 2), stream1);

  kernel::initializeWL<<<initBlocks, BLOCKSIZE, 0, stream1>>>(
      g.V, activeWlSizeD, activeWLD, g.N, g.F, g.W);

  uint32_t wlSize;
  cudaMemcpyAsync(&wlSize, activeWlSizeD, sizeof(uint32_t), cudaMemcpyDefault,
                  stream1);
  // From here on everything runs on stream0, after stream1's work has
  // finished.
  cudaStreamSynchronize(stream1);

  while (wlSize > 0) {
    // The size counter is reused as the output counter of this round.
    cudaMemsetAsync(activeWlSizeD, 0, sizeof(uint32_t), stream0);

    const uint32_t wlBlocks = (wlSize + BLOCKSIZE - 1) / BLOCKSIZE;
    kernel::gatherLightestNode<<<wlBlocks, BLOCKSIZE, 0, stream0>>>(
        wlSize, activeWlSizeD, activeWLD, secondaryWLD, parentD, minvD);

    cudaMemcpyAsync(&wlSize, activeWlSizeD, sizeof(uint32_t), cudaMemcpyDefault,
                    stream0);
    cudaStreamSynchronize(stream0);

    // The filtered list becomes the active one for the rest of this round
    // and for the next round.
    std::swap(activeWLD, secondaryWLD);
    if (wlSize > 0) {
      // wlBlocks was computed from the pre-filter size. The filtered list is
      // never longer, so the grid still covers it.
      kernel::insideMST<<<wlBlocks, BLOCKSIZE, 0, stream0>>>(
          wlSize, activeWLD, parentD, minvD, inMST);
      kernel::resetMINV<<<wlBlocks, BLOCKSIZE, 0, stream0>>>(wlSize, activeWLD,
                                                             minvD);
    }
  }

  cudaFreeAsync(secondaryWLD, stream0);
  cudaFreeAsync(activeWLD, stream0);
  cudaFreeAsync(activeWlSizeD, stream0);

  cudaFreeAsync(minvD, stream0);
  cudaFreeAsync(parentD, stream0);

  // Make sure all device work, including the writes to inMST, has finished
  // before the host reads the result.
  cudaStreamSynchronize(stream0);

  cudaStreamDestroy(stream1);
  cudaStreamDestroy(stream0);

  return inMST;
}