Skip to content

Commit e9ec98c

Browse files
committed
Moved RowSum operator() and dependencies back to jaccard.cpp
1 parent 3d911f6 commit e9ec98c

File tree

4 files changed

+165
-158
lines changed

4 files changed

+165
-158
lines changed

jaccard.cpp

+117
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,95 @@
4747
#define EMULATE_ATOMIC_ADD_DOUBLE
4848
#endif
4949

50+
// From utilities/graph_utils.cuh
// Work-group cooperative gather-and-sum: returns (to every work-item) the total
//   sum_{k=0}^{n-1} w[ind[ind_off + k]]
// computed as a parallel prefix sum across the X dimension (dim 1) of the 2D
// nd_item, using the local-memory buffer shfl_temp in place of CUDA __shfl
// intrinsics. Each Y row (dim 0) scans independently; shfl_temp must hold one
// value_t per work-item in the group.
// FIXME Revisit the barriers and fences and local storage with subgroups
// FIXME revisit with SYCL group algorithms
template <typename count_t, typename index_t, typename value_t>
__inline__ value_t
parallel_prefix_sum(cl::sycl::nd_item<2> const &tid_info, count_t n,
                    cl::sycl::accessor<index_t, 1, cl::sycl::access::mode::read> ind,
                    count_t ind_off, cl::sycl::accessor<value_t, 1, cl::sycl::access::mode::read> w,
                    cl::sycl::accessor<value_t, 1, cl::sycl::access::mode::read_write,
                                       cl::sycl::access::target::local>
                        shfl_temp) {
  count_t i, j, mn;
  value_t v, last; // v: neighbor's partial read during the scan; last: carry from previous chunk
  value_t sum = 0.0;
  bool valid;

  // Parallel prefix sum (using __shfl)
  // Round n up to a multiple of the X-dimension local range so that every
  // work-item runs the same number of loop iterations and reaches every
  // group_barrier below.
  mn = (((n + tid_info.get_local_range(1) - 1) / tid_info.get_local_range(1)) *
        tid_info.get_local_range(1)); // n in multiple of blockDim.x
  for (i = tid_info.get_local_id(1); i < mn; i += tid_info.get_local_range(1)) {
    // All threads (especially the last one) must always participate
    // in the shfl instruction, otherwise their sum will be undefined.
    // So, the loop stopping condition is based on multiple of n in loop increments,
    // so that all threads enter into the loop and inside we make sure we do not
    // read out of bounds memory checking for the actual size n.

    // check if the thread is valid (i.e. has a real element to read this pass)
    valid = i < n;

    // Notice that the last thread is used to propagate the prefix sum.
    // For all the threads, in the first iteration the last is 0, in the following
    // iterations it is the value at the last thread of the previous iterations.

    // get the value of the last thread
    // FIXME: __shfl_sync
    // FIXME make sure everybody is here
    group_barrier(tid_info.get_group());
    // write your current sum
    // This is a 2D block, use a linear ID
    shfl_temp[tid_info.get_local_linear_id()] = sum;
    // FIXME make sure everybody has read from the top thread in the same Y-dimensional subgroup
    group_barrier(tid_info.get_group());
    // Read the running total produced by the last X lane of this work-item's
    // own Y row (local_range(1) * local_id(0) offsets into that row).
    last = shfl_temp[tid_info.get_local_range(1) - 1 +
                     (tid_info.get_local_range(1) * tid_info.get_local_id(0))];
    // Move forward
    // last = __shfl_sync(warp_full_mask(), sum, blockDim.x - 1, blockDim.x);

    // if you are valid read the value from memory, otherwise set your value to 0
    sum = (valid) ? w[ind[ind_off + i]] : 0.0;

    // do prefix sum (of size warpSize=blockDim.x =< 32)
    // Stride-doubling inclusive scan: each round adds the partial from j lanes back.
    for (j = 1; j < tid_info.get_local_range(1); j *= 2) {
      // FIXME: __shfl_up_warp
      // FIXME make sure everybody is here
      // Write your current sum
      group_barrier(tid_info.get_group());
      shfl_temp[tid_info.get_local_linear_id()] = sum;
      // FIXME Force writes to finish
      // read from tid-j
      // Using the x-dimension local id for the conditional protects from overflows to other
      // Y-subgroups Using the local_linear_id for the read saves us having to offset by x_range *
      // y_id
      group_barrier(tid_info.get_group());
      if (tid_info.get_local_id(1) >= j) v = shfl_temp[tid_info.get_local_linear_id() - j];
      // FIXME Force reads to finish
      // v = __shfl_up_sync(warp_full_mask(), sum, j, blockDim.x);
      if (tid_info.get_local_id(1) >= j) sum += v;
    }
    // shift by last (carry in the total from all previous chunks of the row)
    sum += last;
    // notice that no __threadfence or __syncthreads are needed in this implementation
  }
  // get the value of the last thread (to all threads)
  // FIXME: __shfl_sync
  // FIXME make sure everybody is here
  // write your current sum
  // This is a 2D block, use a linear ID
  group_barrier(tid_info.get_group());
  shfl_temp[tid_info.get_local_linear_id()] = sum;
  // FIXME make sure everybody has read from the top thread in the same Y-dimensional group
  group_barrier(tid_info.get_group());
  last = shfl_temp[tid_info.get_local_range(1) - 1 +
                   (tid_info.get_local_range(1) * tid_info.get_local_id(0))];
  // Move forward
  // last = __shfl_sync(warp_full_mask(), sum, blockDim.x - 1, blockDim.x);

  return last;
}
138+
50139
// From RAFT at commit 048063dc08
51140
constexpr inline int warp_size() {
52141
return 32;
@@ -142,6 +231,34 @@ double myAtomicAdd(cl::sycl::atomic<uint64_t> &address, double val) {
142231

143232
namespace sygraph {
144233
namespace detail {
234+
// Volume of neighboors (*weight_s)
235+
template <bool weighted, typename vertex_t, typename edge_t, typename weight_t>
236+
// Must be marked external since main.cpp uses it
237+
extern SYCL_EXTERNAL const void
238+
Jaccard_RowSumKernel<weighted, vertex_t, edge_t, weight_t>::operator()(
239+
cl::sycl::nd_item<2> tid_info) const {
240+
vertex_t row;
241+
edge_t start, end, length;
242+
weight_t sum;
243+
244+
vertex_t row_start = tid_info.get_global_id(0);
245+
vertex_t row_incr = tid_info.get_global_range(0);
246+
for (row = row_start; row < n; row += row_incr) {
247+
start = csrPtr[row];
248+
end = csrPtr[row + 1];
249+
length = end - start;
250+
251+
// compute row sums
252+
// Must be if constexpr so it doesn't try to evaluate v when it's a nullptr_t
253+
if constexpr (weighted) {
254+
sum = parallel_prefix_sum(tid_info, length, csrInd, start, v, shfl_temp);
255+
if (tid_info.get_local_id(1) == 0) work[row] = sum;
256+
} else {
257+
work[row] = static_cast<weight_t>(length);
258+
}
259+
}
260+
}
261+
145262
// Volume of intersections (*weight_i) and cumulative volume of neighbors (*weight_s)
146263
template <bool weighted, typename vertex_t, typename edge_t, typename weight_t>
147264
const void Jaccard_IsKernel<weighted, vertex_t, edge_t, weight_t>::operator()(

jaccard.hpp

+47
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,53 @@ class FillKernel {
5252

5353
namespace sygraph {
5454
namespace detail {
55+
template <bool weighted, typename vertex_t, typename edge_t, typename weight_t>
56+
class Jaccard_RowSumKernel {
57+
vertex_t n;
58+
cl::sycl::accessor<edge_t, 1, cl::sycl::access::mode::read> csrPtr;
59+
cl::sycl::accessor<vertex_t, 1, cl::sycl::access::mode::read> csrInd;
60+
// FIXME, with std::conditional_t we should be able to simplify out some of the code paths in the
61+
// other weight-branching kernels
62+
#ifdef NEEDS_NULL_DEVICE_PTR
63+
std::conditional_t<weighted, cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read>,
64+
cl::sycl::device_ptr<std::nullptr_t>>
65+
v;
66+
#else
67+
std::conditional_t<weighted, cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read>,
68+
std::nullptr_t>
69+
v;
70+
#endif
71+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::discard_write> work;
72+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read_write,
73+
cl::sycl::access::target::local>
74+
shfl_temp;
75+
76+
public:
77+
Jaccard_RowSumKernel<true>(
78+
vertex_t n, cl::sycl::accessor<edge_t, 1, cl::sycl::access::mode::read> csrPtr,
79+
cl::sycl::accessor<vertex_t, 1, cl::sycl::access::mode::read> csrInd,
80+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read> v,
81+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::discard_write> work,
82+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read_write,
83+
cl::sycl::access::target::local>
84+
shfl_temp)
85+
: n{n}, csrInd{csrInd}, csrPtr{csrPtr}, v{v}, work{work}, shfl_temp{shfl_temp} {
86+
}
87+
// When not using weights, we don't care about v
88+
Jaccard_RowSumKernel<false>(
89+
vertex_t n, cl::sycl::accessor<edge_t, 1, cl::sycl::access::mode::read> csrPtr,
90+
cl::sycl::accessor<vertex_t, 1, cl::sycl::access::mode::read> csrInd,
91+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::discard_write> work,
92+
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read_write,
93+
cl::sycl::access::target::local>
94+
shfl_temp)
95+
: n{n}, csrInd{csrInd}, csrPtr{csrPtr}, work{work}, shfl_temp{shfl_temp} {
96+
}
97+
// Volume of neighboors (*weight_s)
98+
// Must be marked external since main.cpp uses it
99+
SYCL_EXTERNAL const void operator()(cl::sycl::nd_item<2> tid_info) const;
100+
};
101+
55102
// Volume of intersections (*weight_i) and cumulative volume of neighbors (*weight_s)
56103
template <bool weighted, typename vertex_t, typename edge_t, typename weight_t>
57104
class Jaccard_IsKernel {

main.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616

1717
#include "filetypes.hpp"
18+
#include "jaccard.hpp"
1819
#include "readMtxToCSR.hpp" //implicitly includes standalone_csr.hpp
1920
#include "standalone_algorithms.hpp"
2021
#include "standalone_csr.hpp"

standalone_algorithms.hpp

-158
Original file line numberDiff line numberDiff line change
@@ -46,94 +46,6 @@
4646
}
4747
#endif // EVENT_PROFILE
4848

49-
// From utilities/graph_utils.cuh
// Work-group cooperative gather-and-sum: returns (to every work-item) the total
//   sum_{k=0}^{n-1} w[ind[ind_off + k]]
// computed as a parallel prefix sum across the X dimension (dim 1) of the 2D
// nd_item, using the local-memory buffer shfl_temp in place of CUDA __shfl
// intrinsics. Each Y row (dim 0) scans independently; shfl_temp must hold one
// value_t per work-item in the group.
// FIXME Revisit the barriers and fences and local storage with subgroups
// FIXME revisit with SYCL group algorithms
template <typename count_t, typename index_t, typename value_t>
__inline__ value_t
parallel_prefix_sum(cl::sycl::nd_item<2> const &tid_info, count_t n,
                    cl::sycl::accessor<index_t, 1, cl::sycl::access::mode::read> ind,
                    count_t ind_off, cl::sycl::accessor<value_t, 1, cl::sycl::access::mode::read> w,
                    cl::sycl::accessor<value_t, 1, cl::sycl::access::mode::read_write,
                                       cl::sycl::access::target::local>
                        shfl_temp) {
  count_t i, j, mn;
  value_t v, last; // v: neighbor's partial read during the scan; last: carry from previous chunk
  value_t sum = 0.0;
  bool valid;

  // Parallel prefix sum (using __shfl)
  // Round n up to a multiple of the X-dimension local range so that every
  // work-item runs the same number of loop iterations and reaches every
  // group_barrier below.
  mn = (((n + tid_info.get_local_range(1) - 1) / tid_info.get_local_range(1)) *
        tid_info.get_local_range(1)); // n in multiple of blockDim.x
  for (i = tid_info.get_local_id(1); i < mn; i += tid_info.get_local_range(1)) {
    // All threads (especially the last one) must always participate
    // in the shfl instruction, otherwise their sum will be undefined.
    // So, the loop stopping condition is based on multiple of n in loop increments,
    // so that all threads enter into the loop and inside we make sure we do not
    // read out of bounds memory checking for the actual size n.

    // check if the thread is valid (i.e. has a real element to read this pass)
    valid = i < n;

    // Notice that the last thread is used to propagate the prefix sum.
    // For all the threads, in the first iteration the last is 0, in the following
    // iterations it is the value at the last thread of the previous iterations.

    // get the value of the last thread
    // FIXME: __shfl_sync
    // FIXME make sure everybody is here
    group_barrier(tid_info.get_group());
    // write your current sum
    // This is a 2D block, use a linear ID
    shfl_temp[tid_info.get_local_linear_id()] = sum;
    // FIXME make sure everybody has read from the top thread in the same Y-dimensional subgroup
    group_barrier(tid_info.get_group());
    // Read the running total produced by the last X lane of this work-item's
    // own Y row (local_range(1) * local_id(0) offsets into that row).
    last = shfl_temp[tid_info.get_local_range(1) - 1 +
                     (tid_info.get_local_range(1) * tid_info.get_local_id(0))];
    // Move forward
    // last = __shfl_sync(warp_full_mask(), sum, blockDim.x - 1, blockDim.x);

    // if you are valid read the value from memory, otherwise set your value to 0
    sum = (valid) ? w[ind[ind_off + i]] : 0.0;

    // do prefix sum (of size warpSize=blockDim.x =< 32)
    // Stride-doubling inclusive scan: each round adds the partial from j lanes back.
    for (j = 1; j < tid_info.get_local_range(1); j *= 2) {
      // FIXME: __shfl_up_warp
      // FIXME make sure everybody is here
      // Write your current sum
      group_barrier(tid_info.get_group());
      shfl_temp[tid_info.get_local_linear_id()] = sum;
      // FIXME Force writes to finish
      // read from tid-j
      // Using the x-dimension local id for the conditional protects from overflows to other
      // Y-subgroups Using the local_linear_id for the read saves us having to offset by x_range *
      // y_id
      group_barrier(tid_info.get_group());
      if (tid_info.get_local_id(1) >= j) v = shfl_temp[tid_info.get_local_linear_id() - j];
      // FIXME Force reads to finish
      // v = __shfl_up_sync(warp_full_mask(), sum, j, blockDim.x);
      if (tid_info.get_local_id(1) >= j) sum += v;
    }
    // shift by last (carry in the total from all previous chunks of the row)
    sum += last;
    // notice that no __threadfence or __syncthreads are needed in this implementation
  }
  // get the value of the last thread (to all threads)
  // FIXME: __shfl_sync
  // FIXME make sure everybody is here
  // write your current sum
  // This is a 2D block, use a linear ID
  group_barrier(tid_info.get_group());
  shfl_temp[tid_info.get_local_linear_id()] = sum;
  // FIXME make sure everybody has read from the top thread in the same Y-dimensional group
  group_barrier(tid_info.get_group());
  last = shfl_temp[tid_info.get_local_range(1) - 1 +
                   (tid_info.get_local_range(1) * tid_info.get_local_id(0))];
  // Move forward
  // last = __shfl_sync(warp_full_mask(), sum, blockDim.x - 1, blockDim.x);

  return last;
}
13749
namespace sygraph {
13850

13951
/**
@@ -220,76 +132,6 @@ template <typename VT, typename ET, typename WT>
220132
void jaccard_list(GraphCSRView<VT, ET, WT> &graph, ET num_pairs, cl::sycl::buffer<VT> &first,
221133
cl::sycl::buffer<VT> &second, cl::sycl::buffer<WT> &result, cl::sycl::queue &q);
222134

223-
namespace detail {
224-
template <bool weighted, typename vertex_t, typename edge_t, typename weight_t>
225-
class Jaccard_RowSumKernel {
226-
vertex_t n;
227-
cl::sycl::accessor<edge_t, 1, cl::sycl::access::mode::read> csrPtr;
228-
cl::sycl::accessor<vertex_t, 1, cl::sycl::access::mode::read> csrInd;
229-
// FIXME, with std::conditional_t we should be able to simplify out some of the code paths in the
230-
// other weight-branching kernels
231-
#ifdef NEEDS_NULL_DEVICE_PTR
232-
std::conditional_t<weighted, cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read>,
233-
cl::sycl::device_ptr<std::nullptr_t>>
234-
v;
235-
#else
236-
std::conditional_t<weighted, cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read>,
237-
std::nullptr_t>
238-
v;
239-
#endif
240-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::discard_write> work;
241-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read_write,
242-
cl::sycl::access::target::local>
243-
shfl_temp;
244-
245-
public:
246-
Jaccard_RowSumKernel<true>(
247-
vertex_t n, cl::sycl::accessor<edge_t, 1, cl::sycl::access::mode::read> csrPtr,
248-
cl::sycl::accessor<vertex_t, 1, cl::sycl::access::mode::read> csrInd,
249-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read> v,
250-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::discard_write> work,
251-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read_write,
252-
cl::sycl::access::target::local>
253-
shfl_temp)
254-
: n{n}, csrInd{csrInd}, csrPtr{csrPtr}, v{v}, work{work}, shfl_temp{shfl_temp} {
255-
}
256-
// When not using weights, we don't care about v
257-
Jaccard_RowSumKernel<false>(
258-
vertex_t n, cl::sycl::accessor<edge_t, 1, cl::sycl::access::mode::read> csrPtr,
259-
cl::sycl::accessor<vertex_t, 1, cl::sycl::access::mode::read> csrInd,
260-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::discard_write> work,
261-
cl::sycl::accessor<weight_t, 1, cl::sycl::access::mode::read_write,
262-
cl::sycl::access::target::local>
263-
shfl_temp)
264-
: n{n}, csrInd{csrInd}, csrPtr{csrPtr}, work{work}, shfl_temp{shfl_temp} {
265-
}
266-
// Volume of neighboors (*weight_s)
267-
const void
268-
operator()(cl::sycl::nd_item<2> tid_info) const {
269-
vertex_t row;
270-
edge_t start, end, length;
271-
weight_t sum;
272-
273-
vertex_t row_start = tid_info.get_global_id(0);
274-
vertex_t row_incr = tid_info.get_global_range(0);
275-
for (row = row_start; row < n; row += row_incr) {
276-
start = csrPtr[row];
277-
end = csrPtr[row + 1];
278-
length = end - start;
279-
280-
// compute row sums
281-
// Must be if constexpr so it doesn't try to evaluate v when it's a nullptr_t
282-
if constexpr (weighted) {
283-
sum = parallel_prefix_sum(tid_info, length, csrInd, start, v, shfl_temp);
284-
if (tid_info.get_local_id(1) == 0) work[row] = sum;
285-
} else {
286-
work[row] = static_cast<weight_t>(length);
287-
}
288-
}
289-
}
290-
};
291-
292-
} // namespace detail
293135
} // namespace sygraph
294136

295137
#endif

0 commit comments

Comments
 (0)