Sparse spatial autocorrelation (#93)

Intron7 authored Nov 14, 2023
1 parent f9456fc · commit 7b7f551
Showing 4 changed files with 291 additions and 132 deletions.
3 changes: 2 additions & 1 deletion docs/release-notes/0.9.3.md
@@ -3,7 +3,8 @@
 ```{rubric} Features
 ```
 
-* `neighbors` now works with `raft` and better supports approximate search with `cagra`, `ivfpq` and `ivfflat` {pr}`89` {smaller}`S Dicks`
+* `neighbors` now works with `raft` and better supports approximate search with `cagra`, `ivfpq` and `ivfflat` {pr}`89` {smaller}`S Dicks`
+* `spatial_autocorr` now works with sparse data matrices without densifying. It will use the sparse matrix by default. {pr}`93` {smaller}`S Dicks`
 
 ```{rubric} Bug fixes
 ```
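A minimal usage sketch of the new default behaviour. This assumes the function is exposed as `rsc.gr.spatial_autocorr` (mirroring squidpy's API) and that a spatial neighbour graph is already present in `adata.obsp`; the file name is hypothetical:

```python
# Hedged sketch: assumes rapids-singlecell exposes spatial_autocorr under
# rsc.gr (mirroring squidpy) and that a spatial connectivity graph has
# already been computed; "visium.h5ad" is a hypothetical file.
import scanpy as sc
import rapids_singlecell as rsc

adata = sc.read_h5ad("visium.h5ad")

# With a sparse adata.X, the matrix now stays sparse on the GPU by default
df = rsc.gr.spatial_autocorr(adata, mode="moran", use_sparse=True, copy=True)

# Densify explicitly only if the old behaviour is needed
df_dense = rsc.gr.spatial_autocorr(adata, mode="moran", use_sparse=False, copy=True)
```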
33 changes: 22 additions & 11 deletions src/rapids_singlecell/squidpy_gpu/_autocorr.py
@@ -29,7 +29,7 @@ def spatial_autocorr(
     corr_method: Union[str, None] = "fdr_bh",
     layer: Union[str, None] = None,
     use_raw: bool = False,
-    use_sparse: bool = False,
+    use_sparse: bool = True,
     copy: bool = False,
 ) -> Optional[pd.DataFrame]:
     """
@@ -68,7 +68,7 @@
     use_raw
         If True, use the raw data in the AnnData object, by default False.
     use_sparse
-        If True, use a sparse representation for the input matrix `vals` when it is a sparse matrix, by default False.
+        If True, use a sparse representation for the input matrix `vals` when it is a sparse matrix, by default True.
     copy
         If True, return the results as a DataFrame instead of storing them in `adata.uns`, by default False.
@@ -93,7 +93,7 @@
             vals = adata.raw[:, genes].X
     else:
         if layer:
-            vals = adata[:, genes].layers["layer"]
+            vals = adata[:, genes].layers[layer]
         else:
             vals = adata[:, genes].X
     # create Adj-Matrix
@@ -108,14 +108,25 @@
 
     params = {"two_tailed": two_tailed}
 
-    # check sparse:
-    if use_sparse:
-        vals = sparse.csr_matrix(vals)
-        data = sparse_gpu.csr_matrix(vals, dtype=cp.float32)
-    else:
-        if sparse.issparse(vals):
-            vals = vals.toarray()
-        data = cp.array(vals, dtype=cp.float32)
+    def process_input_data(vals, use_sparse):
+        # Check if input is already a sparse matrix
+        is_sparse_input = sparse.issparse(vals) or sparse_gpu.isspmatrix(vals)
+        if use_sparse and is_sparse_input:
+            # Convert to CuPy CSR format if not already in sparse GPU format
+            data = (
+                sparse_gpu.csr_matrix(vals.tocsr(), dtype=cp.float32)
+                if not sparse_gpu.isspmatrix(vals)
+                else vals.tocsr().astype(cp.float32)
+            )
+        elif is_sparse_input:
+            # Convert sparse input to dense format
+            data = cp.array(vals.toarray(), dtype=cp.float32)
+        else:
+            # Keep dense input as is
+            data = cp.array(vals, dtype=cp.float32)
+        return data
+
+    data = process_input_data(vals, use_sparse)
     # Run Spatial Autocorr
     if mode == "moran":
         score, score_perms = _morans_I_cupy(
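For reference, a self-contained sketch of the dispatch that `process_input_data` performs, runnable with only CuPy and SciPy; shapes and density are illustrative:

```python
# Standalone sketch of the dispatch logic above; assumes a CUDA-capable
# machine with CuPy installed. Shapes and density are illustrative.
import cupy as cp
import cupyx.scipy.sparse as sparse_gpu
import numpy as np
from scipy import sparse

vals = sparse.random(200, 50, density=0.1, format="csr", dtype=np.float32)

# use_sparse=True: keep the CSR structure and move it to the GPU
data_sparse = sparse_gpu.csr_matrix(vals.tocsr(), dtype=cp.float32)

# use_sparse=False: densify on the host, then transfer
data_dense = cp.array(vals.toarray(), dtype=cp.float32)

# Both paths feed the downstream kernels with identical values
assert cp.allclose(data_sparse.toarray(), data_dense)
```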
181 changes: 121 additions & 60 deletions src/rapids_singlecell/squidpy_gpu/_gearysc.py
@@ -3,6 +3,8 @@
 import cupy as cp
 from cupyx.scipy import sparse
 
+from ._moransi import pre_den_calc_sparse
+
 kernel_gearys_C_num_dense = r"""
 extern "C" __global__ void gearys_C_num_dense(const float* data,
     const int* adj_matrix_row_ptr, const int* adj_matrix_col_ind, const float* adj_matrix_data,
@@ -25,6 +27,77 @@
         }
     }
 }
 """
+kernel_gearys_C_num_sparse = r"""
+extern "C" __global__
+void gearys_C_num_sparse(const int* adj_matrix_row_ptr, const int* adj_matrix_col_ind, const float* adj_matrix_data,
+                        const int* data_row_ptr, const int* data_col_ind, const float* data_values,
+                        const int n_samples, const int n_features,
+                        float* num) {
+    int i = blockIdx.x;
+    int numThreads = blockDim.x;
+    int threadid = threadIdx.x;
+    // Create cache
+    __shared__ float cell1[3072];
+    __shared__ float cell2[3072];
+    int numruns = (n_features + 3072 - 1) / 3072;
+    if (i >= n_samples) {
+        return;
+    }
+    int k_start = adj_matrix_row_ptr[i];
+    int k_end = adj_matrix_row_ptr[i + 1];
+    for (int k = k_start; k < k_end; ++k) {
+        int j = adj_matrix_col_ind[k];
+        float edge_weight = adj_matrix_data[k];
+        int cell1_start = data_row_ptr[i];
+        int cell1_stop = data_row_ptr[i+1];
+        int cell2_start = data_row_ptr[j];
+        int cell2_stop = data_row_ptr[j+1];
+        for (int batch_runner = 0; batch_runner < numruns; batch_runner++) {
+            // Set cache to 0
+            for (int idx = threadid; idx < 3072; idx += numThreads) {
+                cell1[idx] = 0.0f;
+                cell2[idx] = 0.0f;
+            }
+            __syncthreads();
+            int batch_start = 3072 * batch_runner;
+            int batch_end = 3072 * (batch_runner + 1);
+            // Densify sparse rows into cache
+            for (int cell1_idx = cell1_start + threadid; cell1_idx < cell1_stop; cell1_idx += numThreads) {
+                int gene_id = data_col_ind[cell1_idx];
+                if (gene_id >= batch_start && gene_id < batch_end) {
+                    cell1[gene_id % 3072] = data_values[cell1_idx];
+                }
+            }
+            __syncthreads();
+            for (int cell2_idx = cell2_start + threadid; cell2_idx < cell2_stop; cell2_idx += numThreads) {
+                int gene_id = data_col_ind[cell2_idx];
+                if (gene_id >= batch_start && gene_id < batch_end) {
+                    cell2[gene_id % 3072] = data_values[cell2_idx];
+                }
+            }
+            __syncthreads();
+            // Calc num
+            for (int gene = threadid; gene < 3072; gene += numThreads) {
+                int global_gene_index = batch_start + gene;
+                if (global_gene_index < n_features) {
+                    float diff_sq = (cell1[gene] - cell2[gene]) * (cell1[gene] - cell2[gene]);
+                    atomicAdd(&num[global_gene_index], edge_weight * diff_sq);
+                }
+            }
+        }
+    }
+}
+"""


@@ -78,7 +151,6 @@ def _gearys_C_cupy_dense(data, adj_matrix_cupy, n_permutations=100):
                     n_samples,
                     n_features,
                 ),
             )
-            # den_permuted = 2 * adj_matrix_permuted.sum() * preden
             gearys_C_permutations[p, :] = (n_samples - 1) * num_permuted / den
         cp.cuda.Stream.null.synchronize()
     else:
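The permutation test shuffles whole rows of the adjacency matrix, which reassigns each cell's neighbourhood while leaving `den` unchanged (the weight sum and the per-gene variance do not depend on the permutation). A NumPy sketch of the same scheme on toy data:

```python
# Sketch of the permutation scheme used above, in NumPy for clarity;
# dense x (n_samples, n_features) and adjacency W are illustrative toys.
import numpy as np

rng = np.random.default_rng(0)
n, g = 50, 4
x = rng.random((n, g)).astype(np.float32)
W = (rng.random((n, n)) < 0.1).astype(np.float32)

den = 2 * W.sum() * ((x - x.mean(axis=0)) ** 2).sum(axis=0)

def gearys_num(W, x):
    # sum_ij w_ij * (x_i - x_j)^2, computed per gene
    diff = x[:, None, :] - x[None, :, :]
    return np.einsum("ij,ijg->g", W, diff**2)

C = (n - 1) * gearys_num(W, x) / den

# Null distribution: shuffle whole rows of W, exactly as the CUDA code does
C_perms = np.stack([
    (n - 1) * gearys_num(W[rng.permutation(n), :], x) / den
    for _ in range(100)
])
```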
@@ -88,43 +160,41 @@ def _gearys_C_cupy_dense(data, adj_matrix_cupy, n_permutations=100):
 
 def _gearys_C_cupy_sparse(data, adj_matrix_cupy, n_permutations=100):
     n_samples, n_features = data.shape
-    # Calculate the denominator for Geary's C
-    gene_mean = data.mean(axis=0).ravel()
-    preden = cp.sum((data - gene_mean) ** 2, axis=0)
-    den = 2 * adj_matrix_cupy.sum() * preden
-
-    # Calculate the numerator for Geary's C
-    data = data.tocsc()
-    num = cp.zeros(n_features, dtype=data.dtype)
-    block_size = 8
-    sg = int(math.ceil(n_samples / block_size))
-    num_kernel = cp.RawKernel(kernel_gearys_C_num_dense, "gearys_C_num_dense")
-    batchsize = 1000
-    n_batches = math.ceil(n_features / batchsize)
-    for batch in range(n_batches):
-        start_idx = batch * batchsize
-        stop_idx = min(batch * batchsize + batchsize, n_features)
-        data_block = data[:, start_idx:stop_idx].toarray()
-        num_block = cp.zeros(data_block.shape[1], dtype=data.dtype)
-        fg = int(math.ceil(data_block.shape[1] / block_size))
-        grid_size = (fg, sg, 1)
-
-        num_kernel(
-            grid_size,
-            (block_size, block_size, 1),
-            (
-                data_block,
-                adj_matrix_cupy.indptr,
-                adj_matrix_cupy.indices,
-                adj_matrix_cupy.data,
-                num_block,
-                n_samples,
-                data_block.shape[1],
-            ),
-        )
-        num[start_idx:stop_idx] = num_block
-    cp.cuda.Stream.null.synchronize()
+    # Calculate the numerator for Geary's C
+    num = cp.zeros(n_features, dtype=cp.float32)
+    num_kernel = cp.RawKernel(kernel_gearys_C_num_sparse, "gearys_C_num_sparse")
+
+    n_samples, n_features = data.shape
+    sg = n_samples
+    # Launch the kernel
+    num_kernel(
+        (sg,),
+        (1024,),
+        (
+            adj_matrix_cupy.indptr,
+            adj_matrix_cupy.indices,
+            adj_matrix_cupy.data,
+            data.indptr,
+            data.indices,
+            data.data,
+            n_samples,
+            n_features,
+            num,
+        ),
+    )
+
+    means = data.mean(axis=0).ravel()
+    den = cp.zeros(n_features, dtype=cp.float32)
+    counter = cp.zeros(n_features, dtype=cp.int32)
+    block_den = math.ceil(data.nnz / 32)
+    pre_den_kernel = cp.RawKernel(pre_den_calc_sparse, "pre_den_sparse_kernel")
+
+    pre_den_kernel(
+        (block_den,), (32,), (data.indices, data.data, data.nnz, means, den, counter)
+    )
+    counter = n_samples - counter
+    den += counter * means**2
+    den *= 2 * adj_matrix_cupy.sum()
 
     # Calculate Geary's C
     gearys_C = (n_samples - 1) * num / den
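The new denominator path never densifies `data`: the kernel accumulates $(x - \bar{x})^2$ over stored nonzeros only, while `counter` tracks how many nonzeros each gene has, so the $n - \mathrm{nnz}$ implicit zeros can be added afterwards as $\bar{x}^2$ each. A NumPy check of that identity:

```python
# NumPy check of the zero-aware denominator trick above; toy sizes.
import numpy as np
from scipy import sparse

X = sparse.random(100, 20, density=0.2, format="csr", dtype=np.float64, random_state=0)
n_samples, n_features = X.shape
means = np.asarray(X.mean(axis=0)).ravel()

# Accumulate (x - mean)^2 over stored nonzeros only, per gene
den = np.zeros(n_features)
counter = np.zeros(n_features, dtype=np.int64)
for col, val in zip(X.indices, X.data):
    den[col] += (val - means[col]) ** 2
    counter[col] += 1

# The implicit zeros each contribute mean^2
den += (n_samples - counter) * means**2

dense_ref = ((X.toarray() - means) ** 2).sum(axis=0)
assert np.allclose(den, dense_ref)
```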
@@ -136,30 +206,21 @@ def _gearys_C_cupy_sparse(data, adj_matrix_cupy, n_permutations=100):
             idx_shuffle = cp.random.permutation(adj_matrix_cupy.shape[0])
             adj_matrix_permuted = adj_matrix_cupy[idx_shuffle, :]
             num_permuted = cp.zeros(n_features, dtype=data.dtype)
-            for batch in range(n_batches):
-                start_idx = batch * batchsize
-                stop_idx = min(batch * batchsize + batchsize, n_features)
-                data_block = data[:, start_idx:stop_idx].toarray()
-                num_block = cp.zeros(data_block.shape[1], dtype=data.dtype)
-                fg = int(math.ceil(data_block.shape[1] / block_size))
-                grid_size = (fg, sg, 1)
-
-                num_kernel(
-                    grid_size,
-                    (block_size, block_size, 1),
-                    (
-                        data_block,
-                        adj_matrix_permuted.indptr,
-                        adj_matrix_permuted.indices,
-                        adj_matrix_permuted.data,
-                        num_block,
-                        n_samples,
-                        data_block.shape[1],
-                    ),
-                )
-                num_permuted[start_idx:stop_idx] = num_block
-
-            # den_permuted = 2 * adj_matrix_permuted.sum() * preden
+            num_kernel(
+                (sg,),
+                (1024,),
+                (
+                    adj_matrix_permuted.indptr,
+                    adj_matrix_permuted.indices,
+                    adj_matrix_permuted.data,
+                    data.indptr,
+                    data.indices,
+                    data.data,
+                    n_samples,
+                    n_features,
+                    num_permuted,
+                ),
+            )
             gearys_C_permutations[p, :] = (n_samples - 1) * num_permuted / den
         cp.cuda.Stream.null.synchronize()
     else:
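As a sanity check, the sparse and dense paths should agree numerically. A hedged sketch, assuming a CUDA-capable machine and importing the internal helpers by their private module path; tolerances are loose because the kernels accumulate in float32:

```python
# Hedged consistency check between the two internal code paths; the private
# import path and toy shapes are assumptions, not part of the public API.
import cupy as cp
import cupyx.scipy.sparse as sparse_gpu
from rapids_singlecell.squidpy_gpu._gearysc import (
    _gearys_C_cupy_dense,
    _gearys_C_cupy_sparse,
)

X = sparse_gpu.random(500, 100, density=0.1, format="csr", dtype=cp.float32)
adj = sparse_gpu.random(500, 500, density=0.05, format="csr", dtype=cp.float32)

C_sparse, _ = _gearys_C_cupy_sparse(X, adj, n_permutations=0)
C_dense, _ = _gearys_C_cupy_dense(X.toarray(), adj, n_permutations=0)

assert cp.allclose(C_sparse, C_dense, atol=1e-3)
```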
