Sparse spatial autocorrelation (#93)

Intron7 authored Nov 14, 2023
1 parent f9456fc · commit 7b7f551
Showing 4 changed files with 291 additions and 132 deletions.
3 changes: 2 additions & 1 deletion docs/release-notes/0.9.3.md
@@ -3,7 +3,8 @@
 ```{rubric} Features
 ```
 
-* `neighbors` now works with `raft` and better supports approximate search with `cagra`, `ivfpq` and `ivfflat` {pr}`89` {smaller}`S Dicks`
+* `neighbors` now works with `raft` and better supports approximate search with `cagra`, `ivfpq` and `ivfflat` {pr}`89` {smaller}`S Dicks`
+* `spatial_autocorr` now works with sparse data matrices without densifying. It will use the sparse matrix by default. {pr}`93` {smaller}`S Dicks`
 
 ```{rubric} Bug fixes
 ```
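A minimal usage sketch of the new default behaviour. This assumes the function is exposed as `rsc.gr.spatial_autocorr` (mirroring squidpy's API) and that a spatial neighbour graph is already present in `adata.obsp`; the file name is hypothetical:

```python
# Hedged sketch: assumes rapids-singlecell exposes spatial_autocorr under
# rsc.gr (mirroring squidpy) and that a spatial connectivity graph has
# already been computed; "visium.h5ad" is a hypothetical file.
import scanpy as sc
import rapids_singlecell as rsc

adata = sc.read_h5ad("visium.h5ad")

# With a sparse adata.X, the matrix now stays sparse on the GPU by default
df = rsc.gr.spatial_autocorr(adata, mode="moran", use_sparse=True, copy=True)

# Densify explicitly only if the old behaviour is needed
df_dense = rsc.gr.spatial_autocorr(adata, mode="moran", use_sparse=False, copy=True)
```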
33 changes: 22 additions & 11 deletions src/rapids_singlecell/squidpy_gpu/_autocorr.py
@@ -29,7 +29,7 @@ def spatial_autocorr(
     corr_method: Union[str, None] = "fdr_bh",
     layer: Union[str, None] = None,
     use_raw: bool = False,
-    use_sparse: bool = False,
+    use_sparse: bool = True,
     copy: bool = False,
 ) -> Optional[pd.DataFrame]:
     """
@@ -68,7 +68,7 @@
     use_raw
         If True, use the raw data in the AnnData object, by default False.
     use_sparse
-        If True, use a sparse representation for the input matrix `vals` when it is a sparse matrix, by default False.
+        If True, use a sparse representation for the input matrix `vals` when it is a sparse matrix, by default True.
     copy
         If True, return the results as a DataFrame instead of storing them in `adata.uns`, by default False.
@@ -93,7 +93,7 @@
             vals = adata.raw[:, genes].X
     else:
         if layer:
-            vals = adata[:, genes].layers["layer"]
+            vals = adata[:, genes].layers[layer]
         else:
             vals = adata[:, genes].X
     # create Adj-Matrix
@@ -108,14 +108,25 @@
 
     params = {"two_tailed": two_tailed}
 
-    # check sparse:
-    if use_sparse:
-        vals = sparse.csr_matrix(vals)
-        data = sparse_gpu.csr_matrix(vals, dtype=cp.float32)
-    else:
-        if sparse.issparse(vals):
-            vals = vals.toarray()
-        data = cp.array(vals, dtype=cp.float32)
+    def process_input_data(vals, use_sparse):
+        # Check if input is already a sparse matrix
+        is_sparse_input = sparse.issparse(vals) or sparse_gpu.isspmatrix(vals)
+        if use_sparse and is_sparse_input:
+            # Convert to CuPy CSR format if not already in sparse GPU format
+            data = (
+                sparse_gpu.csr_matrix(vals.tocsr(), dtype=cp.float32)
+                if not sparse_gpu.isspmatrix(vals)
+                else vals.tocsr().astype(cp.float32)
+            )
+        elif is_sparse_input:
+            # Convert sparse input to dense format
+            data = cp.array(vals.toarray(), dtype=cp.float32)
+        else:
+            # Keep dense input as is
+            data = cp.array(vals, dtype=cp.float32)
+        return data
+
+    data = process_input_data(vals, use_sparse)
     # Run Spatial Autocorr
     if mode == "moran":
         score, score_perms = _morans_I_cupy(
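For reference, a self-contained sketch of the dispatch that `process_input_data` performs, runnable with only CuPy and SciPy; shapes and density are illustrative:

```python
# Standalone sketch of the dispatch logic above; assumes a CUDA-capable
# machine with CuPy installed. Shapes and density are illustrative.
import cupy as cp
import cupyx.scipy.sparse as sparse_gpu
import numpy as np
from scipy import sparse

vals = sparse.random(200, 50, density=0.1, format="csr", dtype=np.float32)

# use_sparse=True: keep the CSR structure and move it to the GPU
data_sparse = sparse_gpu.csr_matrix(vals.tocsr(), dtype=cp.float32)

# use_sparse=False: densify on the host, then transfer
data_dense = cp.array(vals.toarray(), dtype=cp.float32)

# Both paths feed the downstream kernels with identical values
assert cp.allclose(data_sparse.toarray(), data_dense)
```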
181 changes: 121 additions & 60 deletions src/rapids_singlecell/squidpy_gpu/_gearysc.py
@@ -3,6 +3,8 @@
 import cupy as cp
 from cupyx.scipy import sparse
 
+from ._moransi import pre_den_calc_sparse
+
 kernel_gearys_C_num_dense = r"""
 extern "C" __global__ void gearys_C_num_dense(const float* data,
     const int* adj_matrix_row_ptr, const int* adj_matrix_col_ind, const float* adj_matrix_data,
@@ -25,6 +27,77 @@
         }
     }
 }
 """
+kernel_gearys_C_num_sparse = r"""
+extern "C" __global__
+void gearys_C_num_sparse(const int* adj_matrix_row_ptr, const int* adj_matrix_col_ind, const float* adj_matrix_data,
+                        const int* data_row_ptr, const int* data_col_ind, const float* data_values,
+                        const int n_samples, const int n_features,
+                        float* num) {
+    int i = blockIdx.x;
+    int numThreads = blockDim.x;
+    int threadid = threadIdx.x;
+    // Create cache
+    __shared__ float cell1[3072];
+    __shared__ float cell2[3072];
+    int numruns = (n_features + 3072 - 1) / 3072;
+    if (i >= n_samples) {
+        return;
+    }
+    int k_start = adj_matrix_row_ptr[i];
+    int k_end = adj_matrix_row_ptr[i + 1];
+    for (int k = k_start; k < k_end; ++k) {
+        int j = adj_matrix_col_ind[k];
+        float edge_weight = adj_matrix_data[k];
+        int cell1_start = data_row_ptr[i];
+        int cell1_stop = data_row_ptr[i+1];
+        int cell2_start = data_row_ptr[j];
+        int cell2_stop = data_row_ptr[j+1];
+        for (int batch_runner = 0; batch_runner < numruns; batch_runner++) {
+            // Set cache to 0
+            for (int idx = threadid; idx < 3072; idx += numThreads) {
+                cell1[idx] = 0.0f;
+                cell2[idx] = 0.0f;
+            }
+            __syncthreads();
+            int batch_start = 3072 * batch_runner;
+            int batch_end = 3072 * (batch_runner + 1);
+            // Densify sparse rows into cache
+            for (int cell1_idx = cell1_start + threadid; cell1_idx < cell1_stop; cell1_idx += numThreads) {
+                int gene_id = data_col_ind[cell1_idx];
+                if (gene_id >= batch_start && gene_id < batch_end) {
+                    cell1[gene_id % 3072] = data_values[cell1_idx];
+                }
+            }
+            __syncthreads();
+            for (int cell2_idx = cell2_start + threadid; cell2_idx < cell2_stop; cell2_idx += numThreads) {
+                int gene_id = data_col_ind[cell2_idx];
+                if (gene_id >= batch_start && gene_id < batch_end) {
+                    cell2[gene_id % 3072] = data_values[cell2_idx];
+                }
+            }
+            __syncthreads();
+            // Calc num
+            for (int gene = threadid; gene < 3072; gene += numThreads) {
+                int global_gene_index = batch_start + gene;
+                if (global_gene_index < n_features) {
+                    float diff_sq = (cell1[gene] - cell2[gene]) * (cell1[gene] - cell2[gene]);
+                    atomicAdd(&num[global_gene_index], edge_weight * diff_sq);
+                }
+            }
+        }
+    }
+}
+"""


@@ -78,7 +151,6 @@ def _gearys_C_cupy_dense(data, adj_matrix_cupy, n_permutations=100):
                     n_samples,
                     n_features,
                 ),
             )
-            # den_permuted = 2 * adj_matrix_permuted.sum() * preden
             gearys_C_permutations[p, :] = (n_samples - 1) * num_permuted / den
         cp.cuda.Stream.null.synchronize()
     else:
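The permutation test shuffles whole rows of the adjacency matrix, which reassigns each cell's neighbourhood while leaving `den` unchanged (the weight sum and the per-gene variance do not depend on the permutation). A NumPy sketch of the same scheme on toy data:

```python
# Sketch of the permutation scheme used above, in NumPy for clarity;
# dense x (n_samples, n_features) and adjacency W are illustrative toys.
import numpy as np

rng = np.random.default_rng(0)
n, g = 50, 4
x = rng.random((n, g)).astype(np.float32)
W = (rng.random((n, n)) < 0.1).astype(np.float32)

den = 2 * W.sum() * ((x - x.mean(axis=0)) ** 2).sum(axis=0)

def gearys_num(W, x):
    # sum_ij w_ij * (x_i - x_j)^2, computed per gene
    diff = x[:, None, :] - x[None, :, :]
    return np.einsum("ij,ijg->g", W, diff**2)

C = (n - 1) * gearys_num(W, x) / den

# Null distribution: shuffle whole rows of W, exactly as the CUDA code does
C_perms = np.stack([
    (n - 1) * gearys_num(W[rng.permutation(n), :], x) / den
    for _ in range(100)
])
```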
@@ -88,43 +160,41 @@ def _gearys_C_cupy_dense(data, adj_matrix_cupy, n_permutations=100):
 
 def _gearys_C_cupy_sparse(data, adj_matrix_cupy, n_permutations=100):
     n_samples, n_features = data.shape
-    # Calculate the denominator for Geary's C
-    gene_mean = data.mean(axis=0).ravel()
-    preden = cp.sum((data - gene_mean) ** 2, axis=0)
-    den = 2 * adj_matrix_cupy.sum() * preden
-
-    # Calculate the numerator for Geary's C
-    data = data.tocsc()
-    num = cp.zeros(n_features, dtype=data.dtype)
-    block_size = 8
-    sg = int(math.ceil(n_samples / block_size))
-    num_kernel = cp.RawKernel(kernel_gearys_C_num_dense, "gearys_C_num_dense")
-    batchsize = 1000
-    n_batches = math.ceil(n_features / batchsize)
-    for batch in range(n_batches):
-        start_idx = batch * batchsize
-        stop_idx = min(batch * batchsize + batchsize, n_features)
-        data_block = data[:, start_idx:stop_idx].toarray()
-        num_block = cp.zeros(data_block.shape[1], dtype=data.dtype)
-        fg = int(math.ceil(data_block.shape[1] / block_size))
-        grid_size = (fg, sg, 1)
-
-        num_kernel(
-            grid_size,
-            (block_size, block_size, 1),
-            (
-                data_block,
-                adj_matrix_cupy.indptr,
-                adj_matrix_cupy.indices,
-                adj_matrix_cupy.data,
-                num_block,
-                n_samples,
-                data_block.shape[1],
-            ),
-        )
-        num[start_idx:stop_idx] = num_block
-    cp.cuda.Stream.null.synchronize()
+    # Calculate the numerator for Geary's C
+    num = cp.zeros(n_features, dtype=cp.float32)
+    num_kernel = cp.RawKernel(kernel_gearys_C_num_sparse, "gearys_C_num_sparse")
+
+    n_samples, n_features = data.shape
+    sg = n_samples
+    # Launch the kernel
+    num_kernel(
+        (sg,),
+        (1024,),
+        (
+            adj_matrix_cupy.indptr,
+            adj_matrix_cupy.indices,
+            adj_matrix_cupy.data,
+            data.indptr,
+            data.indices,
+            data.data,
+            n_samples,
+            n_features,
+            num,
+        ),
+    )
+
+    means = data.mean(axis=0).ravel()
+    den = cp.zeros(n_features, dtype=cp.float32)
+    counter = cp.zeros(n_features, dtype=cp.int32)
+    block_den = math.ceil(data.nnz / 32)
+    pre_den_kernel = cp.RawKernel(pre_den_calc_sparse, "pre_den_sparse_kernel")
+
+    pre_den_kernel(
+        (block_den,), (32,), (data.indices, data.data, data.nnz, means, den, counter)
+    )
+    counter = n_samples - counter
+    den += counter * means**2
+    den *= 2 * adj_matrix_cupy.sum()
 
     # Calculate Geary's C
     gearys_C = (n_samples - 1) * num / den
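The new denominator path never densifies `data`: the kernel accumulates $(x - \bar{x})^2$ over stored nonzeros only, while `counter` tracks how many nonzeros each gene has, so the $n - \mathrm{nnz}$ implicit zeros can be added afterwards as $\bar{x}^2$ each. A NumPy check of that identity:

```python
# NumPy check of the zero-aware denominator trick above; toy sizes.
import numpy as np
from scipy import sparse

X = sparse.random(100, 20, density=0.2, format="csr", dtype=np.float64, random_state=0)
n_samples, n_features = X.shape
means = np.asarray(X.mean(axis=0)).ravel()

# Accumulate (x - mean)^2 over stored nonzeros only, per gene
den = np.zeros(n_features)
counter = np.zeros(n_features, dtype=np.int64)
for col, val in zip(X.indices, X.data):
    den[col] += (val - means[col]) ** 2
    counter[col] += 1

# The implicit zeros each contribute mean^2
den += (n_samples - counter) * means**2

dense_ref = ((X.toarray() - means) ** 2).sum(axis=0)
assert np.allclose(den, dense_ref)
```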
@@ -136,30 +206,21 @@ def _gearys_C_cupy_sparse(data, adj_matrix_cupy, n_permutations=100):
             idx_shuffle = cp.random.permutation(adj_matrix_cupy.shape[0])
             adj_matrix_permuted = adj_matrix_cupy[idx_shuffle, :]
             num_permuted = cp.zeros(n_features, dtype=data.dtype)
-            for batch in range(n_batches):
-                start_idx = batch * batchsize
-                stop_idx = min(batch * batchsize + batchsize, n_features)
-                data_block = data[:, start_idx:stop_idx].toarray()
-                num_block = cp.zeros(data_block.shape[1], dtype=data.dtype)
-                fg = int(math.ceil(data_block.shape[1] / block_size))
-                grid_size = (fg, sg, 1)
-
-                num_kernel(
-                    grid_size,
-                    (block_size, block_size, 1),
-                    (
-                        data_block,
-                        adj_matrix_permuted.indptr,
-                        adj_matrix_permuted.indices,
-                        adj_matrix_permuted.data,
-                        num_block,
-                        n_samples,
-                        data_block.shape[1],
-                    ),
-                )
-                num_permuted[start_idx:stop_idx] = num_block
-
-            # den_permuted = 2 * adj_matrix_permuted.sum() * preden
+            num_kernel(
+                (sg,),
+                (1024,),
+                (
+                    adj_matrix_permuted.indptr,
+                    adj_matrix_permuted.indices,
+                    adj_matrix_permuted.data,
+                    data.indptr,
+                    data.indices,
+                    data.data,
+                    n_samples,
+                    n_features,
+                    num_permuted,
+                ),
+            )
             gearys_C_permutations[p, :] = (n_samples - 1) * num_permuted / den
         cp.cuda.Stream.null.synchronize()
     else:
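As a sanity check, the sparse and dense paths should agree numerically. A hedged sketch, assuming a CUDA-capable machine and importing the internal helpers by their private module path; tolerances are loose because the kernels accumulate in float32:

```python
# Hedged consistency check between the two internal code paths; the private
# import path and toy shapes are assumptions, not part of the public API.
import cupy as cp
import cupyx.scipy.sparse as sparse_gpu
from rapids_singlecell.squidpy_gpu._gearysc import (
    _gearys_C_cupy_dense,
    _gearys_C_cupy_sparse,
)

X = sparse_gpu.random(500, 100, density=0.1, format="csr", dtype=cp.float32)
adj = sparse_gpu.random(500, 500, density=0.05, format="csr", dtype=cp.float32)

C_sparse, _ = _gearys_C_cupy_sparse(X, adj, n_permutations=0)
C_dense, _ = _gearys_C_cupy_dense(X.toarray(), adj, n_permutations=0)

assert cp.allclose(C_sparse, C_dense, atol=1e-3)
```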
