Sparse CSC and CSR arrays backed by on-disk storage in Zarr or HDF5. Allows accessing slices of larger-than-memory arrays. Inspired by h5sparse and anndata.
pip install backedarray
import backedarray as ba
import scipy.sparse
import numpy as np
import h5py
import zarr
# Example data: a random 100 x 50 sparse matrix in CSR format with density 0.2,
# together with its dense equivalent.
csr_matrix = scipy.sparse.random(
    100,
    50,
    density=0.2,
    format="csr",
)
dense_array = csr_matrix.toarray()
# Write a sparse matrix (CSC or CSR format) to an HDF5 file.
h5_csr_path = 'csr.h5'
with h5py.File(h5_csr_path, "w") as f:
    # The matrix is written into a new "X" group of the file.
    ba.write_sparse(f.create_group("X"), csr_matrix)
# Write a sparse matrix (CSC or CSR format) to a Zarr store.
zarr_csr_path = 'csr.zarr'
with zarr.open(zarr_csr_path, mode="w") as f:
    # The matrix is written into a new "X" group of the store.
    ba.write_sparse(f.create_group("X"), csr_matrix)
# Open the HDF5 file in read mode; it stays open until the explicit close() further down.
h5_csr_file = h5py.File(h5_csr_path, "r")
# Open the stored "X" group of each container as a backed sparse array.
h5_csr_disk = ba.open(h5_csr_file["X"])
# No mode is passed to zarr.open here; this handle is also the one appended to later.
zarr_csr_disk = ba.open(zarr.open(zarr_csr_path)["X"])
# Slice rows 1 and 2 from the Zarr-backed array and convert the result to a dense array.
zarr_csr_disk[1:3].toarray()
array([[0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.06275782, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.61030855, 0.46886635, 0. , 0.11597629, 0. , 0. , 0. , 0.23471198, 0. , 0. , 0. , 0. , 0.4911036 , 0. , 0. , 0. , 0. , 0. , 0.00851426, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.10065413], [0. , 0. , 0. , 0. , 0. , 0.93545866, 0. , 0. , 0. , 0. , 0.26147665, 0. , 0.99931215, 0. , 0. , 0. , 0. , 0.18532786, 0. , 0.69309913, 0. , 0. , 0. , 0. , 0. , 0. , 0.32219088, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.14121076, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.70207481, 0. , 0. , 0. , 0. ]])
# Slice from row 2 to the end of the HDF5-backed array and convert to a dense array.
h5_csr_disk[2:].toarray()
array([[0. , 0. , 0. , ..., 0. , 0. , 0. ], [0. , 0. , 0. , ..., 0. , 0. , 0. ], [0. , 0.89758627, 0. , ..., 0. , 0. , 0. ], ..., [0. , 0. , 0. , ..., 0. , 0. , 0. ], [0. , 0. , 0. , ..., 0. , 0. , 0. ], [0.81611075, 0. , 0. , ..., 0.82151986, 0. , 0. ]])
# Index with Ellipsis to select the whole HDF5-backed array, then convert to a dense array.
h5_csr_disk[...].toarray()
array([[0. , 0.45873864, 0. , ..., 0. , 0. , 0. ], [0. , 0. , 0. , ..., 0. , 0. , 0.10065413], [0. , 0. , 0. , ..., 0. , 0. , 0. ], ..., [0. , 0. , 0. , ..., 0. , 0. , 0. ], [0. , 0. , 0. , ..., 0. , 0. , 0. ], [0.81611075, 0. , 0. , ..., 0.82151986, 0. , 0. ]])
# Done reading from HDF5: close the file handle opened earlier.
h5_csr_file.close()
# Append csr_matrix to the Zarr-backed array on disk.
zarr_csr_disk.append(csr_matrix)
# After the append, the on-disk array should equal csr_matrix stacked vertically on itself.
np.testing.assert_array_equal(zarr_csr_disk[...].toarray(), scipy.sparse.vstack((csr_matrix, csr_matrix)).toarray())
Read h5ad files created using anndata
%%bash
# Fetch the example dataset unless a local copy already exists.
[ -f "pbmc3k.h5ad" ] || wget -q https://raw.githubusercontent.com/chanzuckerberg/cellxgene/main/example-dataset/pbmc3k.h5ad
import anndata.experimental
with h5py.File('pbmc3k.h5ad', 'r') as f:
    # Read the "obs" and "var" elements with anndata's element reader.
    obs = anndata.experimental.read_elem(f['obs'])
    var = anndata.experimental.read_elem(f['var'])
    # "X" is opened as a backed array instead of being read through read_elem.
    # NOTE(review): X is backed by f, so presumably it must be sliced while
    # this file is still open — confirm.
    X = ba.open(f['X'])