From f2979b6ed734a7c1dccb83fef3f73bcc0c8edffe Mon Sep 17 00:00:00 2001 From: Jayaram Kancherla Date: Tue, 18 Jun 2024 23:38:38 -0700 Subject: [PATCH] Rewriting the build process (#2) Update documentation and tests --- README.md | 40 +- docs/index.md | 6 +- docs/tutorial.md | 7 +- src/cellarr/CellArrDataset.py | 211 ++++++-- src/cellarr/__init__.py | 3 +- src/cellarr/build_cellarrdataset.py | 414 --------------- src/cellarr/build_options.py | 125 +++++ src/cellarr/buildutils_cellarrdataset.py | 476 ++++++++++++++++++ ...db_array.py => buildutils_tiledb_array.py} | 28 +- ...db_frame.py => buildutils_tiledb_frame.py} | 0 src/cellarr/queryutils_tiledb_frame.py | 78 +++ src/cellarr/utils_anndata.py | 68 +-- tests/test_build.py | 23 +- tests/test_dataset.py | 66 --- tests/test_query.py | 59 +++ 15 files changed, 1008 insertions(+), 596 deletions(-) delete mode 100644 src/cellarr/build_cellarrdataset.py create mode 100644 src/cellarr/build_options.py create mode 100644 src/cellarr/buildutils_cellarrdataset.py rename src/cellarr/{utils_tiledb_array.py => buildutils_tiledb_array.py} (83%) rename src/cellarr/{utils_tiledb_frame.py => buildutils_tiledb_frame.py} (100%) create mode 100644 src/cellarr/queryutils_tiledb_frame.py delete mode 100644 tests/test_dataset.py create mode 100644 tests/test_query.py diff --git a/README.md b/README.md index 3a147fa..ef63b65 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,3 @@ - - [![PyPI-Server](https://img.shields.io/pypi/v/cellarr.svg)](https://pypi.org/project/cellarr/) ![Unit tests](https://github.com/BiocPy/cellarr/actions/workflows/pypi-test.yml/badge.svg) @@ -30,9 +18,9 @@ pip install cellarr ## Usage -### Create a `CellArrDataset` +### Build a `CellArrDataset` -Creating a `CellArrDataset` generates three TileDB files in the specified output directory: +Building a `CellArrDataset` generates three TileDB files in the specified output directory: - `gene_metadata`: Contains feature annotations. - `cell_metadata`: Contains cell or sample metadata. @@ -67,11 +55,29 @@ dataset = build_cellarrdataset( matrix_dim_dtype=np.float32 ) ``` + +The build process usually involves 3 steps: + +1. **Scan the Collection**: Scan the entire collection of files to create +a unique set of gene symbols. Store this gene set as the +`gene_metadata` TileDB file. +2. **Store Cell Metadata**: Store cell metadata as the +`cell_metadata` TileDB file. +3. **Remap and Orient Data**: For each dataset in the collection, +remap and orient the gene dimension using the gene set from Step 1. +This step ensures consistency in gene measurement and order, even if +some genes are unmeasured or ordered differently in the original experiments. + +***Note: The objects to build the `CellArrDataset` are expected to be fairly consistent, especially along the feature dimension. +if these are `AnnData` or `H5AD`objects, all objects must contain an index (in the `var` slot) specifying the gene symbols.*** + +Check out the [documentation](https://biocpy.github.io/cellarr/tutorial.html) for more details. + ---- -#### TODO: This following section does not work yet. +#### TODO: This following section does not work yet -Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializng it to the path where the files were created. +Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializing it to the path where the files were created. ```python # Create a CellArrDataset object from the existing dataset @@ -86,4 +92,4 @@ expression_data = dataset[10, ["gene1", "gene10", "gene500"]] ## Note This project has been set up using PyScaffold 4.5. For details and usage -information on PyScaffold see https://pyscaffold.org/. +information on PyScaffold see . diff --git a/docs/index.md b/docs/index.md index 7283ccf..8f9872f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,11 +16,11 @@ pip install cellarr :maxdepth: 2 Overview -Contributions & Help -License +Module Reference Authors Changelog -Module Reference +Contributions & Help +License ``` ## Indices and tables diff --git a/docs/tutorial.md b/docs/tutorial.md index 127c86f..f9fa957 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -10,9 +10,9 @@ Cell Arrays is a Python package that provides a TileDB-backed store for large co ## Usage -### Create the `CellArrDataset` +### Build the `CellArrDataset` -Creating a CellArrDataset generates three TileDB files in the specified output directory: +Creating a `CellArrDataset` generates three TileDB files in the specified output directory: - `gene_metadata`: Contains feature annotations. - `cell_metadata`: Contains cell or sample metadata. @@ -43,11 +43,12 @@ dataset = build_cellarrdataset( matrix_dim_dtype=np.float32 ) ``` + ---- #### TODO: This following section does not work yet. -Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializng it to the path where the files were created. +Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializing it to the path where the files were created. ```python # Create a CellArrDataset object from the existing dataset diff --git a/src/cellarr/CellArrDataset.py b/src/cellarr/CellArrDataset.py index 0dd467a..536cd8a 100644 --- a/src/cellarr/CellArrDataset.py +++ b/src/cellarr/CellArrDataset.py @@ -1,34 +1,46 @@ import os -from typing import Union +from typing import List, Union, Sequence +import pandas as pd import tiledb +from . import queryutils_tiledb_frame as qtd + __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" __license__ = "MIT" class CellArrDataset: - """A class that represent a collection of cells in TileDB.""" + """A class that represent a collection of cells and their associated metadata in a TileDB backed store.""" def __init__( self, dataset_path: str, - counts_tdb_uri: str = "counts", - gene_metadata_uri: str = "gene_metadata", + matrix_tdb_uri: str = "counts", + gene_annotation_uri: str = "gene_annotation", cell_metadata_uri: str = "cell_metadata", + sample_metadata_uri: str = "sample_metadata", ): - """Initialize a ``CellArr`` dataset. + """Initialize a ``CellArrDataset``. Args: + dataset_path: + Path to the directory containing the tiledb stores. + Usually the ``output_path`` from the + :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`. + counts_tdb_uri: - Path to counts TileDB. + Relative path to matrix store. - gene_metadata_uri: - Path to gene metadata TileDB. + gene_annotation_uri: + Relative path to gene annotation store. cell_metadata_uri: - Path to cell metadata TileDB. + Relative path to cell metadata store. + + sample_metadata_uri: + Relative path to sample metadata store. """ if not os.path.isdir(dataset_path): @@ -36,71 +48,132 @@ def __init__( self._dataset_path = dataset_path # TODO: Maybe switch to on-demand loading of these objects - self._counts_tdb_tdb = tiledb.open(f"{dataset_path}/{counts_tdb_uri}", "r") - self._gene_metadata_tdb = tiledb.open( - f"{dataset_path}/{gene_metadata_uri}", "r" + self._matrix_tdb_tdb = tiledb.open(f"{dataset_path}/{matrix_tdb_uri}", "r") + self._gene_annotation_tdb = tiledb.open( + f"{dataset_path}/{gene_annotation_uri}", "r" ) self._cell_metadata_tdb = tiledb.open( f"{dataset_path}/{cell_metadata_uri}", "r" ) def __del__(self): - self._counts_tdb_tdb.close() - self._gene_metadata_tdb.close() + self._matrix_tdb_tdb.close() + self._gene_annotation_tdb.close() self._cell_metadata_tdb.close() - # TODO: - # Methods to implement - # search by gene - # search by cell metadata - # slice counts after search + def get_cell_metadata_columns(self) -> List[str]: + """Get column names from ``cell_metadata`` store. - def get_cell_metadata_columns(self): - columns = [] - for i in range(self._cell_metadata_tdb.schema.nattr): - columns.append(self._cell_metadata_tdb.schema.attr(i).name) + Returns: + List of available metadata columns. + """ + return qtd.get_schema_names_frame(self._cell_metadata_tdb) - return columns + def get_cell_metadata_column(self, column_name: str) -> list: + """Access a column from the ``cell_metadata`` store. - def get_cell_metadata_column(self, column_name: str): - return self._cell_metadata_tdb.query(attrs=[column_name]).df[:] + Args: + column_name: + Name of the column or attribute. Usually one of the column names + from of :py:meth:`~get_cell_metadata_columns`. + + Returns: + A list of values for this column. + """ + return qtd.get_a_column(self._cell_metadata_tdb, column_name=column_name) def get_cell_subset( self, subset: Union[slice, tiledb.QueryCondition], columns=None - ): - if columns is None: - columns = self.get_cell_metadata_columns() + ) -> pd.DataFrame: + """Slice the ``cell_metadata`` store. + + Args: + subset: + A list of integer indices to subset the ``cell_metadata`` + store. + + Alternatively, may also provide a + :py:class:`tiledb.QueryCondition` to query the store. - query = self._cell_metadata_tdb.query(cond=subset, attrs=columns) - data = query.df[:] - result = data.dropna() - return result + columns: + List of specific column names to access. - def get_gene_metadata_columns(self): - columns = [] - for i in range(self._gene_metadata_tdb.schema.nattr): - columns.append(self._gene_metadata_tdb.schema.attr(i).name) + Defaults to None, in which case all columns are extracted. - return columns + Returns: + A pandas Dataframe of the subset. + """ + return qtd.subset_frame(self._cell_metadata_tdb, subset=subset, columns=columns) + + def get_gene_metadata_columns(self) -> List[str]: + """Get annotation column names from ``gene_metadata`` store. + + Returns: + List of available annotations. + """ + return qtd.get_schema_names_frame(self._gene_annotation_tdb) def get_gene_metadata_column(self, column_name: str): - return self._gene_metadata_tdb.query(attrs=[column_name]).df[:] + """Access a column from the ``gene_metadata`` store. + + Args: + column_name: + Name of the column or attribute. Usually one of the column names + from of :py:meth:`~get_gene_metadata_columns`. + + Returns: + A list of values for this column. + """ + return qtd.get_a_column(self._gene_annotation_tdb, column_name=column_name) + + def get_gene_metadata_index(self): + """Get index of the ``gene_metadata`` store. This typically should store all unique gene symbols. + + Returns: + List of unique symbols. + """ + return qtd.get_index(self._gene_annotation_tdb) + + def _get_indices_for_gene_list(self, query: list) -> List[int]: + _gene_index = self.get_gene_metadata_index() + return qtd._match_to_list(_gene_index, query=query) def get_gene_subset( - self, subset: Union[slice, tiledb.QueryCondition], columns=None + self, subset: Union[slice, List[str], tiledb.QueryCondition], columns=None ): - if columns is None: - columns = self.get_gene_metadata_columns() + """Slice the ``gene_metadata`` store. + + Args: + subset: + A list of integer indices to subset the ``gene_metadata`` + store. + + Alternatively, may provide a + :py:class:`tiledb.QueryCondition` to query the store. + + Alternatively, may provide a list of strings to match with + the index of ``gene_metadata`` store. + + columns: + List of specific column names to access. + + Defaults to None, in which case all columns are extracted. + + Returns: + A pandas Dataframe of the subset. + """ + + if qtd._is_list_strings(subset): + subset = self._get_indices_for_gene_list(subset) - query = self._gene_metadata_tdb.query(cond=subset, attrs=columns) - data = query.df[:] - result = data.dropna() - return result + return qtd.subset_frame( + self._gene_annotation_tdb, subset=subset, columns=columns + ) def get_slice( self, cell_subset: Union[slice, tiledb.QueryCondition], - gene_subset: Union[slice, tiledb.QueryCondition], + gene_subset: Union[slice, List[str], tiledb.QueryCondition], ): _csubset = self.get_cell_subset(cell_subset) _cell_indices = _csubset.index.tolist() @@ -108,4 +181,46 @@ def get_slice( _gsubset = self.get_gene_subset(gene_subset) _gene_indices = _gsubset.index.tolist() - return self._counts_tdb_tdb.multi_index[_cell_indices, _gene_indices] + return self._matrix_tdb_tdb.multi_index[_cell_indices, _gene_indices] + + def __getitem__( + self, + args: Union[int, str, Sequence, tuple], + ): + """Subset a ``CellArrDataset``. + + Args: + args: + Integer indices, a boolean filter, or (if the current object is + named) names specifying the ranges to be extracted. + + Alternatively a tuple of length 1. The first entry specifies + the rows to retain based on their names or indices. + + Alternatively a tuple of length 2. The first entry specifies + the rows to retain, while the second entry specifies the + columns to retain, based on their names or indices. + + Raises: + ValueError: + If too many or too few slices provided. + """ + if isinstance(args, (str, int)): + return self.get_slice(args, slice(None)) + + if isinstance(args, tuple): + if len(args) == 0: + raise ValueError("At least one slicing argument must be provided.") + + if len(args) == 1: + return self.get_slice(args[0], slice(None)) + elif len(args) == 2: + return self.get_slice(args[0], args[1]) + else: + raise ValueError( + f"`{type(self).__name__}` only supports 2-dimensional slicing." + ) + + raise TypeError( + "args must be a sequence or a scalar integer or string or a tuple of atmost 2 values." + ) diff --git a/src/cellarr/__init__.py b/src/cellarr/__init__.py index e7c6f0c..3bcc975 100644 --- a/src/cellarr/__init__.py +++ b/src/cellarr/__init__.py @@ -15,5 +15,6 @@ finally: del version, PackageNotFoundError -from .build_cellarrdataset import build_cellarrdataset +from .build_options import CellMetadataOptions, GeneAnnotationOptions, MatrixOptions, SampleMetadataOptions +from .buildutils_cellarrdataset import build_cellarrdataset from .CellArrDataset import CellArrDataset diff --git a/src/cellarr/build_cellarrdataset.py b/src/cellarr/build_cellarrdataset.py deleted file mode 100644 index 2e0eb9c..0000000 --- a/src/cellarr/build_cellarrdataset.py +++ /dev/null @@ -1,414 +0,0 @@ -"""Build the `CellArrDatset` - -The `CellArrDataset` method is designed to store single-cell RNA-seq -datasets but can be generalized to store any 2-dimensional experimental data. - -This method creates three TileDB files in the directory specified by `output_path`: -- `gene_metadata`: A TileDB file containing gene metadata. -- `cell_metadata`: A TileDB file containing cell metadata. -- A matrix TileDB file named as specified by the `layer_matrix_name` parameter. - -The TileDB matrix file is stored in a cell X gene orientation. This orientation -is chosen because the fastest-changing dimension as new files are added to the -collection is usually the cells rather than genes. - -## Process - -1. **Scan the Collection**: Scan the entire collection of files to create -a unique set of gene symbols. Store this gene set as the -`gene_metadata` TileDB file. -2. **Store Cell Metadata**: Store cell metadata as the -`cell_metadata` TileDB file. -3. **Remap and Orient Data**: For each dataset in the collection, -remap and orient the gene dimension using the gene set from Step 1. -This step ensures consistency in gene measurement and order, even if -some genes are unmeasured or ordered differently in the original experiments. - - -Example: - - .. code-block:: python - - import anndata - import numpy as np - import tempfile - from cellarr import build_cellarrdataset, CellArrDataset - - # Create a temporary directory - tempdir = tempfile.mkdtemp() - - # Read AnnData objects - adata1 = anndata.read_h5ad("path/to/object1.h5ad") - # or just provide the path - adata2 = "path/to/object2.h5ad" - - # Build CellArrDataset - dataset = build_cellarrdataset( - output_path=tempdir, - h5ad_or_adata=[adata1, adata2], - matrix_dim_dtype=np.float32 - ) -""" - -import os -import warnings -from typing import List, Union - -import anndata -import numpy as np -import pandas as pd - -from . import utils_anndata as uad -from . import utils_tiledb_array as uta -from . import utils_tiledb_frame as utf -from .CellArrDataset import CellArrDataset - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -def build_cellarrdataset( - files: List[Union[str, anndata.AnnData]], - output_path: str, - num_cells: int = None, - num_genes: int = None, - cell_metadata: Union[pd.DataFrame, str] = None, - gene_metadata: Union[List[str], dict, str, pd.DataFrame] = None, - var_gene_column: str = "index", - layer_matrix_name: str = "counts", - skip_gene_tiledb: bool = False, - skip_cell_tiledb: bool = False, - skip_matrix_tiledb: bool = False, - cell_dim_dtype: np.dtype = np.uint32, - gene_dim_dtype: np.dtype = np.uint32, - matrix_dim_dtype: np.dtype = np.uint32, - optimize_tiledb: bool = True, - num_threads: int = 1, -): - """Generate the `CellArrDataset`. - - All files are expected to be consistent and any modifications - to make them consistent is outside the scope of this function - and package. - - There's a few assumptions this process makes: - - If object in ``files`` is an AnnData or H5AD object, these must - contain an assay matrix in layer names as ``layer_matrix_name`` - parameter. - - Feature information must contain a column defined by - ``var_gene_column`` that contains gene symbols or a common entity - across all files. - - - - Args: - files: - List of file paths to `H5AD` or ``AnnData`` objects. - Each object in this list must contain - - gene symbols as index or the column specified by - ``var_gene_column``. - - Must contain a layers with a matrix named as - ``layer_matrix_name``. - - output_path: - Path to where the output tiledb files should be stored. - - num_cells: - Number of cells across all files. - - Defualts to None, in which case, automatically inferred by - scanning all objects in ``h5ad_or_adata``. - - num_genes: - Number of genes across all cells. - - Defualts to None, in which case, automatically inferred by - scanning all objects in ``h5ad_or_adata``. - - cell_metadata: - Path to the file containing a concatenated cell metadata across - all cells. In this case, the first row is expected to contain the - column names. - - Alternatively, may also provide a dataframe containing the cell - metadata across all objects. - - Regardless of the input type, the number of rows in the file or - DataFrame must match the ``num_cells`` argument. - - Defaults to None, then a simple range index is created using the - ``num_cells`` argument. - - gene_metadata: - Path to the file containing a concatenated gene annotations across - all datasets. In this case, the first row is - expected to contain the column names and an index column - containing the gene symbols to remap the matrix. - - Alternatively, may also provide a dataframe containing the gene - annotations across all objects. - - Alternatively, a list or a dictionary of gene symbols. - - Regardless of the input type, the number of rows in the file or - DataFrame must match the ``num_genes`` argument. - - Defaults to None, then a gene set is generated by scanning all - objects in ``h5ad_or_adata``. - - var_gene_column: - Column name from ``var`` slot that contains the gene symbols. - Must be consistent across all objects in ``h5ad_or_adata``. - - Defaults to "index". - - layer_matrix_name: - Matrix name from ``layers`` slot to add to tiledb. - Must be consistent across all objects in ``h5ad_or_adata``. - - Defaults to "counts". - - skip_gene_tiledb: - Whether to skip generating gene metadata tiledb. - - Defaults to False. - - skip_cell_tiledb: - Whether to skip generating cell metadata tiledb. - - Defaults to False. - - skip_matrix_tiledb: - Whether to skip generating matrix tiledb. - - Defaults to False. - - cell_dim_dtype: - NumPy dtype for the cell dimension. - Defaults to np.uint32. - - Note: make sure the number of cells fit - within the range limits of unsigned-int32. - - gene_dim_dtype: - NumPy dtype for the gene dimension. - Defaults to np.uint32. - - Note: make sure the number of genes fit - within the range limits of unsigned-int32. - - matrix_dim_dtype: - NumPy dtype for the values in the matrix. - Defaults to np.uint32. - - Note: make sure the matrix values fit - within the range limits of unsigned-int32. - - num_threads: - Number of threads. - - Defaults to 1. - """ - if not os.path.isdir(output_path): - raise ValueError("'output_path' must be a directory.") - - if gene_metadata is None: - warnings.warn( - "Scanning all files for gene symbols, this may take long", UserWarning - ) - gene_set = uad.scan_for_genes( - files, var_gene_column=var_gene_column, num_threads=num_threads - ) - - gene_set = sorted(gene_set) - - gene_metadata = pd.DataFrame({"genes": gene_set}, index=gene_set) - elif isinstance(gene_metadata, list): - _gene_list = sorted(list(set(gene_metadata))) - gene_metadata = pd.DataFrame({"genes": _gene_list}, index=_gene_list) - elif isinstance(gene_metadata, dict): - _gene_list = sorted(list(gene_metadata.keys())) - gene_metadata = pd.DataFrame({"genes": _gene_list}, index=_gene_list) - elif isinstance(gene_metadata, str): - gene_metadata = pd.read_csv(gene_metadata, index=True, header=True) - - gene_metadata["genes_index"] = gene_metadata.index.tolist() - - if not isinstance(gene_metadata, pd.DataFrame): - raise TypeError("'gene_metadata' must be a pandas dataframe.") - - if len(gene_metadata.index.unique()) != len(gene_metadata.index.tolist()): - raise ValueError("'gene_metadata' must contain a unique index.") - - if num_genes is None: - num_genes = len(gene_metadata) - - # Create the gene metadata tiledb - if not skip_gene_tiledb: - _col_types = {} - for col in gene_metadata.columns: - _col_types[col] = "ascii" - - _gene_output_uri = f"{output_path}/gene_metadata" - generate_metadata_tiledb_frame( - _gene_output_uri, gene_metadata, column_types=_col_types - ) - - if optimize_tiledb: - uta.optimize_tiledb_array(_gene_output_uri) - - if cell_metadata is None: - if num_cells is None: - warnings.warn( - "Scanning all files to compute cell counts, this may take long", - UserWarning, - ) - cell_counts = uad.scan_for_cellcounts(files, num_threads=num_threads) - num_cells = sum(cell_counts) - - cell_metadata = pd.DataFrame({"cell_index": [x for x in range(num_cells)]}) - - if isinstance(cell_metadata, str): - warnings.warn( - "Scanning 'cell_metadata' to count number of cells, this may take long", - UserWarning, - ) - with open(cell_metadata) as fp: - count = 0 - for _ in fp: - count += 1 - - num_cells = count - 1 # removing 1 for the header line - elif isinstance(cell_metadata, pd.DataFrame): - num_cells = len(cell_metadata) - - if num_cells is None: - raise ValueError( - "Cannot determine 'num_cells', we recommend setting this parameter." - ) - - # Create the cell metadata tiledb - if not skip_cell_tiledb: - _cell_output_uri = f"{output_path}/cell_metadata" - - if isinstance(cell_metadata, str): - _cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True) - generate_metadata_tiledb_csv( - _cell_output_uri, cell_metadata, _cell_metaframe.columns - ) - elif isinstance(cell_metadata, pd.DataFrame): - _col_types = {} - for col in gene_metadata.columns: - _col_types[col] = "ascii" - - _to_write = gene_metadata.astype(str) - - generate_metadata_tiledb_frame( - _cell_output_uri, _to_write, column_types=_col_types - ) - - if optimize_tiledb: - uta.optimize_tiledb_array(_cell_output_uri) - - # create the counts metadata - if not skip_matrix_tiledb: - gene_idx = gene_metadata.index.tolist() - gene_set = {} - for i, x in enumerate(gene_idx): - gene_set[x] = i - - _counts_uri = f"{output_path}/{layer_matrix_name}" - uta.create_tiledb_array( - _counts_uri, - num_cells=num_cells, - num_genes=num_genes, - matrix_attr_name=layer_matrix_name, - x_dim_dtype=cell_dim_dtype, - y_dim_dtype=gene_dim_dtype, - matrix_dim_dtype=matrix_dim_dtype, - ) - - offset = 0 - for fd in files: - mat = uad.remap_anndata( - fd, - gene_set, - var_gene_column=var_gene_column, - layer_matrix_name=layer_matrix_name, - ) - uta.write_csr_matrix_to_tiledb( - _counts_uri, matrix=mat, row_offset=offset, value_dtype=matrix_dim_dtype - ) - offset += int(mat.shape[0]) - - if optimize_tiledb: - uta.optimize_tiledb_array(_counts_uri) - - return CellArrDataset(dataset_path=output_path, counts_tdb_uri=layer_matrix_name) - - -def generate_metadata_tiledb_frame( - output_uri: str, input: pd.DataFrame, column_types: dict = None -): - """Generate metadata tiledb from a :pu:class:`~pandas.DataFrame`. - - Args: - output_uri: - TileDB URI or path to save the file. - - input: - Input dataframe. - - column_types: - You can specify type of each column name to cast into. - "ascii" or str works best for most scenarios. - - Defaults to None. - """ - _to_write = input.astype(str) - utf.create_tiledb_frame_from_dataframe( - output_uri, _to_write, column_types=column_types - ) - - -def generate_metadata_tiledb_csv( - output_uri: str, - input: str, - column_dtype=str, - chunksize=1000, -): - """Generate a metadata tiledb from csv. - - The difference between this and ``generate_metadata_tiledb_frame`` - is when the csv is super large and it won't fit into memory. - - Args: - output_uri: - TileDB URI or path to save the file. - - input: - Path to the csv file. The first row is expected to - contain the column names. - - column_dtype: - Dtype of the columns. - Defaults to str. - - chunksize: - Chunk size to read the dataframe. - Defaults to 1000. - """ - chunksize = 1000 - initfile = True - offset = 0 - - for chunk in pd.read_csv(input, chunksize=chunksize, header=True): - if initfile: - utf.create_tiledb_frame_from_column_names( - output_uri, chunk.columns, column_dtype - ) - initfile = False - - _to_write = chunk.astype(str) - utf.append_to_tiledb_frame(output_uri, _to_write, offset) - offset += len(chunk) diff --git a/src/cellarr/build_options.py b/src/cellarr/build_options.py new file mode 100644 index 0000000..94ab665 --- /dev/null +++ b/src/cellarr/build_options.py @@ -0,0 +1,125 @@ +import numpy as np + +from dataclasses import dataclass + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@dataclass +class CellMetadataOptions: + """Optional arguments for the ``cell_metadata`` store for + :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`. + + Attributes: + skip: + Whether to skip generating cell metadata tiledb. + Defaults to False. + + dtype: + NumPy dtype for the cell dimension. + Defaults to np.uint32. + + Note: make sure the number of cells fit + within the integer limits of unsigned-int32. + + tiledb_store_name: + Name of the tiledb file. + Defaults to "cell_metadata". + """ + + skip: bool = False + dtype: np.dtype = np.uint32 + tiledb_store_name: str = "cell_metadata" + + +@dataclass +class GeneAnnotationOptions: + """Optional arguments for the ``gene_annotation`` store for + :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`. + + Attributes: + feature_column: + Column in ``var`` containing the feature ids (e.g. gene symbols). + Defaults to the index of the ``var`` slot. + + skip: + Whether to skip generating gene annotation tiledb. + Defaults to False. + + dtype: + NumPy dtype for the gene dimension. + Defaults to np.uint32. + + Note: make sure the number of genes fit + within the integer limits of unsigned-int32. + + tiledb_store_name: + Name of the tiledb file. + Defaults to "gene_annotation". + """ + + skip: bool = False + feature_column: str = "index" + dtype: np.dtype = np.uint32 + tiledb_store_name: str = "gene_annotation" + + +@dataclass +class MatrixOptions: + """Optional arguments for the ``matrix`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`. + + Attributes: + matrix_name: + Matrix name from ``layers`` slot to add to tiledb. + Must be consistent across all objects in ``files``. + + Defaults to "counts". + + skip: + Whether to skip generating matrix tiledb. + Defaults to False. + + dtype: + NumPy dtype for the values in the matrix. + Defaults to np.uint16. + + Note: make sure the matrix values fit + within the range limits of unsigned-int16. + + tiledb_store_name: + Name of the tiledb file. + Defaults to `matrix`. + """ + + skip: bool = False + matrix_name: str = "counts" + dtype: np.dtype = np.uint16 + tiledb_store_name: str = "counts" + + +@dataclass +class SampleMetadataOptions: + """Optional arguments for the ``sample`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`. + + Attributes: + skip: + Whether to skip generating sample tiledb. + Defaults to False. + + dtype: + NumPy dtype for the sample dimension. + Defaults to np.uint32. + + Note: make sure the number of samples fit + within the integer limits of unsigned-int32. + + tiledb_store_name: + Name of the tiledb file. + Defaults to "sample_metadata". + """ + + skip: bool = False + dtype: np.dtype = np.uint32 + tiledb_store_name: str = "sample_metadata" diff --git a/src/cellarr/buildutils_cellarrdataset.py b/src/cellarr/buildutils_cellarrdataset.py new file mode 100644 index 0000000..61b0b9f --- /dev/null +++ b/src/cellarr/buildutils_cellarrdataset.py @@ -0,0 +1,476 @@ +"""Build the `CellArrDatset`. + +The `CellArrDataset` method is designed to store single-cell RNA-seq +datasets but can be generalized to store any 2-dimensional experimental data. + +This method creates three TileDB files in the directory specified by `output_path`: + +- `cell_metadata`: A TileDB file containing cell metadata. +- `gene_annotation`: A TileDB file containing feature/gene annotations. +- A matrix TileDB file named by the `layer_matrix_name` parameter. +- `sample_metadata`: A TileDB file containing sample metadata. + +The TileDB matrix file is stored in a ``cell X gene`` orientation. This orientation +is chosen because the fastest-changing dimension as new files are added to the +collection is usually the cells rather than genes. + +Process: + +1. **Scan the Collection**: Scan the entire collection of files to create +a unique set of feature ids (e.g. gene symbols). Store this set as the +`gene_annotation` TileDB file. + +2. **Store Cell Metadata**: Store cell metadata in the `cell_metadata` +TileDB file. + +3. **Remap and Orient Data**: For each dataset in the collection, +remap and orient the feature dimension using the feature set from Step 1. +This step ensures consistency in gene measurement and order, even if +some genes are unmeasured or ordered differently in the original experiments. + +4. **Sample Metadata**: Store sample metadata in `sample_metadata` +TileDB file. + +Example: + + .. code-block:: python + + import anndata + import numpy as np + import tempfile + from cellarr import build_cellarrdataset, CellArrDataset + + # Create a temporary directory + tempdir = tempfile.mkdtemp() + + # Read AnnData objects + adata1 = anndata.read_h5ad("path/to/object1.h5ad") + # or just provide the path + adata2 = "path/to/object2.h5ad" + + # Build CellArrDataset + dataset = build_cellarrdataset( + output_path=tempdir, + files=[adata1, adata2], + matrix_dim_dtype=np.float32 + ) +""" + +import os +import warnings +from typing import List, Union + +import anndata +import pandas as pd + +from . import utils_anndata as uad +from . import buildutils_tiledb_array as uta +from . import buildutils_tiledb_frame as utf +from .CellArrDataset import CellArrDataset +from . import build_options as bopt + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +# TODO: Accept files as a dictionary with names to each dataset. +def build_cellarrdataset( + files: List[Union[str, anndata.AnnData]], + output_path: str, + gene_annotation: Union[List[str], str, pd.DataFrame] = None, + sample_metadata: Union[pd.DataFrame, str] = None, + cell_metadata: Union[pd.DataFrame, str] = None, + sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(), + cell_metadata_options: bopt.CellMetadataOptions = bopt.CellMetadataOptions(), + gene_annotation_options: bopt.GeneAnnotationOptions = bopt.GeneAnnotationOptions(), + matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(), + optimize_tiledb: bool = True, + num_threads: int = 1, +): + """Generate the `CellArrDataset`. + + All files are expected to be consistent and any modifications + to make them consistent is outside the scope of this function + and package. + + There's a few assumptions this process makes: + - If object in ``files`` is an :py:class:`~anndata.AnnData` + or H5AD object, these must contain an assay matrix in layer + names as ``layer_matrix_name`` parameter. + - Feature information must contain a column defined by + ``feature_column`` in the + :py:class:`~cellarr.build_options.GeneAnnotationOptions.` that + contains feature ids or gene symbols across all files. + - If no ``cell_metadata`` is provided, we scan to count the number of cells + and create a simple range index. + + Args: + files: + List of file paths to `H5AD` or ``AnnData`` objects. + + output_path: + Path to where the output tiledb files should be stored. + + gene_metadata: + A :py:class:`~pandas.DataFrame` containing the feature/gene + annotations across all objects. + + Alternatively, may provide a path to the file containing + a concatenated gene annotations across all datasets. + In this case, the first row is expected to contain the + column names and an index column containing the feature ids + or gene symbols. + + Alternatively, a list or a dictionary of gene symbols. + + Irrespective of the input, the object will be appended + with a ``cellarr_gene_index`` column that contains the + store the gene index across all objects. + + Defaults to None, then a gene set is generated by scanning all + objects in ``files``. + + sample_metadata: + A :py:class:`~pandas.DataFrame` containing the sample + metadata for each file in ``files``. Hences the number of rows + in the dataframe must match the number of files. + + Alternatively, may provide path to the file containing a + concatenated sample metadata across all cells. In this case, + the first row is expected to contain the column names. + + Additionally, the order of rows is expected to be in the same + order as the input list of ``files``. + + Irrespective of the input, this object is appended with a + ``cellarr_original_gene_list`` column that contains the original + set of feature ids (or gene symbols) from the dataset to + differentiate between zero-expressed vs unmeasured genes. + + Defaults to `None`, in which case, we create a simple sample + metadata dataframe containing the list of datasets. + Each dataset is named as ``sample_{i}`` where `i` refers to + the index position of the object in ``files``. + + cell_metadata: + A :py:class:`~pandas.DataFrame` containing the cell + metadata for cells across ``files``. Hences the number of rows + in the dataframe must match the number of cells across + all files. + + Alternatively, may provide path to the file containing a + concatenated cell metadata across all cells. In this case, + the first row is expected to contain the column names. + + Additionally, the order of cells is expected to be in the same + order as the input list of ``files``. If the input is a path, + the file is expected to contain mappings between cells and + datasets (or samples). + + Defaults to None, we scan all files to count the number of cells, + then create a simple cell metadata DataFrame containing mappings from + cells to their associated datasets. Each dataset is named as + ``sample_{i}`` where `i` refers to the index position of + the object in ``files``. + + sample_metadata_options: + Optional parameters when generating ``sample_metadata`` store. + + cell_metadata_options: + Optional parameters when generating ``cell_metadata`` store. + + gene_annotation_options: + Optional parameters when generating ``gene_annotation`` store. + + matrix_options: + Optional parameters when generating ``matrix`` store. + + optimize_tiledb: + Whether to run TileDb's vaccum and consolidation (may take long). + + num_threads: + Number of threads. + Defaults to 1. + """ + if not os.path.isdir(output_path): + raise ValueError("'output_path' must be a directory.") + + #### + ## Writing gene annotation file + #### + if gene_annotation is None: + warnings.warn( + "Scanning all files for feature ids (e.g. gene symbols), this may take long", + UserWarning, + ) + gene_set = uad.scan_for_features( + files, + var_feature_column=gene_annotation_options.feature_column, + num_threads=num_threads, + ) + + gene_set = sorted(gene_set) + + gene_annotation = pd.DataFrame({"cellarr_gene_index": gene_set}, index=gene_set) + elif isinstance(gene_annotation, list): + _gene_list = sorted(list(set(gene_annotation))) + gene_annotation = pd.DataFrame( + {"cellarr_gene_index": _gene_list}, index=_gene_list + ) + elif isinstance(gene_annotation, str): + gene_annotation = pd.read_csv(gene_annotation, index=True, header=True) + gene_annotation["cellarr_gene_index"] = gene_annotation.index.tolist() + else: + raise TypeError("'gene_annotation' is not an expected type.") + + if not isinstance(gene_annotation, pd.DataFrame): + raise TypeError("'gene_annotation' must be a pandas dataframe.") + + if len(gene_annotation.index.unique()) != len(gene_annotation.index.tolist()): + raise ValueError( + "'gene_annotation' must contain unique feature ids or gene symbols." + ) + + gene_annotation.reset_index(drop=True, inplace=True) + + # Create the gene annotation tiledb + if not gene_annotation_options.skip: + _col_types = {} + for col in gene_annotation.columns: + _col_types[col] = "ascii" + + _gene_output_uri = f"{output_path}/{gene_annotation_options.tiledb_store_name}" + generate_metadata_tiledb_frame( + _gene_output_uri, gene_annotation, column_types=_col_types + ) + + if optimize_tiledb: + uta.optimize_tiledb_array(_gene_output_uri) + + #### + ## Writing the sample metadata file + #### + _samples = [] + for idx, _ in enumerate(files): + _samples.append(f"sample_{idx}") + if sample_metadata is None: + warnings.warn( + "Sample metadata is not provided, each dataset in 'files' is considered a sample", + UserWarning, + ) + + sample_metadata = pd.DataFrame({"cellarr_sample": _samples}) + elif isinstance(sample_metadata, str): + sample_metadata = pd.read_csv(sample_metadata, header=True) + sample_metadata["cellarr_sample"] = _samples + else: + raise TypeError("'sample_metadata' is not an expected type.") + + if not sample_metadata_options.skip: + warnings.warn( + "Scanning all files for feature ids (e.g. gene symbols), this may take long", + UserWarning, + ) + gene_scan_set = uad.scan_for_features( + files, + var_feature_column=gene_annotation_options.feature_column, + num_threads=num_threads, + unique=False, + ) + gene_set_str = [",".join(x) for x in gene_scan_set] + sample_metadata["cellarr_original_gene_set"] = gene_set_str + + _col_types = {} + for col in sample_metadata.columns: + _col_types[col] = "ascii" + + _sample_output_uri = ( + f"{output_path}/{sample_metadata_options.tiledb_store_name}" + ) + generate_metadata_tiledb_frame( + _sample_output_uri, sample_metadata, column_types=_col_types + ) + + if optimize_tiledb: + uta.optimize_tiledb_array(_sample_output_uri) + + #### + ## Writing the cell metadata file + #### + warnings.warn( + "Scanning all files to compute cell counts, this may take long", + UserWarning, + ) + cell_counts = uad.scan_for_cellcounts(files, num_threads=num_threads) + _cellindex_in_dataset = [] + _dataset = [] + for idx, cci in enumerate(cell_counts): + _cellindex_in_dataset.extend([x for x in range(cci)]) + _dataset.extend([f"dataset_{idx}" for _ in range(cci)]) + + if cell_metadata is None: + cell_metadata = pd.DataFrame( + {"cellarr_cell_counts": _cellindex_in_dataset, "cellarr_sample": _dataset} + ) + elif isinstance(cell_metadata, str): + warnings.warn( + "Scanning 'cell_metadata' csv file to count number of cells, this may take long", + UserWarning, + ) + with open(cell_metadata) as fp: + count = 0 + for _ in fp: + count += 1 + + if sum(cell_counts) != count - 1: + raise ValueError( + "Number of rows in the 'cell_metadata' csv does not match the number of cells across files." + ) + + warnings.warn( + "'cell_metadata' csv file is expected to contain mapping between cells and samples", + UserWarning, + ) + elif isinstance(cell_metadata, pd.DataFrame): + if sum(cell_counts) != len(cell_metadata): + raise ValueError( + "Number of rows in 'cell_metadata' does not match the number of cells across files." + ) + + cell_metadata["cellarr_sample"] = _dataset + + # Create the cell metadata tiledb + if not cell_metadata_options.skip: + _cell_output_uri = f"{output_path}/cell_metadata" + + if isinstance(cell_metadata, str): + _cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True) + generate_metadata_tiledb_csv( + _cell_output_uri, cell_metadata, _cell_metaframe.columns + ) + elif isinstance(cell_metadata, pd.DataFrame): + _col_types = {} + for col in cell_metadata.columns: + _col_types[col] = "ascii" + + _to_write = cell_metadata.astype(str) + + generate_metadata_tiledb_frame( + _cell_output_uri, _to_write, column_types=_col_types + ) + + if optimize_tiledb: + uta.optimize_tiledb_array(_cell_output_uri) + + #### + ## Writing the matrix file + #### + if not matrix_options.skip: + gene_idx = gene_annotation["cellarr_gene_index"].tolist() + gene_set = {} + for i, x in enumerate(gene_idx): + gene_set[x] = i + + _counts_uri = f"{output_path}/{matrix_options.tiledb_store_name}" + uta.create_tiledb_array( + _counts_uri, + matrix_attr_name=matrix_options.matrix_name, + x_dim_dtype=cell_metadata_options.dtype, + y_dim_dtype=gene_annotation_options.dtype, + matrix_dim_dtype=matrix_options.dtype, + ) + + offset = 0 + for fd in files: + mat = uad.remap_anndata( + fd, + gene_set, + var_feature_column=gene_annotation_options.feature_column, + layer_matrix_name=matrix_options.matrix_name, + ) + uta.write_csr_matrix_to_tiledb( + _counts_uri, + matrix=mat, + row_offset=offset, + value_dtype=matrix_options.dtype, + ) + offset += int(mat.shape[0]) + + if optimize_tiledb: + uta.optimize_tiledb_array(_counts_uri) + + return CellArrDataset( + dataset_path=output_path, + sample_metadata_uri=sample_metadata_options.tiledb_store_name, + cell_metadata_uri=cell_metadata_options.tiledb_store_name, + gene_annotation_uri=gene_annotation_options.tiledb_store_name, + matrix_tdb_uri=matrix_options.tiledb_store_name, + ) + + +def generate_metadata_tiledb_frame( + output_uri: str, input: pd.DataFrame, column_types: dict = None +): + """Generate metadata tiledb from a :pu:class:`~pandas.DataFrame`. + + Args: + output_uri: + TileDB URI or path to save the file. + + input: + Input dataframe. + + column_types: + You can specify type of each column name to cast into. + "ascii" or str works best for most scenarios. + + Defaults to None. + """ + _to_write = input.astype(str) + utf.create_tiledb_frame_from_dataframe( + output_uri, _to_write, column_types=column_types + ) + + +def generate_metadata_tiledb_csv( + output_uri: str, + input: str, + column_dtype=str, + chunksize=1000, +): + """Generate a metadata tiledb from csv. + + The difference between this and ``generate_metadata_tiledb_frame`` + is when the csv is super large and it won't fit into memory. + + Args: + output_uri: + TileDB URI or path to save the file. + + input: + Path to the csv file. The first row is expected to + contain the column names. + + column_dtype: + Dtype of the columns. + Defaults to str. + + chunksize: + Chunk size to read the dataframe. + Defaults to 1000. + """ + chunksize = 1000 + initfile = True + offset = 0 + + for chunk in pd.read_csv(input, chunksize=chunksize, header=True): + if initfile: + utf.create_tiledb_frame_from_column_names( + output_uri, chunk.columns, column_dtype + ) + initfile = False + + _to_write = chunk.astype(str) + utf.append_to_tiledb_frame(output_uri, _to_write, offset) + offset += len(chunk) diff --git a/src/cellarr/utils_tiledb_array.py b/src/cellarr/buildutils_tiledb_array.py similarity index 83% rename from src/cellarr/utils_tiledb_array.py rename to src/cellarr/buildutils_tiledb_array.py index f967616..383d424 100644 --- a/src/cellarr/utils_tiledb_array.py +++ b/src/cellarr/buildutils_tiledb_array.py @@ -13,8 +13,8 @@ def create_tiledb_array( tiledb_uri_path: str, - num_cells: int, - num_genes: int, + x_dim_length: int = None, + y_dim_length: int = None, x_dim_name: str = "cell_index", y_dim_name: str = "gene_index", matrix_attr_name: str = "counts", @@ -32,11 +32,17 @@ def create_tiledb_array( tiledb_uri_path: Path to create the array tiledb file. - num_cells: - Number of cells (x/fastest-changing dimension). + x_dim_length: + Number of entries along the x/fastest-changing dimension. + e.g. Number of cells. + Defaults to None, in which case, the max integer value of + ``x_dim_dtype`` is used. - num_genes: - Number fo genes (y dimension). + y_dim_length: + Number of entries along the y dimension. + e.g. Number of genes. + Defaults to None, in which case, the max integer value of + ``y_dim_dtype`` is used. x_dim_name: Name for the x-dimension. @@ -67,8 +73,14 @@ def create_tiledb_array( Defaults to True. """ - xdim = tiledb.Dim(name=x_dim_name, domain=(0, num_cells - 1), dtype=x_dim_dtype) - ydim = tiledb.Dim(name=y_dim_name, domain=(0, num_genes - 1), dtype=y_dim_dtype) + if x_dim_length is None: + x_dim_length = np.iinfo(x_dim_dtype).max + + if y_dim_length is None: + y_dim_length = np.iinfo(y_dim_dtype).max + + xdim = tiledb.Dim(name=x_dim_name, domain=(0, x_dim_length - 1), dtype=x_dim_dtype) + ydim = tiledb.Dim(name=y_dim_name, domain=(0, y_dim_length - 1), dtype=y_dim_dtype) dom = tiledb.Domain(xdim, ydim) diff --git a/src/cellarr/utils_tiledb_frame.py b/src/cellarr/buildutils_tiledb_frame.py similarity index 100% rename from src/cellarr/utils_tiledb_frame.py rename to src/cellarr/buildutils_tiledb_frame.py diff --git a/src/cellarr/queryutils_tiledb_frame.py b/src/cellarr/queryutils_tiledb_frame.py new file mode 100644 index 0000000..50af7fc --- /dev/null +++ b/src/cellarr/queryutils_tiledb_frame.py @@ -0,0 +1,78 @@ +from functools import lru_cache +from typing import List, Union +from warnings import warn + +import pandas as pd +import tiledb + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +@lru_cache +def get_schema_names_frame(tiledb_obj: tiledb.Array) -> List[str]: + columns = [] + for i in range(tiledb_obj.schema.nattr): + columns.append(tiledb_obj.schema.attr(i).name) + + return columns + + +def subset_frame( + tiledb_obj: tiledb.Array, + subset: Union[slice, tiledb.QueryCondition], + columns: Union[str, list] = None, +) -> pd.DataFrame: + + _avail_columns = get_schema_names_frame(tiledb_obj) + + if columns is None: + columns = _avail_columns + else: + _not_avail = [] + for col in columns: + if col not in _avail_columns: + _not_avail.append(col) + + if len(_not_avail) > 0: + raise ValueError(f"Columns '{', '.join(_not_avail)}' are not available.") + + if isinstance(columns, str): + warn( + "provided subset is string, its expected to be a 'query_condition'", + UserWarning, + ) + + query = tiledb_obj.query(cond=subset, attrs=columns) + data = query.df[:] + else: + data = query.df[subset, columns] + result = data.dropna() + return result + + +def get_a_column(tiledb_obj: tiledb.Array, column_name: str) -> list: + if column_name not in get_schema_names_frame(tiledb_obj): + raise ValueError(f"Column '{column_name}' does not exist.") + + return tiledb_obj.query(attrs=[column_name]).df[:] + + +@lru_cache +def get_index(tiledb_obj: tiledb.Array) -> list: + _index = tiledb_obj.unique_dim_values("__tiledb_rows") + return [x.decode() for x in _index] + + +def _match_to_list(x: list, query: list): + return sorted([x.index(x) for x in query]) + + +def _is_list_strings(x: list): + _ret = False + + if isinstance(x, (list, tuple)) and all(isinstance(y, str) for y in x): + _ret = True + + return _ret diff --git a/src/cellarr/utils_anndata.py b/src/cellarr/utils_anndata.py index 5921e08..787748d 100644 --- a/src/cellarr/utils_anndata.py +++ b/src/cellarr/utils_anndata.py @@ -6,18 +6,19 @@ import numpy as np from scipy.sparse import coo_matrix, csr_array, csr_matrix -__author__ = "Jayaram Kancherla, Tony Kuo" +__author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" __license__ = "MIT" def remap_anndata( h5ad_or_adata: Union[str, anndata.AnnData], - gene_set_order: dict, - var_gene_column: str = "index", + feature_set_order: dict, + var_feature_column: str = "index", layer_matrix_name: str = "counts", ) -> csr_matrix: - """Extract and remap the count matrix to the provided gene set order from the :py:class:`~anndata.AnnData` object. + """Extract and remap the count matrix to the provided feature (gene) set order from the :py:class:`~anndata.AnnData` + object. Args: adata: @@ -25,16 +26,17 @@ def remap_anndata( Alternatively, may also provide a path to the H5ad file. - The index of the `var` slot must contain the gene symbols + The index of the `var` slot must contain the feature ids for the columns in the matrix. - gene_set_order: - A dictionary with the symbols as keys and their index as value. - The gene symbols from the ``AnnData`` object are remapped to the - gene order from this dictionary. + feature_set_order: + A dictionary with the feature ids as keys and their index as + value (e.g. gene symbols). The feature ids from the + ``AnnData`` object are remapped to the feature order from + this dictionary. - var_gene_column: - Column in ``var`` containing the symbols. + var_feature_column: + Column in ``var`` containing the feature ids (e.g. gene symbols). Defaults to the index of the ``var`` slot. layer_matrix_name: @@ -55,19 +57,19 @@ def remap_anndata( mat = adata.layers[layer_matrix_name] - if var_gene_column == "index": + if var_feature_column == "index": symbols = adata.var.index.tolist() else: - symbols = adata.var[var_gene_column].tolist() + symbols = adata.var[var_feature_column].tolist() - indices_to_keep = [i for i, x in enumerate(symbols) if x in gene_set_order] + indices_to_keep = [i for i, x in enumerate(symbols) if x in feature_set_order] symbols_to_keep = [symbols[i] for i in indices_to_keep] mat = mat[:, indices_to_keep].copy() indices_to_map = [] for x in symbols_to_keep: - indices_to_map.append(gene_set_order[x]) + indices_to_map.append(feature_set_order[x]) if isinstance(mat, np.ndarray): mat_coo = coo_matrix(mat) @@ -80,13 +82,13 @@ def remap_anndata( return coo_matrix( (mat_coo.data, (mat_coo.row, new_col)), - shape=(adata.shape[0], len(gene_set_order)), + shape=(adata.shape[0], len(feature_set_order)), ).tocsr() -def _get_genes_index( +def _get_feature_index( h5ad_or_adata: Union[str, anndata.AnnData], - var_gene_column: str = "index", + var_feature_column: str = "index", ) -> List[str]: if isinstance(h5ad_or_adata, str): adata = anndata.read_h5ad(h5ad_or_adata, backed=True) @@ -96,32 +98,33 @@ def _get_genes_index( adata = h5ad_or_adata - if var_gene_column == "index": + if var_feature_column == "index": symbols = adata.var.index.tolist() else: - symbols = adata.var[var_gene_column].tolist() + symbols = adata.var[var_feature_column].tolist() return symbols -def _wrapper_get_genes(args): +def _wrapper_get_feature_ids(args): file, gcol = args - return _get_genes_index(file, gcol) + return _get_feature_index(file, gcol) -def scan_for_genes( +def scan_for_features( h5ad_or_adata: List[Union[str, anndata.AnnData]], - var_gene_column: str = "index", + var_feature_column: str = "index", num_threads: int = 1, + unique: bool = True, ) -> List[str]: - """Extract and generate the list of unique genes identifiers across files. + """Extract and generate the list of unique feature identifiers across files. Args: h5ad_or_adata: List of anndata objects or path to h5ad files. - var_gene_column: - Column containing the gene symbols. + var_feature_column: + Column containing the feature ids (e.g. gene symbols). Defaults to "index". num_threads: @@ -129,12 +132,15 @@ def scan_for_genes( Defaults to 1. Returns: - List of all unique gene symbols across all files. + List of all unique feature ids across all files. """ with Pool(num_threads) as p: - _args = [(file_info, var_gene_column) for file_info in h5ad_or_adata] - all_symbols = p.map(_wrapper_get_genes, _args) - return list(set(itertools.chain.from_iterable(all_symbols))) + _args = [(file_info, var_feature_column) for file_info in h5ad_or_adata] + all_symbols = p.map(_wrapper_get_feature_ids, _args) + if unique: + return list(set(itertools.chain.from_iterable(all_symbols))) + + return all_symbols def _get_cellcounts(h5ad_or_adata: Union[str, anndata.AnnData]) -> int: diff --git a/tests/test_build.py b/tests/test_build.py index a09a23a..c6cabad 100644 --- a/tests/test_build.py +++ b/tests/test_build.py @@ -5,7 +5,7 @@ import pandas as pd import pytest import tiledb -from cellarr import build_cellarrdataset +from cellarr import build_cellarrdataset, MatrixOptions __author__ = "Jayaram Kancherla" __copyright__ = "Jayaram Kancherla" @@ -28,31 +28,44 @@ def generate_adata(n, d, k): return adata -def test_build_tiledb(): +def test_build_cellarrdataset(): tempdir = tempfile.mkdtemp() adata1 = generate_adata(1000, 100, 10) adata2 = generate_adata(100, 1000, 100) build_cellarrdataset( - output_path=tempdir, files=[adata1, adata2], matrix_dim_dtype=np.float32 + output_path=tempdir, + files=[adata1, adata2], + matrix_options=MatrixOptions(dtype=np.float32), ) cfp = tiledb.open(f"{tempdir}/counts", "r") - gfp = tiledb.open(f"{tempdir}/gene_metadata", "r") + gfp = tiledb.open(f"{tempdir}/gene_annotation", "r") genes = gfp.df[:] + print(genes) + gene_list = ["gene_1", "gene_95", "gene_50"] - gene_indices_tdb = sorted([genes.index.tolist().index(x) for x in gene_list]) + _genes_from_tile = genes["cellarr_gene_index"].tolist() + # print(_genes_from_tile) + gene_indices_tdb = sorted([_genes_from_tile.index(x) for x in gene_list]) + print(gene_indices_tdb) adata1_gene_indices = sorted( [adata1.var.index.tolist().index(x) for x in gene_list] ) + + print(adata1_gene_indices) adata2_gene_indices = sorted( [adata2.var.index.tolist().index(x) for x in gene_list] ) + print(adata2_gene_indices) + + print(cfp.multi_index[0, gene_indices_tdb]) + assert np.allclose( cfp.multi_index[0, gene_indices_tdb]["counts"], adata1.layers["counts"][0, adata1_gene_indices], diff --git a/tests/test_dataset.py b/tests/test_dataset.py deleted file mode 100644 index 1b99c5a..0000000 --- a/tests/test_dataset.py +++ /dev/null @@ -1,66 +0,0 @@ -import tempfile - -import anndata -import numpy as np -import pandas as pd -import pytest -import tiledb -from cellarr import CellArrDataset, build_cellarrdataset - -__author__ = "Jayaram Kancherla" -__copyright__ = "Jayaram Kancherla" -__license__ = "MIT" - - -def generate_adata(n, d, k): - np.random.seed(1) - - z = np.random.normal(loc=np.arange(k), scale=np.arange(k) * 2, size=(n, k)) - w = np.random.normal(size=(d, k)) - y = np.dot(z, w.T) - - gene_index = [f"gene_{i+1}" for i in range(d)] - var_df = pd.DataFrame({"names": gene_index}, index=gene_index) - obs_df = pd.DataFrame({"cells": [f"cell1_{j+1}" for j in range(n)]}) - - adata = anndata.AnnData(layers={"counts": y}, var=var_df, obs=obs_df) - - return adata - - -def test_build_tiledb(): - tempdir = tempfile.mkdtemp() - - adata1 = generate_adata(1000, 100, 10) - adata2 = generate_adata(100, 1000, 100) - - dataset = build_cellarrdataset( - output_path=tempdir, files=[adata1, adata2], matrix_dim_dtype=np.float32 - ) - - assert dataset is not None - assert isinstance(dataset, CellArrDataset) - - cfp = tiledb.open(f"{tempdir}/counts", "r") - gfp = tiledb.open(f"{tempdir}/gene_metadata", "r") - - genes = gfp.df[:] - - gene_list = ["gene_1", "gene_95", "gene_50"] - gene_indices_tdb = sorted([genes.index.tolist().index(x) for x in gene_list]) - - adata1_gene_indices = sorted( - [adata1.var.index.tolist().index(x) for x in gene_list] - ) - adata2_gene_indices = sorted( - [adata2.var.index.tolist().index(x) for x in gene_list] - ) - - assert np.allclose( - cfp.multi_index[0, gene_indices_tdb]["counts"], - adata1.layers["counts"][0, adata1_gene_indices], - ) - assert np.allclose( - cfp.multi_index[1000, gene_indices_tdb]["counts"], - adata2.layers["counts"][0, adata2_gene_indices], - ) diff --git a/tests/test_query.py b/tests/test_query.py new file mode 100644 index 0000000..4d8ce7d --- /dev/null +++ b/tests/test_query.py @@ -0,0 +1,59 @@ +# import tempfile + +# import anndata +# import numpy as np +# import pandas as pd +# import pytest +# import tiledb +# from cellarr import CellArrDataset, build_cellarrdataset + +# __author__ = "Jayaram Kancherla" +# __copyright__ = "Jayaram Kancherla" +# __license__ = "MIT" + + +# def generate_adata(n, d, k): +# np.random.seed(1) + +# z = np.random.normal(loc=np.arange(k), scale=np.arange(k) * 2, size=(n, k)) +# w = np.random.normal(size=(d, k)) +# y = np.dot(z, w.T) + +# gene_index = [f"gene_{i+1}" for i in range(d)] +# var_df = pd.DataFrame({"names": gene_index}, index=gene_index) +# obs_df = pd.DataFrame({"cells": [f"cell1_{j+1}" for j in range(n)]}) + +# adata = anndata.AnnData(layers={"counts": y}, var=var_df, obs=obs_df) + +# return adata + + +# def test_query_cellarrdataset(): +# tempdir = tempfile.mkdtemp() + +# adata1 = generate_adata(1000, 100, 10) +# adata2 = generate_adata(100, 1000, 100) + +# dataset = build_cellarrdataset( +# output_path=tempdir, files=[adata1, adata2], matrix_dim_dtype=np.float32 +# ) + +# assert dataset is not None +# assert isinstance(dataset, CellArrDataset) + +# cfp = tiledb.open(f"{tempdir}/counts", "r") +# gfp = tiledb.open(f"{tempdir}/gene_metadata", "r") + +# cd = CellArrDataset(dataset_path=tempdir) + +# gene_list = ["gene_1", "gene_95", "gene_50"] +# result1 = cd[0, gene_list] + +# assert np.allclose( +# cfp.multi_index[0, gene_indices_tdb]["counts"], +# adata1.layers["counts"][0, adata1_gene_indices], +# ) +# assert np.allclose( +# cfp.multi_index[1000, gene_indices_tdb]["counts"], +# adata2.layers["counts"][0, adata2_gene_indices], +# )