Rewriting the build process (#2)

Update documentation and tests
CellArr · Jun 19, 2024 · f2979b6 · f2979b6
1 parent b945bc0
commit f2979b6
Show file tree

Hide file tree

Showing 15 changed files with 1,008 additions and 596 deletions.
diff --git a/README.md b/README.md
@@ -1,15 +1,3 @@
-<!-- These are examples of badges you might want to add to your README:
-     please update the URLs accordingly
-
-[![Built Status](https://api.cirrus-ci.com/github/<USER>/cellarr.svg?branch=main)](https://cirrus-ci.com/github/<USER>/cellarr)
-[![ReadTheDocs](https://readthedocs.org/projects/cellarr/badge/?version=latest)](https://cellarr.readthedocs.io/en/stable/)
-[![Coveralls](https://img.shields.io/coveralls/github/<USER>/cellarr/main.svg)](https://coveralls.io/r/<USER>/cellarr)
-[![PyPI-Server](https://img.shields.io/pypi/v/cellarr.svg)](https://pypi.org/project/cellarr/)
-[![Conda-Forge](https://img.shields.io/conda/vn/conda-forge/cellarr.svg)](https://anaconda.org/conda-forge/cellarr)
-[![Monthly Downloads](https://pepy.tech/badge/cellarr/month)](https://pepy.tech/project/cellarr)
-[![Twitter](https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Twitter)](https://twitter.com/cellarr)
--->
-
 [![PyPI-Server](https://img.shields.io/pypi/v/cellarr.svg)](https://pypi.org/project/cellarr/)
 ![Unit tests](https://github.com/BiocPy/cellarr/actions/workflows/pypi-test.yml/badge.svg)
 
@@ -30,9 +18,9 @@ pip install cellarr
 
 ## Usage
 
-### Create a `CellArrDataset`
+### Build a `CellArrDataset`
 
-Creating a `CellArrDataset` generates three TileDB files in the specified output directory:
+Building a `CellArrDataset` generates three TileDB files in the specified output directory:
 
 - `gene_metadata`: Contains feature annotations.
 - `cell_metadata`: Contains cell or sample metadata.
@@ -67,11 +55,29 @@ dataset = build_cellarrdataset(
      matrix_dim_dtype=np.float32
 )
 ```
+
+The build process usually involves 3 steps:
+
+1. **Scan the Collection**: Scan the entire collection of files to create
+a unique set of gene symbols. Store this gene set as the
+`gene_metadata` TileDB file.
+2. **Store Cell Metadata**: Store cell metadata as the
+`cell_metadata` TileDB file.
+3. **Remap and Orient Data**: For each dataset in the collection,
+remap and orient the gene dimension using the gene set from Step 1.
+This step ensures consistency in gene measurement and order, even if
+some genes are unmeasured or ordered differently in the original experiments.
+
+***Note: The objects used to build the `CellArrDataset` are expected to be fairly consistent, especially along the feature dimension.
+If these are `AnnData` or `H5AD` objects, all objects must contain an index (in the `var` slot) specifying the gene symbols.***
+
+Check out the [documentation](https://biocpy.github.io/cellarr/tutorial.html) for more details.
+
 ----
 
-#### TODO: This following section does not work yet.
+#### TODO: The following section does not work yet
 
-Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializng it to the path where the files were created.
+Users can either reuse the `dataset` object returned when building the dataset or create a `CellArrDataset` object by initializing it with the path where the files were created.
 
 ```python
 # Create a CellArrDataset object from the existing dataset
@@ -86,4 +92,4 @@ expression_data = dataset[10, ["gene1", "gene10", "gene500"]]
 ## Note
 
 This project has been set up using PyScaffold 4.5. For details and usage
-information on PyScaffold see https://pyscaffold.org/.
+information on PyScaffold see <https://pyscaffold.org/>.
diff --git a/docs/index.md b/docs/index.md
@@ -16,11 +16,11 @@ pip install cellarr
 :maxdepth: 2
 
 Overview <tutorial>
-Contributions & Help <contributing>
-License <license>
+Module Reference <api/modules>
 Authors <authors>
 Changelog <changelog>
-Module Reference <api/modules>
+Contributions & Help <contributing>
+License <license>
 ```
 
 ## Indices and tables

diff --git a/docs/tutorial.md b/docs/tutorial.md
@@ -10,9 +10,9 @@ Cell Arrays is a Python package that provides a TileDB-backed store for large co
 
 ## Usage
 
-### Create the `CellArrDataset`
+### Build the `CellArrDataset`
 
-Creating a CellArrDataset generates three TileDB files in the specified output directory:
+Building a `CellArrDataset` generates three TileDB files in the specified output directory:
 
 - `gene_metadata`: Contains feature annotations.
 - `cell_metadata`: Contains cell or sample metadata.
@@ -43,11 +43,12 @@ dataset = build_cellarrdataset(
      matrix_dim_dtype=np.float32
 )
 ```
+
 ----
 
 #### TODO: This following section does not work yet.
 
-Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializng it to the path where the files were created.
+Users can either reuse the `dataset` object returned when building the dataset or create a `CellArrDataset` object by initializing it with the path where the files were created.
 
 ```python
 # Create a CellArrDataset object from the existing dataset

diff --git a/src/cellarr/CellArrDataset.py b/src/cellarr/CellArrDataset.py
@@ -1,111 +1,226 @@
 import os
-from typing import Union
+from typing import List, Union, Sequence
 
+import pandas as pd
 import tiledb
 
+from . import queryutils_tiledb_frame as qtd
+
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
 __license__ = "MIT"
 
 
 class CellArrDataset:
-    """A class that represent a collection of cells in TileDB."""
+    """A class that represent a collection of cells and their associated metadata in a TileDB backed store."""
 
     def __init__(
         self,
         dataset_path: str,
-        counts_tdb_uri: str = "counts",
-        gene_metadata_uri: str = "gene_metadata",
+        matrix_tdb_uri: str = "counts",
+        gene_annotation_uri: str = "gene_annotation",
         cell_metadata_uri: str = "cell_metadata",
+        sample_metadata_uri: str = "sample_metadata",
     ):
-        """Initialize a ``CellArr`` dataset.
+        """Initialize a ``CellArrDataset``.
 
         Args:
+            dataset_path:
+                Path to the directory containing the tiledb stores.
+                Usually the ``output_path`` from the
+                :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+
             counts_tdb_uri:
-                Path to counts TileDB.
+                Relative path to matrix store.
 
-            gene_metadata_uri:
-                Path to gene metadata TileDB.
+            gene_annotation_uri:
+                Relative path to gene annotation store.
 
             cell_metadata_uri:
-                Path to cell metadata TileDB.
+                Relative path to cell metadata store.
+
+            sample_metadata_uri:
+                Relative path to sample metadata store.
         """
 
         if not os.path.isdir(dataset_path):
             raise ValueError("'dataset_path' is not a directory.")
 
         self._dataset_path = dataset_path
         # TODO: Maybe switch to on-demand loading of these objects
-        self._counts_tdb_tdb = tiledb.open(f"{dataset_path}/{counts_tdb_uri}", "r")
-        self._gene_metadata_tdb = tiledb.open(
-            f"{dataset_path}/{gene_metadata_uri}", "r"
+        self._matrix_tdb_tdb = tiledb.open(f"{dataset_path}/{matrix_tdb_uri}", "r")
+        self._gene_annotation_tdb = tiledb.open(
+            f"{dataset_path}/{gene_annotation_uri}", "r"
         )
         self._cell_metadata_tdb = tiledb.open(
             f"{dataset_path}/{cell_metadata_uri}", "r"
         )
 
     def __del__(self):
-        self._counts_tdb_tdb.close()
-        self._gene_metadata_tdb.close()
+        self._matrix_tdb_tdb.close()
+        self._gene_annotation_tdb.close()
         self._cell_metadata_tdb.close()
 
-    # TODO:
-    # Methods to implement
-    # search by gene
-    # search by cell metadata
-    # slice counts after search
+    def get_cell_metadata_columns(self) -> List[str]:
+        """Get column names from ``cell_metadata`` store.
 
-    def get_cell_metadata_columns(self):
-        columns = []
-        for i in range(self._cell_metadata_tdb.schema.nattr):
-            columns.append(self._cell_metadata_tdb.schema.attr(i).name)
+        Returns:
+            List of available metadata columns.
+        """
+        return qtd.get_schema_names_frame(self._cell_metadata_tdb)
 
-        return columns
+    def get_cell_metadata_column(self, column_name: str) -> list:
+        """Access a column from the ``cell_metadata`` store.
 
-    def get_cell_metadata_column(self, column_name: str):
-        return self._cell_metadata_tdb.query(attrs=[column_name]).df[:]
+        Args:
+            column_name:
+                Name of the column or attribute. Usually one of the column names
+                from :py:meth:`~get_cell_metadata_columns`.
+
+        Returns:
+            A list of values for this column.
+        """
+        return qtd.get_a_column(self._cell_metadata_tdb, column_name=column_name)
 
     def get_cell_subset(
         self, subset: Union[slice, tiledb.QueryCondition], columns=None
-    ):
-        if columns is None:
-            columns = self.get_cell_metadata_columns()
+    ) -> pd.DataFrame:
+        """Slice the ``cell_metadata`` store.
+
+        Args:
+            subset:
+                A list of integer indices to subset the ``cell_metadata``
+                store.
+
+                Alternatively, may also provide a
+                :py:class:`tiledb.QueryCondition` to query the store.
 
-        query = self._cell_metadata_tdb.query(cond=subset, attrs=columns)
-        data = query.df[:]
-        result = data.dropna()
-        return result
+            columns:
+                List of specific column names to access.
 
-    def get_gene_metadata_columns(self):
-        columns = []
-        for i in range(self._gene_metadata_tdb.schema.nattr):
-            columns.append(self._gene_metadata_tdb.schema.attr(i).name)
+                Defaults to None, in which case all columns are extracted.
 
-        return columns
+        Returns:
+            A pandas DataFrame of the subset.
+        """
+        return qtd.subset_frame(self._cell_metadata_tdb, subset=subset, columns=columns)
+
+    def get_gene_metadata_columns(self) -> List[str]:
+        """Get annotation column names from ``gene_metadata`` store.
+
+        Returns:
+            List of available annotations.
+        """
+        return qtd.get_schema_names_frame(self._gene_annotation_tdb)
 
     def get_gene_metadata_column(self, column_name: str):
-        return self._gene_metadata_tdb.query(attrs=[column_name]).df[:]
+        """Access a column from the ``gene_metadata`` store.
+
+        Args:
+            column_name:
+                Name of the column or attribute. Usually one of the column names
+                from :py:meth:`~get_gene_metadata_columns`.
+
+        Returns:
+            A list of values for this column.
+        """
+        return qtd.get_a_column(self._gene_annotation_tdb, column_name=column_name)
+
+    def get_gene_metadata_index(self):
+        """Get index of the ``gene_metadata`` store. This typically should store all unique gene symbols.
+
+        Returns:
+            List of unique symbols.
+        """
+        return qtd.get_index(self._gene_annotation_tdb)
+
+    def _get_indices_for_gene_list(self, query: list) -> List[int]:
+        _gene_index = self.get_gene_metadata_index()
+        return qtd._match_to_list(_gene_index, query=query)
 
     def get_gene_subset(
-        self, subset: Union[slice, tiledb.QueryCondition], columns=None
+        self, subset: Union[slice, List[str], tiledb.QueryCondition], columns=None
     ):
-        if columns is None:
-            columns = self.get_gene_metadata_columns()
+        """Slice the ``gene_metadata`` store.
+
+        Args:
+            subset:
+                A list of integer indices to subset the ``gene_metadata``
+                store.
+
+                Alternatively, may provide a
+                :py:class:`tiledb.QueryCondition` to query the store.
+
+                Alternatively, may provide a list of strings to match with
+                the index of ``gene_metadata`` store.
+
+            columns:
+                List of specific column names to access.
+
+                Defaults to None, in which case all columns are extracted.
+
+        Returns:
+            A pandas DataFrame of the subset.
+        """
+
+        if qtd._is_list_strings(subset):
+            subset = self._get_indices_for_gene_list(subset)
 
-        query = self._gene_metadata_tdb.query(cond=subset, attrs=columns)
-        data = query.df[:]
-        result = data.dropna()
-        return result
+        return qtd.subset_frame(
+            self._gene_annotation_tdb, subset=subset, columns=columns
+        )
 
     def get_slice(
         self,
         cell_subset: Union[slice, tiledb.QueryCondition],
-        gene_subset: Union[slice, tiledb.QueryCondition],
+        gene_subset: Union[slice, List[str], tiledb.QueryCondition],
     ):
         _csubset = self.get_cell_subset(cell_subset)
         _cell_indices = _csubset.index.tolist()
 
         _gsubset = self.get_gene_subset(gene_subset)
         _gene_indices = _gsubset.index.tolist()
 
-        return self._counts_tdb_tdb.multi_index[_cell_indices, _gene_indices]
+        return self._matrix_tdb_tdb.multi_index[_cell_indices, _gene_indices]
+
+    def __getitem__(
+        self,
+        args: Union[int, str, Sequence, tuple],
+    ):
+        """Subset a ``CellArrDataset``.
+
+        Args:
+            args:
+                Integer indices, a boolean filter, or (if the current object is
+                named) names specifying the ranges to be extracted.
+
+                Alternatively a tuple of length 1. The first entry specifies
+                the rows to retain based on their names or indices.
+
+                Alternatively a tuple of length 2. The first entry specifies
+                the rows to retain, while the second entry specifies the
+                columns to retain, based on their names or indices.
+
+        Raises:
+            ValueError:
+                If too many or too few slices provided.
+        """
+        if isinstance(args, (str, int)):
+            return self.get_slice(args, slice(None))
+
+        if isinstance(args, tuple):
+            if len(args) == 0:
+                raise ValueError("At least one slicing argument must be provided.")
+
+            if len(args) == 1:
+                return self.get_slice(args[0], slice(None))
+            elif len(args) == 2:
+                return self.get_slice(args[0], args[1])
+            else:
+                raise ValueError(
+                    f"`{type(self).__name__}` only supports 2-dimensional slicing."
+                )
+
+        raise TypeError(
+            "args must be a sequence or a scalar integer or string or a tuple of atmost 2 values."
+        )
diff --git a/src/cellarr/__init__.py b/src/cellarr/__init__.py
@@ -15,5 +15,6 @@
 finally:
     del version, PackageNotFoundError
 
-from .build_cellarrdataset import build_cellarrdataset
+from .build_options import CellMetadataOptions, GeneAnnotationOptions, MatrixOptions, SampleMetadataOptions
+from .buildutils_cellarrdataset import build_cellarrdataset
 from .CellArrDataset import CellArrDataset