From f2979b6ed734a7c1dccb83fef3f73bcc0c8edffe Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla <jayaram.kancherla@gmail.com>
Date: Tue, 18 Jun 2024 23:38:38 -0700
Subject: [PATCH] Rewriting the build process (#2)

Update documentation and tests
---
 README.md                                     |  40 +-
 docs/index.md                                 |   6 +-
 docs/tutorial.md                              |   7 +-
 src/cellarr/CellArrDataset.py                 | 211 ++++++--
 src/cellarr/__init__.py                       |   3 +-
 src/cellarr/build_cellarrdataset.py           | 414 ---------------
 src/cellarr/build_options.py                  | 125 +++++
 src/cellarr/buildutils_cellarrdataset.py      | 476 ++++++++++++++++++
 ...db_array.py => buildutils_tiledb_array.py} |  28 +-
 ...db_frame.py => buildutils_tiledb_frame.py} |   0
 src/cellarr/queryutils_tiledb_frame.py        |  78 +++
 src/cellarr/utils_anndata.py                  |  68 +--
 tests/test_build.py                           |  23 +-
 tests/test_dataset.py                         |  66 ---
 tests/test_query.py                           |  59 +++
 15 files changed, 1008 insertions(+), 596 deletions(-)
 delete mode 100644 src/cellarr/build_cellarrdataset.py
 create mode 100644 src/cellarr/build_options.py
 create mode 100644 src/cellarr/buildutils_cellarrdataset.py
 rename src/cellarr/{utils_tiledb_array.py => buildutils_tiledb_array.py} (83%)
 rename src/cellarr/{utils_tiledb_frame.py => buildutils_tiledb_frame.py} (100%)
 create mode 100644 src/cellarr/queryutils_tiledb_frame.py
 delete mode 100644 tests/test_dataset.py
 create mode 100644 tests/test_query.py
diff --git a/README.md b/README.md
index 3a147fa..ef63b65 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,3 @@
-<!-- These are examples of badges you might want to add to your README:
-     please update the URLs accordingly
-
-[![Built Status](https://api.cirrus-ci.com/github/<USER>/cellarr.svg?branch=main)](https://cirrus-ci.com/github/<USER>/cellarr)
-[![ReadTheDocs](https://readthedocs.org/projects/cellarr/badge/?version=latest)](https://cellarr.readthedocs.io/en/stable/)
-[![Coveralls](https://img.shields.io/coveralls/github/<USER>/cellarr/main.svg)](https://coveralls.io/r/<USER>/cellarr)
-[![PyPI-Server](https://img.shields.io/pypi/v/cellarr.svg)](https://pypi.org/project/cellarr/)
-[![Conda-Forge](https://img.shields.io/conda/vn/conda-forge/cellarr.svg)](https://anaconda.org/conda-forge/cellarr)
-[![Monthly Downloads](https://pepy.tech/badge/cellarr/month)](https://pepy.tech/project/cellarr)
-[![Twitter](https://img.shields.io/twitter/url/http/shields.io.svg?style=social&label=Twitter)](https://twitter.com/cellarr)
--->
-
 [![PyPI-Server](https://img.shields.io/pypi/v/cellarr.svg)](https://pypi.org/project/cellarr/)
 ![Unit tests](https://github.com/BiocPy/cellarr/actions/workflows/pypi-test.yml/badge.svg)
 
@@ -30,9 +18,9 @@ pip install cellarr
 
 ## Usage
 
-### Create a `CellArrDataset`
+### Build a `CellArrDataset`
 
-Creating a `CellArrDataset` generates three TileDB files in the specified output directory:
+Building a `CellArrDataset` generates three TileDB files in the specified output directory:
 
 - `gene_metadata`: Contains feature annotations.
 - `cell_metadata`: Contains cell or sample metadata.
@@ -67,11 +55,29 @@ dataset = build_cellarrdataset(
      matrix_dim_dtype=np.float32
 )
 ```
+
+The build process usually involves 3 steps:
+
+1. **Scan the Collection**: Scan the entire collection of files to create
+a unique set of gene symbols. Store this gene set as the
+`gene_metadata` TileDB file.
+2. **Store Cell Metadata**: Store cell metadata as the
+`cell_metadata` TileDB file.
+3. **Remap and Orient Data**: For each dataset in the collection,
+remap and orient the gene dimension using the gene set from Step 1.
+This step ensures consistency in gene measurement and order, even if
+some genes are unmeasured or ordered differently in the original experiments.
+
+***Note: The objects to build the `CellArrDataset` are expected to be fairly consistent, especially along the feature dimension.
+if these are `AnnData` or `H5AD`objects, all objects must contain an index (in the `var` slot) specifying the gene symbols.***
+
+Check out the [documentation](https://biocpy.github.io/cellarr/tutorial.html) for more details.
+
 ----
 
-#### TODO: This following section does not work yet.
+#### TODO: This following section does not work yet
 
-Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializng it to the path where the files were created.
+Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializing it to the path where the files were created.
 
 ```python
 # Create a CellArrDataset object from the existing dataset
@@ -86,4 +92,4 @@ expression_data = dataset[10, ["gene1", "gene10", "gene500"]]
 ## Note
 
 This project has been set up using PyScaffold 4.5. For details and usage
-information on PyScaffold see https://pyscaffold.org/.
+information on PyScaffold see <https://pyscaffold.org/>.
diff --git a/docs/index.md b/docs/index.md
index 7283ccf..8f9872f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -16,11 +16,11 @@ pip install cellarr
 :maxdepth: 2
 
 Overview <tutorial>
-Contributions & Help <contributing>
-License <license>
+Module Reference <api/modules>
 Authors <authors>
 Changelog <changelog>
-Module Reference <api/modules>
+Contributions & Help <contributing>
+License <license>
 ```
 
 ## Indices and tables
diff --git a/docs/tutorial.md b/docs/tutorial.md
index 127c86f..f9fa957 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -10,9 +10,9 @@ Cell Arrays is a Python package that provides a TileDB-backed store for large co
 
 ## Usage
 
-### Create the `CellArrDataset`
+### Build the `CellArrDataset`
 
-Creating a CellArrDataset generates three TileDB files in the specified output directory:
+Creating a `CellArrDataset` generates three TileDB files in the specified output directory:
 
 - `gene_metadata`: Contains feature annotations.
 - `cell_metadata`: Contains cell or sample metadata.
@@ -43,11 +43,12 @@ dataset = build_cellarrdataset(
      matrix_dim_dtype=np.float32
 )
 ```
+
 ----
 
 #### TODO: This following section does not work yet.
 
-Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializng it to the path where the files were created.
+Users have the option to reuse the `dataset` object retuned when building the dataset or by creating a `CellArrDataset` object by initializing it to the path where the files were created.
 
 ```python
 # Create a CellArrDataset object from the existing dataset
diff --git a/src/cellarr/CellArrDataset.py b/src/cellarr/CellArrDataset.py
index 0dd467a..536cd8a 100644
--- a/src/cellarr/CellArrDataset.py
+++ b/src/cellarr/CellArrDataset.py
@@ -1,34 +1,46 @@
 import os
-from typing import Union
+from typing import List, Union, Sequence
 
+import pandas as pd
 import tiledb
 
+from . import queryutils_tiledb_frame as qtd
+
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
 __license__ = "MIT"
 
 
 class CellArrDataset:
-    """A class that represent a collection of cells in TileDB."""
+    """A class that represent a collection of cells and their associated metadata in a TileDB backed store."""
 
     def __init__(
         self,
         dataset_path: str,
-        counts_tdb_uri: str = "counts",
-        gene_metadata_uri: str = "gene_metadata",
+        matrix_tdb_uri: str = "counts",
+        gene_annotation_uri: str = "gene_annotation",
         cell_metadata_uri: str = "cell_metadata",
+        sample_metadata_uri: str = "sample_metadata",
     ):
-        """Initialize a ``CellArr`` dataset.
+        """Initialize a ``CellArrDataset``.
 
         Args:
+            dataset_path:
+                Path to the directory containing the tiledb stores.
+                Usually the ``output_path`` from the
+                :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+
             counts_tdb_uri:
-                Path to counts TileDB.
+                Relative path to matrix store.
 
-            gene_metadata_uri:
-                Path to gene metadata TileDB.
+            gene_annotation_uri:
+                Relative path to gene annotation store.
 
             cell_metadata_uri:
-                Path to cell metadata TileDB.
+                Relative path to cell metadata store.
+
+            sample_metadata_uri:
+                Relative path to sample metadata store.
         """
 
         if not os.path.isdir(dataset_path):
@@ -36,71 +48,132 @@ def __init__(
 
         self._dataset_path = dataset_path
         # TODO: Maybe switch to on-demand loading of these objects
-        self._counts_tdb_tdb = tiledb.open(f"{dataset_path}/{counts_tdb_uri}", "r")
-        self._gene_metadata_tdb = tiledb.open(
-            f"{dataset_path}/{gene_metadata_uri}", "r"
+        self._matrix_tdb_tdb = tiledb.open(f"{dataset_path}/{matrix_tdb_uri}", "r")
+        self._gene_annotation_tdb = tiledb.open(
+            f"{dataset_path}/{gene_annotation_uri}", "r"
         )
         self._cell_metadata_tdb = tiledb.open(
             f"{dataset_path}/{cell_metadata_uri}", "r"
         )
 
     def __del__(self):
-        self._counts_tdb_tdb.close()
-        self._gene_metadata_tdb.close()
+        self._matrix_tdb_tdb.close()
+        self._gene_annotation_tdb.close()
         self._cell_metadata_tdb.close()
 
-    # TODO:
-    # Methods to implement
-    # search by gene
-    # search by cell metadata
-    # slice counts after search
+    def get_cell_metadata_columns(self) -> List[str]:
+        """Get column names from ``cell_metadata`` store.
 
-    def get_cell_metadata_columns(self):
-        columns = []
-        for i in range(self._cell_metadata_tdb.schema.nattr):
-            columns.append(self._cell_metadata_tdb.schema.attr(i).name)
+        Returns:
+            List of available metadata columns.
+        """
+        return qtd.get_schema_names_frame(self._cell_metadata_tdb)
 
-        return columns
+    def get_cell_metadata_column(self, column_name: str) -> list:
+        """Access a column from the ``cell_metadata`` store.
 
-    def get_cell_metadata_column(self, column_name: str):
-        return self._cell_metadata_tdb.query(attrs=[column_name]).df[:]
+        Args:
+            column_name:
+                Name of the column or attribute. Usually one of the column names
+                from of :py:meth:`~get_cell_metadata_columns`.
+
+        Returns:
+            A list of values for this column.
+        """
+        return qtd.get_a_column(self._cell_metadata_tdb, column_name=column_name)
 
     def get_cell_subset(
         self, subset: Union[slice, tiledb.QueryCondition], columns=None
-    ):
-        if columns is None:
-            columns = self.get_cell_metadata_columns()
+    ) -> pd.DataFrame:
+        """Slice the ``cell_metadata`` store.
+
+        Args:
+            subset:
+                A list of integer indices to subset the ``cell_metadata``
+                store.
+
+                Alternatively, may also provide a
+                :py:class:`tiledb.QueryCondition` to query the store.
 
-        query = self._cell_metadata_tdb.query(cond=subset, attrs=columns)
-        data = query.df[:]
-        result = data.dropna()
-        return result
+            columns:
+                List of specific column names to access.
 
-    def get_gene_metadata_columns(self):
-        columns = []
-        for i in range(self._gene_metadata_tdb.schema.nattr):
-            columns.append(self._gene_metadata_tdb.schema.attr(i).name)
+                Defaults to None, in which case all columns are extracted.
 
-        return columns
+        Returns:
+            A pandas Dataframe of the subset.
+        """
+        return qtd.subset_frame(self._cell_metadata_tdb, subset=subset, columns=columns)
+
+    def get_gene_metadata_columns(self) -> List[str]:
+        """Get annotation column names from ``gene_metadata`` store.
+
+        Returns:
+            List of available annotations.
+        """
+        return qtd.get_schema_names_frame(self._gene_annotation_tdb)
 
     def get_gene_metadata_column(self, column_name: str):
-        return self._gene_metadata_tdb.query(attrs=[column_name]).df[:]
+        """Access a column from the ``gene_metadata`` store.
+
+        Args:
+            column_name:
+                Name of the column or attribute. Usually one of the column names
+                from of :py:meth:`~get_gene_metadata_columns`.
+
+        Returns:
+            A list of values for this column.
+        """
+        return qtd.get_a_column(self._gene_annotation_tdb, column_name=column_name)
+
+    def get_gene_metadata_index(self):
+        """Get index of the ``gene_metadata`` store. This typically should store all unique gene symbols.
+
+        Returns:
+            List of unique symbols.
+        """
+        return qtd.get_index(self._gene_annotation_tdb)
+
+    def _get_indices_for_gene_list(self, query: list) -> List[int]:
+        _gene_index = self.get_gene_metadata_index()
+        return qtd._match_to_list(_gene_index, query=query)
 
     def get_gene_subset(
-        self, subset: Union[slice, tiledb.QueryCondition], columns=None
+        self, subset: Union[slice, List[str], tiledb.QueryCondition], columns=None
     ):
-        if columns is None:
-            columns = self.get_gene_metadata_columns()
+        """Slice the ``gene_metadata`` store.
+
+        Args:
+            subset:
+                A list of integer indices to subset the ``gene_metadata``
+                store.
+
+                Alternatively, may provide a
+                :py:class:`tiledb.QueryCondition` to query the store.
+
+                Alternatively, may provide a list of strings to match with
+                the index of ``gene_metadata`` store.
+
+            columns:
+                List of specific column names to access.
+
+                Defaults to None, in which case all columns are extracted.
+
+        Returns:
+            A pandas Dataframe of the subset.
+        """
+
+        if qtd._is_list_strings(subset):
+            subset = self._get_indices_for_gene_list(subset)
 
-        query = self._gene_metadata_tdb.query(cond=subset, attrs=columns)
-        data = query.df[:]
-        result = data.dropna()
-        return result
+        return qtd.subset_frame(
+            self._gene_annotation_tdb, subset=subset, columns=columns
+        )
 
     def get_slice(
         self,
         cell_subset: Union[slice, tiledb.QueryCondition],
-        gene_subset: Union[slice, tiledb.QueryCondition],
+        gene_subset: Union[slice, List[str], tiledb.QueryCondition],
     ):
         _csubset = self.get_cell_subset(cell_subset)
         _cell_indices = _csubset.index.tolist()
@@ -108,4 +181,46 @@ def get_slice(
         _gsubset = self.get_gene_subset(gene_subset)
         _gene_indices = _gsubset.index.tolist()
 
-        return self._counts_tdb_tdb.multi_index[_cell_indices, _gene_indices]
+        return self._matrix_tdb_tdb.multi_index[_cell_indices, _gene_indices]
+
+    def __getitem__(
+        self,
+        args: Union[int, str, Sequence, tuple],
+    ):
+        """Subset a ``CellArrDataset``.
+
+        Args:
+            args:
+                Integer indices, a boolean filter, or (if the current object is
+                named) names specifying the ranges to be extracted.
+
+                Alternatively a tuple of length 1. The first entry specifies
+                the rows to retain based on their names or indices.
+
+                Alternatively a tuple of length 2. The first entry specifies
+                the rows to retain, while the second entry specifies the
+                columns to retain, based on their names or indices.
+
+        Raises:
+            ValueError:
+                If too many or too few slices provided.
+        """
+        if isinstance(args, (str, int)):
+            return self.get_slice(args, slice(None))
+
+        if isinstance(args, tuple):
+            if len(args) == 0:
+                raise ValueError("At least one slicing argument must be provided.")
+
+            if len(args) == 1:
+                return self.get_slice(args[0], slice(None))
+            elif len(args) == 2:
+                return self.get_slice(args[0], args[1])
+            else:
+                raise ValueError(
+                    f"`{type(self).__name__}` only supports 2-dimensional slicing."
+                )
+
+        raise TypeError(
+            "args must be a sequence or a scalar integer or string or a tuple of atmost 2 values."
+        )
diff --git a/src/cellarr/__init__.py b/src/cellarr/__init__.py
index e7c6f0c..3bcc975 100644
--- a/src/cellarr/__init__.py
+++ b/src/cellarr/__init__.py
@@ -15,5 +15,6 @@
 finally:
     del version, PackageNotFoundError
 
-from .build_cellarrdataset import build_cellarrdataset
+from .build_options import CellMetadataOptions, GeneAnnotationOptions, MatrixOptions, SampleMetadataOptions
+from .buildutils_cellarrdataset import build_cellarrdataset
 from .CellArrDataset import CellArrDataset
diff --git a/src/cellarr/build_cellarrdataset.py b/src/cellarr/build_cellarrdataset.py
deleted file mode 100644
index 2e0eb9c..0000000
--- a/src/cellarr/build_cellarrdataset.py
+++ /dev/null
@@ -1,414 +0,0 @@
-"""Build the `CellArrDatset`
-
-The `CellArrDataset` method is designed to store single-cell RNA-seq
-datasets but can be generalized to store any 2-dimensional experimental data.
-
-This method creates three TileDB files in the directory specified by `output_path`:
-- `gene_metadata`: A TileDB file containing gene metadata.
-- `cell_metadata`: A TileDB file containing cell metadata.
-- A matrix TileDB file named as specified by the `layer_matrix_name` parameter.
-
-The TileDB matrix file is stored in a cell X gene orientation. This orientation
-is chosen because the fastest-changing dimension as new files are added to the
-collection is usually the cells rather than genes.
-
-## Process
-
-1. **Scan the Collection**: Scan the entire collection of files to create
-a unique set of gene symbols. Store this gene set as the
-`gene_metadata` TileDB file.
-2. **Store Cell Metadata**: Store cell metadata as the
-`cell_metadata` TileDB file.
-3. **Remap and Orient Data**: For each dataset in the collection,
-remap and orient the gene dimension using the gene set from Step 1.
-This step ensures consistency in gene measurement and order, even if
-some genes are unmeasured or ordered differently in the original experiments.
-
-
-Example:
-
-    .. code-block:: python
-
-        import anndata
-        import numpy as np
-        import tempfile
-        from cellarr import build_cellarrdataset, CellArrDataset
-
-        # Create a temporary directory
-        tempdir = tempfile.mkdtemp()
-
-        # Read AnnData objects
-        adata1 = anndata.read_h5ad("path/to/object1.h5ad")
-        # or just provide the path
-        adata2 = "path/to/object2.h5ad"
-
-        # Build CellArrDataset
-        dataset = build_cellarrdataset(
-            output_path=tempdir,
-            h5ad_or_adata=[adata1, adata2],
-            matrix_dim_dtype=np.float32
-        )
-"""
-
-import os
-import warnings
-from typing import List, Union
-
-import anndata
-import numpy as np
-import pandas as pd
-
-from . import utils_anndata as uad
-from . import utils_tiledb_array as uta
-from . import utils_tiledb_frame as utf
-from .CellArrDataset import CellArrDataset
-
-__author__ = "Jayaram Kancherla"
-__copyright__ = "Jayaram Kancherla"
-__license__ = "MIT"
-
-
-def build_cellarrdataset(
-    files: List[Union[str, anndata.AnnData]],
-    output_path: str,
-    num_cells: int = None,
-    num_genes: int = None,
-    cell_metadata: Union[pd.DataFrame, str] = None,
-    gene_metadata: Union[List[str], dict, str, pd.DataFrame] = None,
-    var_gene_column: str = "index",
-    layer_matrix_name: str = "counts",
-    skip_gene_tiledb: bool = False,
-    skip_cell_tiledb: bool = False,
-    skip_matrix_tiledb: bool = False,
-    cell_dim_dtype: np.dtype = np.uint32,
-    gene_dim_dtype: np.dtype = np.uint32,
-    matrix_dim_dtype: np.dtype = np.uint32,
-    optimize_tiledb: bool = True,
-    num_threads: int = 1,
-):
-    """Generate the `CellArrDataset`.
-
-    All files are expected to be consistent and any modifications
-    to make them consistent is outside the scope of this function
-    and package.
-
-    There's a few assumptions this process makes:
-    - If object in ``files`` is an AnnData or H5AD object, these must
-    contain an assay matrix in layer names as ``layer_matrix_name``
-    parameter.
-    - Feature information must contain a column defined by
-    ``var_gene_column`` that contains gene symbols or a common entity
-    across all files.
-    -
-
-    Args:
-        files:
-            List of file paths to `H5AD` or ``AnnData`` objects.
-            Each object in this list must contain
-            - gene symbols as index or the column specified by
-            ``var_gene_column``.
-            - Must contain a layers with a matrix named as
-            ``layer_matrix_name``.
-
-        output_path:
-            Path to where the output tiledb files should be stored.
-
-        num_cells:
-            Number of cells across all files.
-
-            Defualts to None, in which case, automatically inferred by
-            scanning all objects in ``h5ad_or_adata``.
-
-        num_genes:
-            Number of genes across all cells.
-
-            Defualts to None, in which case, automatically inferred by
-            scanning all objects in ``h5ad_or_adata``.
-
-        cell_metadata:
-            Path to the file containing a concatenated cell metadata across
-            all cells. In this case, the first row is expected to contain the
-            column names.
-
-            Alternatively, may also provide a dataframe containing the cell
-            metadata across all objects.
-
-            Regardless of the input type, the number of rows in the file or
-            DataFrame must match the ``num_cells`` argument.
-
-            Defaults to None, then a simple range index is created using the
-            ``num_cells`` argument.
-
-        gene_metadata:
-            Path to the file containing a concatenated gene annotations across
-            all datasets. In this case, the first row is
-            expected to contain the column names and an index column
-            containing the gene symbols to remap the matrix.
-
-            Alternatively, may also provide a dataframe containing the gene
-            annotations across all objects.
-
-            Alternatively, a list or a dictionary of gene symbols.
-
-            Regardless of the input type, the number of rows in the file or
-            DataFrame must match the ``num_genes`` argument.
-
-            Defaults to None, then a gene set is generated by scanning all
-            objects in ``h5ad_or_adata``.
-
-        var_gene_column:
-            Column name from ``var`` slot that contains the gene symbols.
-            Must be consistent across all objects in ``h5ad_or_adata``.
-
-            Defaults to "index".
-
-        layer_matrix_name:
-            Matrix name from ``layers`` slot to add to tiledb.
-            Must be consistent across all objects in ``h5ad_or_adata``.
-
-            Defaults to "counts".
-
-        skip_gene_tiledb:
-            Whether to skip generating gene metadata tiledb.
-
-            Defaults to False.
-
-        skip_cell_tiledb:
-            Whether to skip generating cell metadata tiledb.
-
-            Defaults to False.
-
-        skip_matrix_tiledb:
-            Whether to skip generating matrix tiledb.
-
-            Defaults to False.
-
-        cell_dim_dtype:
-            NumPy dtype for the cell dimension.
-            Defaults to np.uint32.
-
-            Note: make sure the number of cells fit
-            within the range limits of unsigned-int32.
-
-        gene_dim_dtype:
-            NumPy dtype for the gene dimension.
-            Defaults to np.uint32.
-
-            Note: make sure the number of genes fit
-            within the range limits of unsigned-int32.
-
-        matrix_dim_dtype:
-            NumPy dtype for the values in the matrix.
-            Defaults to np.uint32.
-
-            Note: make sure the matrix values fit
-            within the range limits of unsigned-int32.
-
-        num_threads:
-            Number of threads.
-
-            Defaults to 1.
-    """
-    if not os.path.isdir(output_path):
-        raise ValueError("'output_path' must be a directory.")
-
-    if gene_metadata is None:
-        warnings.warn(
-            "Scanning all files for gene symbols, this may take long", UserWarning
-        )
-        gene_set = uad.scan_for_genes(
-            files, var_gene_column=var_gene_column, num_threads=num_threads
-        )
-
-        gene_set = sorted(gene_set)
-
-        gene_metadata = pd.DataFrame({"genes": gene_set}, index=gene_set)
-    elif isinstance(gene_metadata, list):
-        _gene_list = sorted(list(set(gene_metadata)))
-        gene_metadata = pd.DataFrame({"genes": _gene_list}, index=_gene_list)
-    elif isinstance(gene_metadata, dict):
-        _gene_list = sorted(list(gene_metadata.keys()))
-        gene_metadata = pd.DataFrame({"genes": _gene_list}, index=_gene_list)
-    elif isinstance(gene_metadata, str):
-        gene_metadata = pd.read_csv(gene_metadata, index=True, header=True)
-
-    gene_metadata["genes_index"] = gene_metadata.index.tolist()
-
-    if not isinstance(gene_metadata, pd.DataFrame):
-        raise TypeError("'gene_metadata' must be a pandas dataframe.")
-
-    if len(gene_metadata.index.unique()) != len(gene_metadata.index.tolist()):
-        raise ValueError("'gene_metadata' must contain a unique index.")
-
-    if num_genes is None:
-        num_genes = len(gene_metadata)
-
-    # Create the gene metadata tiledb
-    if not skip_gene_tiledb:
-        _col_types = {}
-        for col in gene_metadata.columns:
-            _col_types[col] = "ascii"
-
-        _gene_output_uri = f"{output_path}/gene_metadata"
-        generate_metadata_tiledb_frame(
-            _gene_output_uri, gene_metadata, column_types=_col_types
-        )
-
-        if optimize_tiledb:
-            uta.optimize_tiledb_array(_gene_output_uri)
-
-    if cell_metadata is None:
-        if num_cells is None:
-            warnings.warn(
-                "Scanning all files to compute cell counts, this may take long",
-                UserWarning,
-            )
-            cell_counts = uad.scan_for_cellcounts(files, num_threads=num_threads)
-            num_cells = sum(cell_counts)
-
-        cell_metadata = pd.DataFrame({"cell_index": [x for x in range(num_cells)]})
-
-    if isinstance(cell_metadata, str):
-        warnings.warn(
-            "Scanning 'cell_metadata' to count number of cells, this may take long",
-            UserWarning,
-        )
-        with open(cell_metadata) as fp:
-            count = 0
-            for _ in fp:
-                count += 1
-
-        num_cells = count - 1  # removing 1 for the header line
-    elif isinstance(cell_metadata, pd.DataFrame):
-        num_cells = len(cell_metadata)
-
-    if num_cells is None:
-        raise ValueError(
-            "Cannot determine 'num_cells', we recommend setting this parameter."
-        )
-
-    # Create the cell metadata tiledb
-    if not skip_cell_tiledb:
-        _cell_output_uri = f"{output_path}/cell_metadata"
-
-        if isinstance(cell_metadata, str):
-            _cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True)
-            generate_metadata_tiledb_csv(
-                _cell_output_uri, cell_metadata, _cell_metaframe.columns
-            )
-        elif isinstance(cell_metadata, pd.DataFrame):
-            _col_types = {}
-            for col in gene_metadata.columns:
-                _col_types[col] = "ascii"
-
-            _to_write = gene_metadata.astype(str)
-
-            generate_metadata_tiledb_frame(
-                _cell_output_uri, _to_write, column_types=_col_types
-            )
-
-        if optimize_tiledb:
-            uta.optimize_tiledb_array(_cell_output_uri)
-
-    # create the counts metadata
-    if not skip_matrix_tiledb:
-        gene_idx = gene_metadata.index.tolist()
-        gene_set = {}
-        for i, x in enumerate(gene_idx):
-            gene_set[x] = i
-
-        _counts_uri = f"{output_path}/{layer_matrix_name}"
-        uta.create_tiledb_array(
-            _counts_uri,
-            num_cells=num_cells,
-            num_genes=num_genes,
-            matrix_attr_name=layer_matrix_name,
-            x_dim_dtype=cell_dim_dtype,
-            y_dim_dtype=gene_dim_dtype,
-            matrix_dim_dtype=matrix_dim_dtype,
-        )
-
-        offset = 0
-        for fd in files:
-            mat = uad.remap_anndata(
-                fd,
-                gene_set,
-                var_gene_column=var_gene_column,
-                layer_matrix_name=layer_matrix_name,
-            )
-            uta.write_csr_matrix_to_tiledb(
-                _counts_uri, matrix=mat, row_offset=offset, value_dtype=matrix_dim_dtype
-            )
-            offset += int(mat.shape[0])
-
-        if optimize_tiledb:
-            uta.optimize_tiledb_array(_counts_uri)
-
-    return CellArrDataset(dataset_path=output_path, counts_tdb_uri=layer_matrix_name)
-
-
-def generate_metadata_tiledb_frame(
-    output_uri: str, input: pd.DataFrame, column_types: dict = None
-):
-    """Generate metadata tiledb from a :pu:class:`~pandas.DataFrame`.
-
-    Args:
-        output_uri:
-            TileDB URI or path to save the file.
-
-        input:
-            Input dataframe.
-
-        column_types:
-            You can specify type of each column name to cast into.
-            "ascii" or str works best for most scenarios.
-
-            Defaults to None.
-    """
-    _to_write = input.astype(str)
-    utf.create_tiledb_frame_from_dataframe(
-        output_uri, _to_write, column_types=column_types
-    )
-
-
-def generate_metadata_tiledb_csv(
-    output_uri: str,
-    input: str,
-    column_dtype=str,
-    chunksize=1000,
-):
-    """Generate a metadata tiledb from csv.
-
-    The difference between this and ``generate_metadata_tiledb_frame``
-    is when the csv is super large and it won't fit into memory.
-
-    Args:
-        output_uri:
-            TileDB URI or path to save the file.
-
-        input:
-            Path to the csv file. The first row is expected to
-            contain the column names.
-
-        column_dtype:
-            Dtype of the columns.
-            Defaults to str.
-
-        chunksize:
-            Chunk size to read the dataframe.
-            Defaults to 1000.
-    """
-    chunksize = 1000
-    initfile = True
-    offset = 0
-
-    for chunk in pd.read_csv(input, chunksize=chunksize, header=True):
-        if initfile:
-            utf.create_tiledb_frame_from_column_names(
-                output_uri, chunk.columns, column_dtype
-            )
-            initfile = False
-
-        _to_write = chunk.astype(str)
-        utf.append_to_tiledb_frame(output_uri, _to_write, offset)
-        offset += len(chunk)
diff --git a/src/cellarr/build_options.py b/src/cellarr/build_options.py
new file mode 100644
index 0000000..94ab665
--- /dev/null
+++ b/src/cellarr/build_options.py
@@ -0,0 +1,125 @@
+import numpy as np
+
+from dataclasses import dataclass
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+@dataclass
+class CellMetadataOptions:
+    """Optional arguments for the ``cell_metadata`` store for
+    :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+
+    Attributes:
+        skip:
+            Whether to skip generating cell metadata tiledb.
+            Defaults to False.
+
+        dtype:
+            NumPy dtype for the cell dimension.
+            Defaults to np.uint32.
+
+            Note: make sure the number of cells fit
+            within the integer limits of unsigned-int32.
+
+        tiledb_store_name:
+            Name of the tiledb file.
+            Defaults to "cell_metadata".
+    """
+
+    skip: bool = False
+    dtype: np.dtype = np.uint32
+    tiledb_store_name: str = "cell_metadata"
+
+
+@dataclass
+class GeneAnnotationOptions:
+    """Optional arguments for the ``gene_annotation`` store for
+    :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+
+    Attributes:
+        feature_column:
+            Column in ``var`` containing the feature ids (e.g. gene symbols).
+            Defaults to the index of the ``var`` slot.
+
+        skip:
+            Whether to skip generating gene annotation tiledb.
+            Defaults to False.
+
+        dtype:
+            NumPy dtype for the gene dimension.
+            Defaults to np.uint32.
+
+            Note: make sure the number of genes fit
+            within the integer limits of unsigned-int32.
+
+        tiledb_store_name:
+            Name of the tiledb file.
+            Defaults to "gene_annotation".
+    """
+
+    skip: bool = False
+    feature_column: str = "index"
+    dtype: np.dtype = np.uint32
+    tiledb_store_name: str = "gene_annotation"
+
+
+@dataclass
+class MatrixOptions:
+    """Optional arguments for the ``matrix`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+
+    Attributes:
+        matrix_name:
+            Matrix name from ``layers`` slot to add to tiledb.
+            Must be consistent across all objects in ``files``.
+
+            Defaults to "counts".
+
+        skip:
+            Whether to skip generating matrix tiledb.
+            Defaults to False.
+
+        dtype:
+            NumPy dtype for the values in the matrix.
+            Defaults to np.uint16.
+
+            Note: make sure the matrix values fit
+            within the range limits of unsigned-int16.
+
+        tiledb_store_name:
+            Name of the tiledb file.
+            Defaults to `matrix`.
+    """
+
+    skip: bool = False
+    matrix_name: str = "counts"
+    dtype: np.dtype = np.uint16
+    tiledb_store_name: str = "counts"
+
+
+@dataclass
+class SampleMetadataOptions:
+    """Optional arguments for the ``sample`` store for :py:func:`~cellarr.build_cellarrdataset.build_cellarrdataset`.
+
+    Attributes:
+        skip:
+            Whether to skip generating sample tiledb.
+            Defaults to False.
+
+        dtype:
+            NumPy dtype for the sample dimension.
+            Defaults to np.uint32.
+
+            Note: make sure the number of samples fit
+            within the integer limits of unsigned-int32.
+
+        tiledb_store_name:
+            Name of the tiledb file.
+            Defaults to "sample_metadata".
+    """
+
+    skip: bool = False
+    dtype: np.dtype = np.uint32
+    tiledb_store_name: str = "sample_metadata"
diff --git a/src/cellarr/buildutils_cellarrdataset.py b/src/cellarr/buildutils_cellarrdataset.py
new file mode 100644
index 0000000..61b0b9f
--- /dev/null
+++ b/src/cellarr/buildutils_cellarrdataset.py
@@ -0,0 +1,476 @@
+"""Build the `CellArrDatset`.
+
+The `CellArrDataset` method is designed to store single-cell RNA-seq
+datasets but can be generalized to store any 2-dimensional experimental data.
+
+This method creates three TileDB files in the directory specified by `output_path`:
+
+- `cell_metadata`: A TileDB file containing cell metadata.
+- `gene_annotation`: A TileDB file containing feature/gene annotations.
+- A matrix TileDB file named by the `layer_matrix_name` parameter.
+- `sample_metadata`: A TileDB file containing sample metadata.
+
+The TileDB matrix file is stored in a ``cell X gene`` orientation. This orientation
+is chosen because the fastest-changing dimension as new files are added to the
+collection is usually the cells rather than genes.
+
+Process:
+
+1. **Scan the Collection**: Scan the entire collection of files to create
+a unique set of feature ids (e.g. gene symbols). Store this set as the
+`gene_annotation` TileDB file.
+
+2. **Store Cell Metadata**: Store cell metadata in the `cell_metadata`
+TileDB file.
+
+3. **Remap and Orient Data**: For each dataset in the collection,
+remap and orient the feature dimension using the feature set from Step 1.
+This step ensures consistency in gene measurement and order, even if
+some genes are unmeasured or ordered differently in the original experiments.
+
+4. **Sample Metadata**: Store sample metadata in `sample_metadata`
+TileDB file.
+
+Example:
+
+    .. code-block:: python
+
+        import anndata
+        import numpy as np
+        import tempfile
+        from cellarr import build_cellarrdataset, CellArrDataset
+
+        # Create a temporary directory
+        tempdir = tempfile.mkdtemp()
+
+        # Read AnnData objects
+        adata1 = anndata.read_h5ad("path/to/object1.h5ad")
+        # or just provide the path
+        adata2 = "path/to/object2.h5ad"
+
+        # Build CellArrDataset
+        dataset = build_cellarrdataset(
+            output_path=tempdir,
+            files=[adata1, adata2],
+            matrix_dim_dtype=np.float32
+        )
+"""
+
+import os
+import warnings
+from typing import List, Union
+
+import anndata
+import pandas as pd
+
+from . import utils_anndata as uad
+from . import buildutils_tiledb_array as uta
+from . import buildutils_tiledb_frame as utf
+from .CellArrDataset import CellArrDataset
+from . import build_options as bopt
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+# TODO: Accept files as a dictionary with names to each dataset.
+def build_cellarrdataset(
+    files: List[Union[str, anndata.AnnData]],
+    output_path: str,
+    gene_annotation: Union[List[str], str, pd.DataFrame] = None,
+    sample_metadata: Union[pd.DataFrame, str] = None,
+    cell_metadata: Union[pd.DataFrame, str] = None,
+    sample_metadata_options: bopt.SampleMetadataOptions = bopt.SampleMetadataOptions(),
+    cell_metadata_options: bopt.CellMetadataOptions = bopt.CellMetadataOptions(),
+    gene_annotation_options: bopt.GeneAnnotationOptions = bopt.GeneAnnotationOptions(),
+    matrix_options: bopt.MatrixOptions = bopt.MatrixOptions(),
+    optimize_tiledb: bool = True,
+    num_threads: int = 1,
+):
+    """Generate the `CellArrDataset`.
+
+    All files are expected to be consistent and any modifications
+    to make them consistent is outside the scope of this function
+    and package.
+
+    There's a few assumptions this process makes:
+    - If object in ``files`` is an :py:class:`~anndata.AnnData`
+    or H5AD object, these must contain an assay matrix in layer
+    names as ``layer_matrix_name`` parameter.
+    - Feature information must contain a column defined by
+    ``feature_column`` in the
+    :py:class:`~cellarr.build_options.GeneAnnotationOptions.` that
+    contains feature ids or gene symbols across all files.
+    - If no ``cell_metadata`` is provided, we scan to count the number of cells
+    and create a simple range index.
+
+    Args:
+        files:
+            List of file paths to `H5AD` or ``AnnData`` objects.
+
+        output_path:
+            Path to where the output tiledb files should be stored.
+
+        gene_metadata:
+            A :py:class:`~pandas.DataFrame` containing the feature/gene
+            annotations across all objects.
+
+            Alternatively, may provide a path to the file containing
+            a concatenated gene annotations across all datasets.
+            In this case, the first row is expected to contain the
+            column names and an index column containing the feature ids
+            or gene symbols.
+
+            Alternatively, a list or a dictionary of gene symbols.
+
+            Irrespective of the input, the object will be appended
+            with a ``cellarr_gene_index`` column that contains the
+            store the gene index across all objects.
+
+            Defaults to None, then a gene set is generated by scanning all
+            objects in ``files``.
+
+        sample_metadata:
+            A :py:class:`~pandas.DataFrame` containing the sample
+            metadata for each file in ``files``. Hences the number of rows
+            in the dataframe must match the number of files.
+
+            Alternatively, may provide path to the file containing a
+            concatenated sample metadata across all cells. In this case,
+            the first row is expected to contain the column names.
+
+            Additionally, the order of rows is expected to be in the same
+            order as the input list of ``files``.
+
+            Irrespective of the input, this object is appended with a
+            ``cellarr_original_gene_list`` column that contains the original
+            set of feature ids (or gene symbols) from the dataset to
+            differentiate between zero-expressed vs unmeasured genes.
+
+            Defaults to `None`, in which case, we create a simple sample
+            metadata dataframe containing the list of datasets.
+            Each dataset is named as ``sample_{i}`` where `i` refers to
+            the index position of the object in ``files``.
+
+        cell_metadata:
+            A :py:class:`~pandas.DataFrame` containing the cell
+            metadata for cells across ``files``. Hences the number of rows
+            in the dataframe must match the number of cells across
+            all files.
+
+            Alternatively, may provide path to the file containing a
+            concatenated cell metadata across all cells. In this case,
+            the first row is expected to contain the column names.
+
+            Additionally, the order of cells is expected to be in the same
+            order as the input list of ``files``. If the input is a path,
+            the file is expected to contain mappings between cells and
+            datasets (or samples).
+
+            Defaults to None, we scan all files to count the number of cells,
+            then create a simple cell metadata DataFrame containing mappings from
+            cells to their associated datasets. Each dataset is named as
+            ``sample_{i}`` where `i` refers to the index position of
+            the object in ``files``.
+
+        sample_metadata_options:
+            Optional parameters when generating ``sample_metadata`` store.
+
+        cell_metadata_options:
+            Optional parameters when generating ``cell_metadata`` store.
+
+        gene_annotation_options:
+            Optional parameters when generating ``gene_annotation`` store.
+
+        matrix_options:
+            Optional parameters when generating ``matrix`` store.
+
+        optimize_tiledb:
+            Whether to run TileDb's vaccum and consolidation (may take long).
+
+        num_threads:
+            Number of threads.
+            Defaults to 1.
+    """
+    if not os.path.isdir(output_path):
+        raise ValueError("'output_path' must be a directory.")
+
+    ####
+    ## Writing gene annotation file
+    ####
+    if gene_annotation is None:
+        warnings.warn(
+            "Scanning all files for feature ids (e.g. gene symbols), this may take long",
+            UserWarning,
+        )
+        gene_set = uad.scan_for_features(
+            files,
+            var_feature_column=gene_annotation_options.feature_column,
+            num_threads=num_threads,
+        )
+
+        gene_set = sorted(gene_set)
+
+        gene_annotation = pd.DataFrame({"cellarr_gene_index": gene_set}, index=gene_set)
+    elif isinstance(gene_annotation, list):
+        _gene_list = sorted(list(set(gene_annotation)))
+        gene_annotation = pd.DataFrame(
+            {"cellarr_gene_index": _gene_list}, index=_gene_list
+        )
+    elif isinstance(gene_annotation, str):
+        gene_annotation = pd.read_csv(gene_annotation, index=True, header=True)
+        gene_annotation["cellarr_gene_index"] = gene_annotation.index.tolist()
+    else:
+        raise TypeError("'gene_annotation' is not an expected type.")
+
+    if not isinstance(gene_annotation, pd.DataFrame):
+        raise TypeError("'gene_annotation' must be a pandas dataframe.")
+
+    if len(gene_annotation.index.unique()) != len(gene_annotation.index.tolist()):
+        raise ValueError(
+            "'gene_annotation' must contain unique feature ids or gene symbols."
+        )
+
+    gene_annotation.reset_index(drop=True, inplace=True)
+
+    # Create the gene annotation tiledb
+    if not gene_annotation_options.skip:
+        _col_types = {}
+        for col in gene_annotation.columns:
+            _col_types[col] = "ascii"
+
+        _gene_output_uri = f"{output_path}/{gene_annotation_options.tiledb_store_name}"
+        generate_metadata_tiledb_frame(
+            _gene_output_uri, gene_annotation, column_types=_col_types
+        )
+
+        if optimize_tiledb:
+            uta.optimize_tiledb_array(_gene_output_uri)
+
+    ####
+    ## Writing the sample metadata file
+    ####
+    _samples = []
+    for idx, _ in enumerate(files):
+        _samples.append(f"sample_{idx}")
+    if sample_metadata is None:
+        warnings.warn(
+            "Sample metadata is not provided, each dataset in 'files' is considered a sample",
+            UserWarning,
+        )
+
+        sample_metadata = pd.DataFrame({"cellarr_sample": _samples})
+    elif isinstance(sample_metadata, str):
+        sample_metadata = pd.read_csv(sample_metadata, header=True)
+        sample_metadata["cellarr_sample"] = _samples
+    else:
+        raise TypeError("'sample_metadata' is not an expected type.")
+
+    if not sample_metadata_options.skip:
+        warnings.warn(
+            "Scanning all files for feature ids (e.g. gene symbols), this may take long",
+            UserWarning,
+        )
+        gene_scan_set = uad.scan_for_features(
+            files,
+            var_feature_column=gene_annotation_options.feature_column,
+            num_threads=num_threads,
+            unique=False,
+        )
+        gene_set_str = [",".join(x) for x in gene_scan_set]
+        sample_metadata["cellarr_original_gene_set"] = gene_set_str
+
+        _col_types = {}
+        for col in sample_metadata.columns:
+            _col_types[col] = "ascii"
+
+        _sample_output_uri = (
+            f"{output_path}/{sample_metadata_options.tiledb_store_name}"
+        )
+        generate_metadata_tiledb_frame(
+            _sample_output_uri, sample_metadata, column_types=_col_types
+        )
+
+        if optimize_tiledb:
+            uta.optimize_tiledb_array(_sample_output_uri)
+
+    ####
+    ## Writing the cell metadata file
+    ####
+    warnings.warn(
+        "Scanning all files to compute cell counts, this may take long",
+        UserWarning,
+    )
+    cell_counts = uad.scan_for_cellcounts(files, num_threads=num_threads)
+    _cellindex_in_dataset = []
+    _dataset = []
+    for idx, cci in enumerate(cell_counts):
+        _cellindex_in_dataset.extend([x for x in range(cci)])
+        _dataset.extend([f"dataset_{idx}" for _ in range(cci)])
+
+    if cell_metadata is None:
+        cell_metadata = pd.DataFrame(
+            {"cellarr_cell_counts": _cellindex_in_dataset, "cellarr_sample": _dataset}
+        )
+    elif isinstance(cell_metadata, str):
+        warnings.warn(
+            "Scanning 'cell_metadata' csv file to count number of cells, this may take long",
+            UserWarning,
+        )
+        with open(cell_metadata) as fp:
+            count = 0
+            for _ in fp:
+                count += 1
+
+        if sum(cell_counts) != count - 1:
+            raise ValueError(
+                "Number of rows in the 'cell_metadata' csv does not match the number of cells across files."
+            )
+
+        warnings.warn(
+            "'cell_metadata' csv file is expected to contain mapping between cells and samples",
+            UserWarning,
+        )
+    elif isinstance(cell_metadata, pd.DataFrame):
+        if sum(cell_counts) != len(cell_metadata):
+            raise ValueError(
+                "Number of rows in 'cell_metadata' does not match the number of cells across files."
+            )
+
+        cell_metadata["cellarr_sample"] = _dataset
+
+    # Create the cell metadata tiledb
+    if not cell_metadata_options.skip:
+        _cell_output_uri = f"{output_path}/cell_metadata"
+
+        if isinstance(cell_metadata, str):
+            _cell_metaframe = pd.read_csv(cell_metadata, chunksize=5, header=True)
+            generate_metadata_tiledb_csv(
+                _cell_output_uri, cell_metadata, _cell_metaframe.columns
+            )
+        elif isinstance(cell_metadata, pd.DataFrame):
+            _col_types = {}
+            for col in cell_metadata.columns:
+                _col_types[col] = "ascii"
+
+            _to_write = cell_metadata.astype(str)
+
+            generate_metadata_tiledb_frame(
+                _cell_output_uri, _to_write, column_types=_col_types
+            )
+
+        if optimize_tiledb:
+            uta.optimize_tiledb_array(_cell_output_uri)
+
+    ####
+    ## Writing the matrix file
+    ####
+    if not matrix_options.skip:
+        gene_idx = gene_annotation["cellarr_gene_index"].tolist()
+        gene_set = {}
+        for i, x in enumerate(gene_idx):
+            gene_set[x] = i
+
+        _counts_uri = f"{output_path}/{matrix_options.tiledb_store_name}"
+        uta.create_tiledb_array(
+            _counts_uri,
+            matrix_attr_name=matrix_options.matrix_name,
+            x_dim_dtype=cell_metadata_options.dtype,
+            y_dim_dtype=gene_annotation_options.dtype,
+            matrix_dim_dtype=matrix_options.dtype,
+        )
+
+        offset = 0
+        for fd in files:
+            mat = uad.remap_anndata(
+                fd,
+                gene_set,
+                var_feature_column=gene_annotation_options.feature_column,
+                layer_matrix_name=matrix_options.matrix_name,
+            )
+            uta.write_csr_matrix_to_tiledb(
+                _counts_uri,
+                matrix=mat,
+                row_offset=offset,
+                value_dtype=matrix_options.dtype,
+            )
+            offset += int(mat.shape[0])
+
+        if optimize_tiledb:
+            uta.optimize_tiledb_array(_counts_uri)
+
+    return CellArrDataset(
+        dataset_path=output_path,
+        sample_metadata_uri=sample_metadata_options.tiledb_store_name,
+        cell_metadata_uri=cell_metadata_options.tiledb_store_name,
+        gene_annotation_uri=gene_annotation_options.tiledb_store_name,
+        matrix_tdb_uri=matrix_options.tiledb_store_name,
+    )
+
+
+def generate_metadata_tiledb_frame(
+    output_uri: str, input: pd.DataFrame, column_types: dict = None
+):
+    """Generate metadata tiledb from a :pu:class:`~pandas.DataFrame`.
+
+    Args:
+        output_uri:
+            TileDB URI or path to save the file.
+
+        input:
+            Input dataframe.
+
+        column_types:
+            You can specify type of each column name to cast into.
+            "ascii" or str works best for most scenarios.
+
+            Defaults to None.
+    """
+    _to_write = input.astype(str)
+    utf.create_tiledb_frame_from_dataframe(
+        output_uri, _to_write, column_types=column_types
+    )
+
+
+def generate_metadata_tiledb_csv(
+    output_uri: str,
+    input: str,
+    column_dtype=str,
+    chunksize=1000,
+):
+    """Generate a metadata tiledb from csv.
+
+    The difference between this and ``generate_metadata_tiledb_frame``
+    is when the csv is super large and it won't fit into memory.
+
+    Args:
+        output_uri:
+            TileDB URI or path to save the file.
+
+        input:
+            Path to the csv file. The first row is expected to
+            contain the column names.
+
+        column_dtype:
+            Dtype of the columns.
+            Defaults to str.
+
+        chunksize:
+            Chunk size to read the dataframe.
+            Defaults to 1000.
+    """
+    chunksize = 1000
+    initfile = True
+    offset = 0
+
+    for chunk in pd.read_csv(input, chunksize=chunksize, header=True):
+        if initfile:
+            utf.create_tiledb_frame_from_column_names(
+                output_uri, chunk.columns, column_dtype
+            )
+            initfile = False
+
+        _to_write = chunk.astype(str)
+        utf.append_to_tiledb_frame(output_uri, _to_write, offset)
+        offset += len(chunk)
diff --git a/src/cellarr/utils_tiledb_array.py b/src/cellarr/buildutils_tiledb_array.py
similarity index 83%
rename from src/cellarr/utils_tiledb_array.py
rename to src/cellarr/buildutils_tiledb_array.py
index f967616..383d424 100644
--- a/src/cellarr/utils_tiledb_array.py
+++ b/src/cellarr/buildutils_tiledb_array.py
@@ -13,8 +13,8 @@
 
 def create_tiledb_array(
     tiledb_uri_path: str,
-    num_cells: int,
-    num_genes: int,
+    x_dim_length: int = None,
+    y_dim_length: int = None,
     x_dim_name: str = "cell_index",
     y_dim_name: str = "gene_index",
     matrix_attr_name: str = "counts",
@@ -32,11 +32,17 @@ def create_tiledb_array(
         tiledb_uri_path:
             Path to create the array tiledb file.
 
-        num_cells:
-            Number of cells (x/fastest-changing dimension).
+        x_dim_length:
+            Number of entries along the x/fastest-changing dimension.
+            e.g. Number of cells.
+            Defaults to None, in which case, the max integer value of
+            ``x_dim_dtype`` is used.
 
-        num_genes:
-            Number fo genes (y dimension).
+        y_dim_length:
+            Number of entries along the y dimension.
+            e.g. Number of genes.
+            Defaults to None, in which case, the max integer value of
+            ``y_dim_dtype`` is used.
 
         x_dim_name:
             Name for the x-dimension.
@@ -67,8 +73,14 @@ def create_tiledb_array(
             Defaults to True.
     """
 
-    xdim = tiledb.Dim(name=x_dim_name, domain=(0, num_cells - 1), dtype=x_dim_dtype)
-    ydim = tiledb.Dim(name=y_dim_name, domain=(0, num_genes - 1), dtype=y_dim_dtype)
+    if x_dim_length is None:
+        x_dim_length = np.iinfo(x_dim_dtype).max
+
+    if y_dim_length is None:
+        y_dim_length = np.iinfo(y_dim_dtype).max
+
+    xdim = tiledb.Dim(name=x_dim_name, domain=(0, x_dim_length - 1), dtype=x_dim_dtype)
+    ydim = tiledb.Dim(name=y_dim_name, domain=(0, y_dim_length - 1), dtype=y_dim_dtype)
 
     dom = tiledb.Domain(xdim, ydim)
 
diff --git a/src/cellarr/utils_tiledb_frame.py b/src/cellarr/buildutils_tiledb_frame.py
similarity index 100%
rename from src/cellarr/utils_tiledb_frame.py
rename to src/cellarr/buildutils_tiledb_frame.py
diff --git a/src/cellarr/queryutils_tiledb_frame.py b/src/cellarr/queryutils_tiledb_frame.py
new file mode 100644
index 0000000..50af7fc
--- /dev/null
+++ b/src/cellarr/queryutils_tiledb_frame.py
@@ -0,0 +1,78 @@
+from functools import lru_cache
+from typing import List, Union
+from warnings import warn
+
+import pandas as pd
+import tiledb
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+@lru_cache
+def get_schema_names_frame(tiledb_obj: tiledb.Array) -> List[str]:
+    columns = []
+    for i in range(tiledb_obj.schema.nattr):
+        columns.append(tiledb_obj.schema.attr(i).name)
+
+    return columns
+
+
+def subset_frame(
+    tiledb_obj: tiledb.Array,
+    subset: Union[slice, tiledb.QueryCondition],
+    columns: Union[str, list] = None,
+) -> pd.DataFrame:
+
+    _avail_columns = get_schema_names_frame(tiledb_obj)
+
+    if columns is None:
+        columns = _avail_columns
+    else:
+        _not_avail = []
+        for col in columns:
+            if col not in _avail_columns:
+                _not_avail.append(col)
+
+        if len(_not_avail) > 0:
+            raise ValueError(f"Columns '{', '.join(_not_avail)}' are not available.")
+
+    if isinstance(columns, str):
+        warn(
+            "provided subset is string, its expected to be a 'query_condition'",
+            UserWarning,
+        )
+
+        query = tiledb_obj.query(cond=subset, attrs=columns)
+        data = query.df[:]
+    else:
+        data = query.df[subset, columns]
+    result = data.dropna()
+    return result
+
+
+def get_a_column(tiledb_obj: tiledb.Array, column_name: str) -> list:
+    if column_name not in get_schema_names_frame(tiledb_obj):
+        raise ValueError(f"Column '{column_name}' does not exist.")
+
+    return tiledb_obj.query(attrs=[column_name]).df[:]
+
+
+@lru_cache
+def get_index(tiledb_obj: tiledb.Array) -> list:
+    _index = tiledb_obj.unique_dim_values("__tiledb_rows")
+    return [x.decode() for x in _index]
+
+
+def _match_to_list(x: list, query: list):
+    return sorted([x.index(x) for x in query])
+
+
+def _is_list_strings(x: list):
+    _ret = False
+
+    if isinstance(x, (list, tuple)) and all(isinstance(y, str) for y in x):
+        _ret = True
+
+    return _ret
diff --git a/src/cellarr/utils_anndata.py b/src/cellarr/utils_anndata.py
index 5921e08..787748d 100644
--- a/src/cellarr/utils_anndata.py
+++ b/src/cellarr/utils_anndata.py
@@ -6,18 +6,19 @@
 import numpy as np
 from scipy.sparse import coo_matrix, csr_array, csr_matrix
 
-__author__ = "Jayaram Kancherla, Tony Kuo"
+__author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
 __license__ = "MIT"
 
 
 def remap_anndata(
     h5ad_or_adata: Union[str, anndata.AnnData],
-    gene_set_order: dict,
-    var_gene_column: str = "index",
+    feature_set_order: dict,
+    var_feature_column: str = "index",
     layer_matrix_name: str = "counts",
 ) -> csr_matrix:
-    """Extract and remap the count matrix to the provided gene set order from the :py:class:`~anndata.AnnData` object.
+    """Extract and remap the count matrix to the provided feature (gene) set order from the :py:class:`~anndata.AnnData`
+    object.
 
     Args:
         adata:
@@ -25,16 +26,17 @@ def remap_anndata(
 
             Alternatively, may also provide a path to the H5ad file.
 
-            The index of the `var` slot must contain the gene symbols
+            The index of the `var` slot must contain the feature ids
             for the columns in the matrix.
 
-        gene_set_order:
-            A dictionary with the symbols as keys and their index as value.
-            The gene symbols from the ``AnnData`` object are remapped to the
-            gene order from this dictionary.
+        feature_set_order:
+            A dictionary with the feature ids as keys and their index as
+            value (e.g. gene symbols). The feature ids from the
+            ``AnnData`` object are remapped to the feature order from
+            this dictionary.
 
-        var_gene_column:
-            Column in ``var`` containing the symbols.
+        var_feature_column:
+            Column in ``var`` containing the feature ids (e.g. gene symbols).
             Defaults to the index of the ``var`` slot.
 
         layer_matrix_name:
@@ -55,19 +57,19 @@ def remap_anndata(
 
     mat = adata.layers[layer_matrix_name]
 
-    if var_gene_column == "index":
+    if var_feature_column == "index":
         symbols = adata.var.index.tolist()
     else:
-        symbols = adata.var[var_gene_column].tolist()
+        symbols = adata.var[var_feature_column].tolist()
 
-    indices_to_keep = [i for i, x in enumerate(symbols) if x in gene_set_order]
+    indices_to_keep = [i for i, x in enumerate(symbols) if x in feature_set_order]
     symbols_to_keep = [symbols[i] for i in indices_to_keep]
 
     mat = mat[:, indices_to_keep].copy()
 
     indices_to_map = []
     for x in symbols_to_keep:
-        indices_to_map.append(gene_set_order[x])
+        indices_to_map.append(feature_set_order[x])
 
     if isinstance(mat, np.ndarray):
         mat_coo = coo_matrix(mat)
@@ -80,13 +82,13 @@ def remap_anndata(
 
     return coo_matrix(
         (mat_coo.data, (mat_coo.row, new_col)),
-        shape=(adata.shape[0], len(gene_set_order)),
+        shape=(adata.shape[0], len(feature_set_order)),
     ).tocsr()
 
 
-def _get_genes_index(
+def _get_feature_index(
     h5ad_or_adata: Union[str, anndata.AnnData],
-    var_gene_column: str = "index",
+    var_feature_column: str = "index",
 ) -> List[str]:
     if isinstance(h5ad_or_adata, str):
         adata = anndata.read_h5ad(h5ad_or_adata, backed=True)
@@ -96,32 +98,33 @@ def _get_genes_index(
 
         adata = h5ad_or_adata
 
-    if var_gene_column == "index":
+    if var_feature_column == "index":
         symbols = adata.var.index.tolist()
     else:
-        symbols = adata.var[var_gene_column].tolist()
+        symbols = adata.var[var_feature_column].tolist()
 
     return symbols
 
 
-def _wrapper_get_genes(args):
+def _wrapper_get_feature_ids(args):
     file, gcol = args
-    return _get_genes_index(file, gcol)
+    return _get_feature_index(file, gcol)
 
 
-def scan_for_genes(
+def scan_for_features(
     h5ad_or_adata: List[Union[str, anndata.AnnData]],
-    var_gene_column: str = "index",
+    var_feature_column: str = "index",
     num_threads: int = 1,
+    unique: bool = True,
 ) -> List[str]:
-    """Extract and generate the list of unique genes identifiers across files.
+    """Extract and generate the list of unique feature identifiers across files.
 
     Args:
         h5ad_or_adata:
             List of anndata objects or path to h5ad files.
 
-        var_gene_column:
-            Column containing the gene symbols.
+        var_feature_column:
+            Column containing the feature ids (e.g. gene symbols).
             Defaults to "index".
 
         num_threads:
@@ -129,12 +132,15 @@ def scan_for_genes(
             Defaults to 1.
 
     Returns:
-        List of all unique gene symbols across all files.
+        List of all unique feature ids across all files.
     """
     with Pool(num_threads) as p:
-        _args = [(file_info, var_gene_column) for file_info in h5ad_or_adata]
-        all_symbols = p.map(_wrapper_get_genes, _args)
-        return list(set(itertools.chain.from_iterable(all_symbols)))
+        _args = [(file_info, var_feature_column) for file_info in h5ad_or_adata]
+        all_symbols = p.map(_wrapper_get_feature_ids, _args)
+        if unique:
+            return list(set(itertools.chain.from_iterable(all_symbols)))
+
+        return all_symbols
 
 
 def _get_cellcounts(h5ad_or_adata: Union[str, anndata.AnnData]) -> int:
diff --git a/tests/test_build.py b/tests/test_build.py
index a09a23a..c6cabad 100644
--- a/tests/test_build.py
+++ b/tests/test_build.py
@@ -5,7 +5,7 @@
 import pandas as pd
 import pytest
 import tiledb
-from cellarr import build_cellarrdataset
+from cellarr import build_cellarrdataset, MatrixOptions
 
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
@@ -28,31 +28,44 @@ def generate_adata(n, d, k):
     return adata
 
 
-def test_build_tiledb():
+def test_build_cellarrdataset():
     tempdir = tempfile.mkdtemp()
 
     adata1 = generate_adata(1000, 100, 10)
     adata2 = generate_adata(100, 1000, 100)
 
     build_cellarrdataset(
-        output_path=tempdir, files=[adata1, adata2], matrix_dim_dtype=np.float32
+        output_path=tempdir,
+        files=[adata1, adata2],
+        matrix_options=MatrixOptions(dtype=np.float32),
     )
 
     cfp = tiledb.open(f"{tempdir}/counts", "r")
-    gfp = tiledb.open(f"{tempdir}/gene_metadata", "r")
+    gfp = tiledb.open(f"{tempdir}/gene_annotation", "r")
 
     genes = gfp.df[:]
 
+    print(genes)
+
     gene_list = ["gene_1", "gene_95", "gene_50"]
-    gene_indices_tdb = sorted([genes.index.tolist().index(x) for x in gene_list])
+    _genes_from_tile = genes["cellarr_gene_index"].tolist()
+    # print(_genes_from_tile)
+    gene_indices_tdb = sorted([_genes_from_tile.index(x) for x in gene_list])
 
+    print(gene_indices_tdb)
     adata1_gene_indices = sorted(
         [adata1.var.index.tolist().index(x) for x in gene_list]
     )
+
+    print(adata1_gene_indices)
     adata2_gene_indices = sorted(
         [adata2.var.index.tolist().index(x) for x in gene_list]
     )
 
+    print(adata2_gene_indices)
+
+    print(cfp.multi_index[0, gene_indices_tdb])
+
     assert np.allclose(
         cfp.multi_index[0, gene_indices_tdb]["counts"],
         adata1.layers["counts"][0, adata1_gene_indices],
diff --git a/tests/test_dataset.py b/tests/test_dataset.py
deleted file mode 100644
index 1b99c5a..0000000
--- a/tests/test_dataset.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import tempfile
-
-import anndata
-import numpy as np
-import pandas as pd
-import pytest
-import tiledb
-from cellarr import CellArrDataset, build_cellarrdataset
-
-__author__ = "Jayaram Kancherla"
-__copyright__ = "Jayaram Kancherla"
-__license__ = "MIT"
-
-
-def generate_adata(n, d, k):
-    np.random.seed(1)
-
-    z = np.random.normal(loc=np.arange(k), scale=np.arange(k) * 2, size=(n, k))
-    w = np.random.normal(size=(d, k))
-    y = np.dot(z, w.T)
-
-    gene_index = [f"gene_{i+1}" for i in range(d)]
-    var_df = pd.DataFrame({"names": gene_index}, index=gene_index)
-    obs_df = pd.DataFrame({"cells": [f"cell1_{j+1}" for j in range(n)]})
-
-    adata = anndata.AnnData(layers={"counts": y}, var=var_df, obs=obs_df)
-
-    return adata
-
-
-def test_build_tiledb():
-    tempdir = tempfile.mkdtemp()
-
-    adata1 = generate_adata(1000, 100, 10)
-    adata2 = generate_adata(100, 1000, 100)
-
-    dataset = build_cellarrdataset(
-        output_path=tempdir, files=[adata1, adata2], matrix_dim_dtype=np.float32
-    )
-
-    assert dataset is not None
-    assert isinstance(dataset, CellArrDataset)
-
-    cfp = tiledb.open(f"{tempdir}/counts", "r")
-    gfp = tiledb.open(f"{tempdir}/gene_metadata", "r")
-
-    genes = gfp.df[:]
-
-    gene_list = ["gene_1", "gene_95", "gene_50"]
-    gene_indices_tdb = sorted([genes.index.tolist().index(x) for x in gene_list])
-
-    adata1_gene_indices = sorted(
-        [adata1.var.index.tolist().index(x) for x in gene_list]
-    )
-    adata2_gene_indices = sorted(
-        [adata2.var.index.tolist().index(x) for x in gene_list]
-    )
-
-    assert np.allclose(
-        cfp.multi_index[0, gene_indices_tdb]["counts"],
-        adata1.layers["counts"][0, adata1_gene_indices],
-    )
-    assert np.allclose(
-        cfp.multi_index[1000, gene_indices_tdb]["counts"],
-        adata2.layers["counts"][0, adata2_gene_indices],
-    )
diff --git a/tests/test_query.py b/tests/test_query.py
new file mode 100644
index 0000000..4d8ce7d
--- /dev/null
+++ b/tests/test_query.py
@@ -0,0 +1,59 @@
+# import tempfile
+
+# import anndata
+# import numpy as np
+# import pandas as pd
+# import pytest
+# import tiledb
+# from cellarr import CellArrDataset, build_cellarrdataset
+
+# __author__ = "Jayaram Kancherla"
+# __copyright__ = "Jayaram Kancherla"
+# __license__ = "MIT"
+
+
+# def generate_adata(n, d, k):
+#     np.random.seed(1)
+
+#     z = np.random.normal(loc=np.arange(k), scale=np.arange(k) * 2, size=(n, k))
+#     w = np.random.normal(size=(d, k))
+#     y = np.dot(z, w.T)
+
+#     gene_index = [f"gene_{i+1}" for i in range(d)]
+#     var_df = pd.DataFrame({"names": gene_index}, index=gene_index)
+#     obs_df = pd.DataFrame({"cells": [f"cell1_{j+1}" for j in range(n)]})
+
+#     adata = anndata.AnnData(layers={"counts": y}, var=var_df, obs=obs_df)
+
+#     return adata
+
+
+# def test_query_cellarrdataset():
+#     tempdir = tempfile.mkdtemp()
+
+#     adata1 = generate_adata(1000, 100, 10)
+#     adata2 = generate_adata(100, 1000, 100)
+
+#     dataset = build_cellarrdataset(
+#         output_path=tempdir, files=[adata1, adata2], matrix_dim_dtype=np.float32
+#     )
+
+#     assert dataset is not None
+#     assert isinstance(dataset, CellArrDataset)
+
+#     cfp = tiledb.open(f"{tempdir}/counts", "r")
+#     gfp = tiledb.open(f"{tempdir}/gene_metadata", "r")
+
+#     cd = CellArrDataset(dataset_path=tempdir)
+
+#     gene_list = ["gene_1", "gene_95", "gene_50"]
+#     result1 = cd[0, gene_list]
+
+#     assert np.allclose(
+#         cfp.multi_index[0, gene_indices_tdb]["counts"],
+#         adata1.layers["counts"][0, adata1_gene_indices],
+#     )
+#     assert np.allclose(
+#         cfp.multi_index[1000, gene_indices_tdb]["counts"],
+#         adata2.layers["counts"][0, adata2_gene_indices],
+#     )