angelolab · srivarra · Dec 19, 2023 · Oct 24, 2023 · Oct 25, 2023 · Nov 7, 2023
diff --git a/docs/conf.py b/docs/conf.py
@@ -72,6 +72,10 @@
                         'feather',
                         'google',
                         'h5py',
+                        'dask',
+                        'distributed',
+                        'anndata',
+                        'torchdata',
                         'ipywidgets',
                         'natsort',
                         'numba',
@@ -98,7 +102,8 @@
                         'mpl_toolkits',
                         'tqdm',
                         'ark.utils._bootstrapping',
-                        'xmltodict']
+                        'xmltodict',
+                        'zarr',]
 
 # prefix each section label with the name of the document it is in, followed by a colon
 # autosection_label_prefix_document = True

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,9 @@ build-backend = "setuptools.build_meta"
 [project]
 dependencies = [
     "alpineer==0.1.10",
+    "anndata",
     "Cython>=0.29,<1",
+    "dask[distributed]",
     "datasets>=2.6,<3.0",
     "dill>=0.3.5,<0.4",
     "feather-format>=0.4.1,<1",
@@ -29,16 +31,20 @@ dependencies = [
     "requests>=2.20,<3",
     "scikit-image<=0.19.3",
     "scikit-learn>=1.1,<2",
+    "graphviz",
     "scipy>=1.7,<2",
     "seaborn>=0.12,<1",
     "spatial-lda>=0.1.3,<1",
     "statsmodels>=0.13.2,<1",
+    "squidpy",
     "tifffile>=2022",
+    "torchdata",
     "tqdm>=4,<5",
     "umap-learn>=0.5,<1.0",
     "xarray>=2022",
     "xmltodict>=0.13.0,<1",
     "zstandard>=0.19.0,<1",
+    "zarr",
     "ark-analysis[colors]",
 ]
 name = "ark-analysis"

diff --git a/src/ark/utils/data_utils.py b/src/ark/utils/data_utils.py
@@ -3,7 +3,8 @@
 import os
 import pathlib
 import re
-from typing import List, Union
+from typing import List, Literal, Union, Sequence
+
 from numpy.typing import ArrayLike, DTypeLike
 from numpy import ma
 import feather
@@ -17,6 +18,19 @@
 import xarray as xr
 from ark import settings
 from skimage.segmentation import find_boundaries
+import dask.dataframe as dd
+from dask import delayed
+from anndata import AnnData, read_zarr
+from anndata.experimental import AnnCollection
+from anndata.experimental.multi_files._anncollection import ConvertType
+from tqdm.dask import TqdmCallback
+from torchdata.datapipes.iter import IterDataPipe
+from typing import Iterator, Optional
+try:
+    from typing import TypedDict, Unpack
+except ImportError:
+    from typing_extensions import TypedDict, Unpack
+
 
 
 def save_fov_mask(fov, data_dir, mask_data, sub_dir=None, name_suffix=''):
@@ -177,7 +191,6 @@ def fov_mapping(self, fov: str) -> pd.DataFrame:
         """
         misc_utils.verify_in_list(requested_fov=[fov], all_fovs=self.unique_fovs)
         fov_data: pd.DataFrame = self.mapping[self.mapping[self.fov_column] == fov]
-
         return fov_data.reset_index(drop=True)
 
     @property
@@ -792,3 +805,199 @@ def stitch_images_by_shape(data_dir, stitched_dir, img_sub_folder=None, channels
         current_img = stitched_data.loc['stitched_image', :, :, chan].values
         image_utils.save_image(os.path.join(stitched_subdir, chan + '_stitched' + file_ext),
                                current_img)
+
+
+@delayed
+def _convert_ct_fov_to_adata(fov_dd: dd.DataFrame, var_names: list[str], obs_names: list[str], save_dir=None) -> str:
+    """Converts the cell table for a single FOV to an `AnnData` object and saves it to disk as a
+    `Zarr` store.
+
+    Parameters
+    ----------
+    fov_dd : dd.DataFrame
+        The cell table subset on a single FOV.
+    var_names: list[str]
+        The marker names to extract from the cell table.
+    obs_names: list[str]
+        The cell-level measurements and properties to extract from the cell table.
+    save_dir: str | os.PathLike
+        The directory to save the `AnnData` object to.
+
+    Returns
+    -------
+    str
+        The path of the saved `AnnData` object.
+    """
+
+    fov_dd: dd.DataFrame = fov_dd.sort_values(by=settings.CELL_LABEL, key=ns.natsort_key).reset_index()
+    fov_id: str = fov_dd[settings.FOV_ID].iloc[0]
+
+    # Set the index to be the FOV and the segmentation label to create a unique index
+    fov_dd.index = list(map(lambda label: f"{fov_id}_{int(label)}", fov_dd[settings.CELL_LABEL]))
+
+    # Extract the X matrix
+    X_dd: dd.DataFrame = fov_dd[var_names]
+
+    # Extract the obs dataframe and convert the cell label to integer
+    obs_dd: dd.DataFrame = fov_dd[obs_names].astype({settings.CELL_LABEL: int, settings.FOV_ID: str})
+    obs_dd["cell_meta_cluster"] = pd.Categorical(obs_dd["cell_meta_cluster"].astype(str))
+
+    # Move centroids from obs to obsm["spatial"]
+    obsm_dd = obs_dd[[settings.CENTROID_0, settings.CENTROID_1]].rename(columns={settings.CENTROID_0: "centroid_x", settings.CENTROID_1: "centroid_y"})
+    obs_dd = obs_dd.drop(columns=[settings.CENTROID_0, settings.CENTROID_1])
+
+    # Create the AnnData object
+    adata: AnnData = AnnData(X=X_dd, obs=obs_dd, obsm={"spatial": obsm_dd})
+
+    # Convert any extra string labels to categorical if it's beneficial.
+    adata.strings_to_categoricals()
+
+    adata.write_zarr(pathlib.Path(save_dir, f"{fov_id}.zarr"), chunks=(1000, 1000))
+    return pathlib.Path(save_dir, f"{fov_id}.zarr").as_posix()
+
+
+class ConvertToAnnData:
+    """ A class which converts the Cell Table `.csv` file to a series of `AnnData` objects,
+    one object per FOV.
+
+    The default parameters stored in `.obs` parameters include
+
+    Args:
+        cell_table_path (os.PathLike): The path to the cell table.
+        markers (list[str], optional): The markers to extract and store in `.X`. Defaults to None,
+        which will extract all markers.
+        extra_obs_parameters (list[str], optional): Extra parameters to load in `.obs`. Defaults to None.
+    """
+
+    def __init__(self, cell_table_path: os.PathLike,
+                 markers: list[str] = None,
+                 extra_obs_parameters: list[str] = None) -> None:
+
+        io_utils.validate_paths(paths=cell_table_path)
+
+
+        # Read in the cell table
+        cell_table: dd.DataFrame = dd.read_csv(cell_table_path)
+        ct_columns = cell_table.columns
+
+        # Get te marker column indices
+        marker_index_start: int = ct_columns.get_loc(settings.PRE_CHANNEL_COL) + 1
+        marker_index_stop: int = ct_columns.get_loc(settings.POST_CHANNEL_COL)
+        obs_index_start: int = ct_columns.get_loc(settings.POST_CHANNEL_COL) + 1
+
+        if not markers:
+            # Default to all markers based on settings Pre and Post channel column values
+            markers: list[str] = ct_columns[marker_index_start:marker_index_stop].to_list()
+        else:
+            # Verify that the correct markers exist
+            misc_utils.verify_in_list(requested_markers=markers, 
+                                    all_markers=ct_columns[marker_index_start:marker_index_stop].to_list())
+        self.var_names = markers
+
+        # Verify extra obs parameters
+        if extra_obs_parameters:
+            misc_utils.verify_in_list(requested_parameters=extra_obs_parameters, 
+                                    all_parameters=ct_columns[obs_index_start:].to_list())
+        else:
+            extra_obs_parameters = []
+        obs_names = [settings.CELL_LABEL, settings.CELL_SIZE, *ct_columns[obs_index_start:].to_list(), *extra_obs_parameters]
+
+        # Use "area" as the default area id instead of settings.CELL_SIZE to account for
+        # non-cellular observations (ez_seg, fiber, etc...)
+        if ("area" in obs_names) and (settings.CELL_SIZE in obs_names):
+            obs_names.remove(settings.CELL_SIZE)
+        elif ("area" not in obs_names) and (settings.CELL_SIZE in obs_names):
+            cell_table = cell_table.rename(columns={settings.CELL_SIZE: "area"})
+            obs_names.remove(settings.CELL_SIZE)
+            obs_names.append("area")
+        self.obs_names: list[str] = obs_names
+        self.cell_table = cell_table
+
+
+    def convert_to_adata(
+        self,
+        save_dir: os.PathLike,
+    ) -> dict[str, str]:
+        """Converts the cell table to a FOV-level `AnnData` object, and saves the results as
+        a `Zarr` store to disk in the `save_dir`.
+
+        Args:
+            save_dir (os.PathLike): The directory to save the `AnnData` objects to.
+
+        Returns:
+            dict[str, str]: A dictionary containing the names of the FOVs and the paths where
+            they were saved.
+        """
+
+        if not isinstance(save_dir, pathlib.Path):
+            save_dir = pathlib.Path(save_dir)
+        if not save_dir.exists():
+            save_dir.mkdir(parents=True, exist_ok=True)
+
+
+        with TqdmCallback(desc="Converting to AnnData"):
+            g: pd.Series = (
+                self.cell_table.groupby(by=settings.FOV_ID, sort=True)
+                .apply(
+                    _convert_ct_fov_to_adata,
+                    var_names=self.var_names,
+                    obs_names=self.obs_names,
+                    save_dir=save_dir,
+                    meta=("anndata_save_results", str),
+                )
+            ).compute()
+
+        return g.to_dict()
+
+
+class AnnCollectionKwargs(TypedDict):
+    join_obs: Optional[Literal["inner", "outer"]]
+    join_obsm: Optional[Literal["inner"]]
+    join_vars: Optional[Literal["inner"]]
+    label: Optional[str]
+    keys: Optional[Sequence[str]]
+    index_unique: Optional[str]
+    convert: Optional[ConvertType]
+    harmonize_dtypes: bool
+    indices_strict: bool
+
+
+def load_anndatas(anndata_dir: os.PathLike, **anncollection_kwargs: Unpack[AnnCollectionKwargs]) -> AnnCollection:
+    """Lazily loads a directory of `AnnData` objects into an `AnnCollection`. The concatination happens across the `.obs` axis.
+
+    For `AnnCollection` kwargs, see https://anndata.readthedocs.io/en/latest/generated/anndata.experimental.AnnCollection.html
+
+    Args:
+        anndata_dir (os.PathLike): The directory containing the `AnnData` objects.
+
+    Returns:
+        AnnCollection: The `AnnCollection` containing the `AnnData` objects.
+    """
+    if not isinstance(anndata_dir, pathlib.Path):
+        anndata_dir = pathlib.Path(anndata_dir)
+
+    adata_zarr_stores = {f.stem: read_zarr(f) for f in ns.natsorted(anndata_dir.glob("*.zarr"))}
+    return AnnCollection(adatas=adata_zarr_stores, **anncollection_kwargs)
+
+
+class AnnDataIterDataPipe(IterDataPipe):
+    """The TorchData Iterable-style DataPipe. Takes an `AnnCollection`
+    and makes it iterable by FOV for easy and flexible data pipelines.
+
+    Args:
+        fovs (AnnCollection): The `AnnCollection` containing the `AnnData` objects.
+    """
+
+    @property
+    def fovs(self) -> AnnCollection:
+        return self._fovs
+
+    @fovs.setter
+    def fovs(self, value: AnnCollection) -> None:
+        self._fovs: AnnCollection = value
+
+    def __init__(self, fovs: AnnCollection):
+        self.fovs = fovs
+
+    def __iter__(self) -> Iterator[AnnData]:
+        yield from self.fovs.adatas
diff --git a/templates/anndata_conversion.ipynb b/templates/anndata_conversion.ipynb
diff --git a/tests/analysis/dimensionality_reduction_test.py b/tests/analysis/dimensionality_reduction_test.py
@@ -10,7 +10,7 @@
 
 def test_plot_dim_reduced_data():
     # this only tests errors, test_dimensionality_reduction tests the meat of this function
-    random_cell_data = test_utils.make_cell_table(50)
+    random_cell_data = test_utils.make_cell_table(n_cells=50, n_markers=10)
 
     with pytest.raises(FileNotFoundError):
         # trying to save to a non-existant directory
@@ -34,8 +34,9 @@ def test_plot_dim_reduced_data():
 
 
 def test_dimensionality_reduction():
-    random_cell_data = test_utils.make_cell_table(50)
-    test_cols = test_utils.TEST_MARKERS
+    n_markers = 4
+    random_cell_data = test_utils.make_cell_table(n_cells=50, n_markers=n_markers)
+    test_cols = [f"marker_{i}" for i in range(n_markers)]
 
     test_algorithms = ['PCA', 'tSNE', 'UMAP']