angelolab · srivarra · Dec 19, 2023 · Oct 24, 2023 · Oct 25, 2023 · Nov 7, 2023
diff --git a/docs/_images/anndata_schema.png b/docs/_images/anndata_schema.png
diff --git a/docs/_rtd/data_types.md b/docs/_rtd/data_types.md
@@ -40,16 +40,17 @@ For each cell, the following morphology features calculated from `skimage.measur
 * `perimeter`: perimeter of object which approximates the contour as a line through the centers of border pixels using a 4-connectivity.
 * `convex_area`: the area of the convex hull.
 * `equivalent_diameter`: the diameter of the circle with the same area as the cell.
-* `centroid-0`: the $x$-coordinate of the centroid.
-* `centroid-1`: the $y$-coordinate of the centroid.
+* Centroids: Note that all the arrays are NumPy arrays, therefore the origin $(0,0)$ is in the "top-left corner" of the image / array.
+  * `centroid-0`: the $y$-coordinate of the centroid.
+  * `centroid-1`: the $x$-coordinate of the centroid.
 * `fov`: The FOV from which the cell originates from.
 
 The base `regionprops` metric often don't provide enough morphological information about each cell on their own. We add the following derived metrics to provide more complete information about the segmented cells:
 * `major_minor_axis_ratio`: the major axis length divided by the minor axis length.
 * `perim_square_over_area`: the square of the perimeter divided by the area. 
 * `major_axis_equiv_diam_ratio`: the major axis length divided by the equivalent diameter.
 * `convex_hull_resid`: the difference between the convex area and the area divided by the convex area.
-* `centroid_dif`: the normalized euclidian distance between the cell centroid and the corresponding convex hull centroid.
+* `centroid_dif`: the normalized euclidean distance between the cell centroid and the corresponding convex hull centroid.
 * `num_concavities`: the number of concavities of the region.
 * `nc_ratio`: for nuclear segmentation only. The nuclear area divided by the total area.
 
@@ -90,3 +91,70 @@ The CSV should contain the following columns
 * `fov`: name of the FOV the cell comes from
 * `label`: the name of the segmentation label
 * A set of expression columns defining the properties of each cell desired for clustering
+
+---
+
+Name: AnnData   
+Type: anndata.AnnData   
+Created by: [ConvertToAnnData](https://ark-analysis.readthedocs.io/en/latest/_markdown/ark.utils.html#ark.utils.data_utils.ConvertToAnnData)    
+Used by: [anndata_conversion.ipynb](https://github.com/angelolab/ark-analysis/blob/main/templates/anndata_conversion.ipynb)
+
+<p align="center">
+  <img width="50%" src="../_images/anndata_schema.png" alt="AnnData Schema"/>
+</p>
+
+
+`AnnData` is a data structure consisting of matrices, annotated by DataFrames and Indexes.
+
+A `AnnData` object is composed of the following components:
+
+- **X**
+- **var**
+- **obs**
+- **obsm**
+- **obsp**
+- **varm**
+- **varp**
+
+Each of these components have specific use cases and will be described below:
+
+#### 1. X, var, obs
+
+<p align="center">
+  <img width="50%" alt="image" src="https://github.com/angelolab/ark-analysis/assets/8909315/a5011077-d350-4aab-b8f8-609b11087bba">
+</p>
+
+- `X` is a matrix of shape `(n_obs, n_vars)` where `n_obs` is the number of observations and `n_vars` is the number of variables. 
+- `var` is a `DataFrame` of shape `(n_var_features, n_vars)`, where the index is `var_names`. This `DataFrame` contains attributes of each variable.
+- `obs` is a `DataFrame` of shape `(n_obs, n_obs_features)`, where the index is `obs_names`. This `DataFrame` contains information about each observation, such as numeric metrics from `regionprops` or categorical data such as cell phenotype, or patient-level information.
+- `n_obs` is the number of segmented regions or objects of interest. These can be cell segmentations, or more complex objects such as nuclei masks, or object masks. Whatever it is, it should be the smallest, most atomic unit of analysis.
+- `obs_names` is a `Pandas` Index where each value is a unique identifier for each observation. These are the names of the segmented regions, and should be unique.
+- `n_vars` is the number of variables, and in this case it is the number of channels. Each channel is a variable, and each observation has a value for each channel.
+- `var_names` is a `Pandas` Index where each value is a unique identifier for each variable. These are the names of the channels and should be unique.
+
+#### 2. obsm, varm
+
+<p align="center">
+  <img width="50%" alt="image" src="https://github.com/angelolab/ark-analysis/assets/8909315/8ea1c794-f80b-49a3-b814-3357d2718f7b">
+</p>
+
+- `obsm` is a Matrix of shape `(n_obs, a)`, where `a` is an integer. This contains observation level matrices, and we use a mapping `str -> NDArray` to store them. For example, `X_umap`
+would store the UMAP embedding of the sparse matrix `X`, and `X_pca` would store the PCA embedding of `X`.
+- `varm` is a Matrix of shape `(n_vars, b)`, where `b` is an integer. This contains variable level matrices, and we use a mapping `str -> NDArray` to store them. For example, `Marker_umap` would store the UMAP embedding of the matrix `var`.
+
+
+#### 3. obsp, varp
+
+<p align="center">
+  <img width="50%" alt="image" src="https://github.com/angelolab/ark-analysis/assets/8909315/2201f36d-3a7c-4154-8ab6-e875e9811eb4">
+</p>
+
+- `obsp` is a square matrix of shape `(n_obs, n_obs)`, and its purpose is to store pairwise computations between observations.
+- `varp` is a square matrix of shape `(n_vars, n_vars)`, and its purpose is to store pairwise computations between variables.
+#### 4. **uns**
+
+<p align="center">
+  <img width="303" alt="image" src="https://github.com/angelolab/ark-analysis/assets/8909315/881a2c63-3ea4-4874-b6d6-b0bc2532f283">
+</p>
+
+- `uns` is a free slot for storing *almost* anything. It's a mapping from a string label to anything.
diff --git a/docs/_rtd/development.md b/docs/_rtd/development.md
@@ -144,3 +144,189 @@ Finally, to save an `xarray` to a file, use:
 You can load the `xarray` back in using:
 
 `arr = xr.load_dataarray(path)`
+
+
+### Working with `AnnData`
+
+We can load a single `AnnData` object using the function `anndata.read_zarr`, and several `AnnData` objects using the function `load_anndatas` from `ark.utils.data_utils`.
+
+```python
+from anndata import read_zarr
+from ark.utils.data_utils import load_anndatas
+```
+
+```python
+fov0 = read_zarr("data/example_dataset/fov0.zarr")
+```
+
+The channel intensities for each observation in the `AnnData` object with the `.to_df()` method, and get the channel names with `.var_names`.
+
+```python
+fov0.var_names
+fov0.to_df()
+```
+
+The observations and their properties with the `obs` property of the `AnnData` object. The data here consists of measurements such as `area`, `perimeter`, and categorical information like `cell_meta_cluster` for each cell.
+
+```python
+fov0.obs
+```
+
+The $x$ and $y$ centroids of each cell can be accessed with the `obsm` attribute and the key `"spatial"`.
+
+```python
+fov0.obsm["spatial"]
+```
+
+We can load all the `AnnData` objects in a directory lazily with `load_anndatas`. We get a view of the `AnnData` objects in the directory.
+
+```python
+fovs_ac = load_anndatas(anndata_dir = "data/example_dataset/fov0.zarr")
+```
+
+We can utilize `AnnData` objects or `AnnCollections` in a similar way to a Pandas DataFrame. For example, we can filter the `AnnCollection` to only include cells that have a `cell_meta_cluster` label of `"CD4T"`.
+
+```python
+fovs_ac_cd4t = fovs_ac[fovs_ac.obs["cell_meta_cluster"] == "CD4T"]
+print(type(fovs_ac_cd4t))
+fovs_ac_cd4t.obs.df
+```
+The type of `fovs_ac_cd4t` is not an `AnnData` object, but instead an `AnnCollectionView`.
+This is a `view` of the subset of the `AnnCollection`. This object can *only* access `.obs`, `.obsm`, `.layers` and `.X`.
+
+
+We can subset a `AnnCollectionView` to only include the first $n$ observations objects with the following code. The slice based indexing behaves like a `numpy` array.
+
+```python
+n = 100
+fovs_ac_cdt4_100 = fovs_ac_cd4t[:n]
+fovs_ac_cd4t_100.obs.df
+```
+
+Often we will want to subset the `AnnCollection` to only include observations contained within a specific FOV.
+
+```python
+fov1_adata = fovs_ac[fovs_ac.obs["fov"] == "fov1"]
+
+fov1_adata.obs.df
+```
+
+We can loop over all FOVs in a `AnnCollection` with the following code (there is alternative method in ):
+
+```python
+all_fovs = fovs_ac.obs["fov"].unique()
+
+for fov in all_fovs:
+    fov_adata = fovs_ac[fovs_ac.obs["fov"] == fov]
+    # do something with fov_adata
+```
+
+Functions which take in `AnnData` objects can often be applied to `AnnCollections`.
+
+The following works as expected:
+```python
+def dist(adata):
+    x = adata.obsm["spatial"]["centroid_x"]
+    y = adata.obsm["spatial"]["centroid_y"]
+    return np.sqrt(x**2 + y**2)
+
+dist(fovs_ac)
+```
+
+While the example below does not:
+```python
+from squidpy import gr
+gr.spatial_neighbors(adata=fovs_ac, spatial_key="spatial")
+```
+
+This is due to a `AnnCollection` object not having a `uns` property.
+
+#### Utilizing `DataLoaders`
+
+While a `AnnCollection` can sometimes be used to apply a function over all FOVs, in some instances we either cannot do that, or perhaps we want to apply functions to each FOV independently.
+
+We can access the underlying `AnnData` objects with `.adatas`.
+```python
+fovs_ac.adatas
+```
+
+In these instances we can construct data pipelines with [`torchdata`](https://pytorch.org/data/beta/index.html).
+
+
+As an example, let's create a multi-stage `DataLoader` which does the following:
+- Only extracts the observations with an area greater than `300`.
+- Only extracts the observations which have a `cell_meta_cluster` label of `"CD4T"`.
+- Compute the Spatial Neighbors graph for those observations (using [`squidpy.gr.spatial_neighbors`](https://squidpy.readthedocs.io/en/stable/api/squidpy.gr.spatial_neighbors.html#squidpy.gr.spatial_neighbors)).
+
+
+In order to construct a `torchdata` [`DataLoader2`](https://pytorch.org/data/beta/dataloader2.html) iterator we first need to create a `torchdata` [`IterDataPipe`](https://pytorch.org/data/beta/torchdata.datapipes.iter.html). This implements the `__iter__()` protocol, and represents an iterable over data samples.
+
+We can convert the `AnnCollection` to a `torchdata` `IterDataPipe` with `ark.utils.data_utils.AnnDataIterDataPipe`.
+
+
+```python
+from ark.utils.data_utils import AnnDataIterDataPipe
+
+fovs_ip = AnnDataIterDataPipe(fovs=fovs_ac)
+```
+
+The following two functions are used to filter the observations in the `AnnData` objects 
+to only include cells with an area greater than `min_area` and cells with a `cell_meta_cluster` label of `cluster`.
+
+```python
+from anndata import AnnData
+
+def filter_cells_by_cluster(adata: AnnData, cluster_label: str) -> AnnData:
+    return adata[adata.obs["cell_meta_cluster"] == cluster_label]
+
+def filter_cells_by_area(adata: AnnData, min_area: int) -> AnnData:
+    return adata[adata.obs["area"] > min_area]
+```
+
+The following function is used to filter out `AnnData` objects which have no observations.
+```python
+def filter_empty_adata(fov: AnnData) -> bool:
+    return len(fov) > 0
+```
+
+
+We can apply these functions to the `IterDataPipe` with the `map` and the `filter` method.
+Because those methods return a new `IterDataPipe` object, we can chain them together.
+
+```python
+from functools import partial
+
+cd4t_obs_filter = partial(filter_cells_by_cluster, cluster_label="CD4T")
+area_obs_filter = partial(filter_cells_by_area, min_area=300)
+
+fovs_subset = fovs_ip.map(cd4t_obs_filter).map(area_obs_filter).filter(filter_empty_adata)
+```
+
+The data pipeline can be visualized with `to_graph` function.
+
+```python
+from torchdata.datapipes.utils import to_graph
+
+to_graph(fovs_subset)
+```
+
+
+The `DataLoader` can now be constructed.
+```python
+from torchdata.dataloader2.dataloader2 import DataLoader2
+
+fovs_subset_dl = DataLoader2(fovs_subset)
+```
+
+We can now loop over the `DataLoader` and compute the Spatial Neighbors graph per FOV with the filtered observations.
+
+```python
+for fov in fovs_subset_dl:
+    gr.spatial_neighbors(adata=fov, radius=350, spatial_key="spatial", coord_type="generic")
+```
+
+#### Further Reading
+- [Official AnnData Documentation](https://anndata.readthedocs.io/en/latest/)
+  - [Getting Started Tutorial](https://anndata.readthedocs.io/en/latest/tutorials/notebooks/getting-started.html)
+- [Converting from Single Cell Experiment and Seurat Objects](https://scanpy.readthedocs.io/en/stable/tutorials.html#conversion-anndata-singlecellexperiment-and-seurat-objects)
+- [MuData - Multimodal AnnData](https://mudata.readthedocs.io/en/latest/index.html)
diff --git a/docs/conf.py b/docs/conf.py
@@ -72,6 +72,10 @@
                         'feather',
                         'google',
                         'h5py',
+                        'dask',
+                        'distributed',
+                        'anndata',
+                        'torchdata',
                         'ipywidgets',
                         'natsort',
                         'numba',
@@ -98,7 +102,8 @@
                         'mpl_toolkits',
                         'tqdm',
                         'ark.utils._bootstrapping',
-                        'xmltodict']
+                        'xmltodict',
+                        'zarr',]
 
 # prefix each section label with the name of the document it is in, followed by a colon
 # autosection_label_prefix_document = True

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,7 +10,9 @@ build-backend = "setuptools.build_meta"
 [project]
 dependencies = [
     "alpineer==0.1.10",
+    "anndata",
     "Cython>=0.29,<1",
+    "dask[distributed]",
     "datasets>=2.6,<3.0",
     "dill>=0.3.5,<0.4",
     "feather-format>=0.4.1,<1",
@@ -29,16 +31,20 @@ dependencies = [
     "requests>=2.20,<3",
     "scikit-image<=0.19.3",
     "scikit-learn>=1.1,<2",
+    "graphviz",
     "scipy>=1.7,<2",
     "seaborn>=0.12,<1",
     "spatial-lda>=0.1.3,<1",
     "statsmodels>=0.13.2,<1",
+    "squidpy",
     "tifffile>=2022",
+    "torchdata",
     "tqdm>=4,<5",
     "umap-learn>=0.5,<1.0",
     "xarray>=2022",
     "xmltodict>=0.13.0,<1",
     "zstandard>=0.19.0,<1",
+    "zarr",
     "ark-analysis[colors]",
 ]
 name = "ark-analysis"