Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AnnData Conversion Notebook #1079

Merged
merged 25 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
13f03a4
test notebook
srivarra Oct 24, 2023
f0e1c0f
Merge branch 'main' into anndata-conversion-fov0
srivarra Oct 25, 2023
10e7f29
testing conversion workflow
srivarra Nov 7, 2023
6ca8f2b
Merge branch 'main' into anndata-conversion-fov0
srivarra Nov 7, 2023
3112049
tests finalized
srivarra Nov 29, 2023
fca5491
Merge branch 'main' into anndata-conversion-fov0
srivarra Nov 29, 2023
b9e7bd6
notebook updated
srivarra Nov 30, 2023
18fdeb6
added dask, torchdata deps, updated notebook
srivarra Nov 30, 2023
009385c
convert settings.CELL_SIZE to 'area' for AnnData
srivarra Nov 30, 2023
13ef3ff
updated docs/conf.py with new deps
srivarra Nov 30, 2023
a65107b
replaced | with Union / Optinal[<type>]
srivarra Nov 30, 2023
969fa24
replaced | with Union / Optional[<type>] in test_utils
srivarra Nov 30, 2023
4f3e728
added zarr as dependency
srivarra Nov 30, 2023
eb11059
small notebook fixes
srivarra Nov 30, 2023
ddaeef6
made requested changes
srivarra Dec 8, 2023
2287eec
Merge branch 'main' into anndata-conversion-fov0
srivarra Dec 8, 2023
84360bb
fixed docs
srivarra Dec 8, 2023
b6c8a25
do i look i know what a 'qhull v Qbb Qz Qc' is?
srivarra Dec 8, 2023
81e097e
replaced svg with png
srivarra Dec 8, 2023
ab98846
fixed image in nb, specified x,y, and the numpy coordinate system in …
srivarra Dec 8, 2023
c51d230
undid formatting of 'test_utils'
srivarra Dec 11, 2023
3a192e2
pycodestyle
srivarra Dec 11, 2023
8885b3d
Update src/ark/utils/data_utils.py
srivarra Dec 13, 2023
1512049
Merge branch 'main' into anndata-conversion-fov0
srivarra Dec 13, 2023
21d8c61
updated data_types
srivarra Dec 18, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@
'feather',
'google',
'h5py',
'dask',
'distributed',
'anndata',
'torchdata',
'ipywidgets',
'natsort',
'numba',
Expand All @@ -98,7 +102,8 @@
'mpl_toolkits',
'tqdm',
'ark.utils._bootstrapping',
'xmltodict']
'xmltodict',
'zarr',]

# prefix each section label with the name of the document it is in, followed by a colon
# autosection_label_prefix_document = True
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ build-backend = "setuptools.build_meta"
[project]
dependencies = [
"alpineer==0.1.10",
"anndata",
"Cython>=0.29,<1",
"dask[distributed]",
"datasets>=2.6,<3.0",
"dill>=0.3.5,<0.4",
"feather-format>=0.4.1,<1",
Expand All @@ -29,16 +31,20 @@ dependencies = [
"requests>=2.20,<3",
"scikit-image<=0.19.3",
"scikit-learn>=1.1,<2",
"graphviz",
"scipy>=1.7,<2",
"seaborn>=0.12,<1",
"spatial-lda>=0.1.3,<1",
"statsmodels>=0.13.2,<1",
"squidpy",
"tifffile>=2022",
"torchdata",
"tqdm>=4,<5",
"umap-learn>=0.5,<1.0",
"xarray>=2022",
"xmltodict>=0.13.0,<1",
"zstandard>=0.19.0,<1",
"zarr",
"ark-analysis[colors]",
]
name = "ark-analysis"
Expand Down
213 changes: 211 additions & 2 deletions src/ark/utils/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
import os
import pathlib
import re
from typing import List, Union
from typing import List, Literal, Union, Sequence

from numpy.typing import ArrayLike, DTypeLike
from numpy import ma
import feather
Expand All @@ -17,6 +18,19 @@
import xarray as xr
from ark import settings
from skimage.segmentation import find_boundaries
import dask.dataframe as dd
from dask import delayed
from anndata import AnnData, read_zarr
from anndata.experimental import AnnCollection
from anndata.experimental.multi_files._anncollection import ConvertType
from tqdm.dask import TqdmCallback
from torchdata.datapipes.iter import IterDataPipe
from typing import Iterator, Optional
try:
from typing import TypedDict, Unpack
except ImportError:
from typing_extensions import TypedDict, Unpack



def save_fov_mask(fov, data_dir, mask_data, sub_dir=None, name_suffix=''):
Expand Down Expand Up @@ -177,7 +191,6 @@ def fov_mapping(self, fov: str) -> pd.DataFrame:
"""
misc_utils.verify_in_list(requested_fov=[fov], all_fovs=self.unique_fovs)
fov_data: pd.DataFrame = self.mapping[self.mapping[self.fov_column] == fov]

return fov_data.reset_index(drop=True)

@property
Expand Down Expand Up @@ -792,3 +805,199 @@ def stitch_images_by_shape(data_dir, stitched_dir, img_sub_folder=None, channels
current_img = stitched_data.loc['stitched_image', :, :, chan].values
image_utils.save_image(os.path.join(stitched_subdir, chan + '_stitched' + file_ext),
current_img)


@delayed
def _convert_ct_fov_to_adata(fov_dd: dd.DataFrame, var_names: list[str], obs_names: list[str],
                             save_dir=None) -> str:
    """Converts the cell table for a single FOV to an `AnnData` object and saves it to disk as a
    `Zarr` store.

    Parameters
    ----------
    fov_dd : dd.DataFrame
        The cell table subset on a single FOV.
    var_names: list[str]
        The marker names to extract from the cell table.
    obs_names: list[str]
        The cell-level measurements and properties to extract from the cell table.
    save_dir: str | os.PathLike
        The directory to save the `AnnData` object to. Must be provided even though the
        signature defaults it to `None`.

    Returns
    -------
    str
        The path of the saved `AnnData` object.

    Raises
    ------
    TypeError
        If `save_dir` is `None`.
    """
    # Fail before doing any work; otherwise the same TypeError only surfaces when the
    # output path is built at the very end.
    if save_dir is None:
        raise TypeError("save_dir must be a str or os.PathLike, not None")

    fov_dd = fov_dd.sort_values(by=settings.CELL_LABEL, key=ns.natsort_key).reset_index()
    fov_id: str = fov_dd[settings.FOV_ID].iloc[0]

    # Set the index to be the FOV and the segmentation label to create a unique index
    fov_dd.index = [f"{fov_id}_{int(label)}" for label in fov_dd[settings.CELL_LABEL]]

    # Extract the X matrix
    X_dd = fov_dd[var_names]

    # Extract the obs dataframe and convert the cell label to integer
    obs_dd = fov_dd[obs_names].astype({settings.CELL_LABEL: int, settings.FOV_ID: str})

    # Cell tables without cluster assignments have no such column; only convert it if present.
    if "cell_meta_cluster" in obs_dd.columns:
        obs_dd["cell_meta_cluster"] = pd.Categorical(obs_dd["cell_meta_cluster"].astype(str))

    # Move centroids from obs to obsm["spatial"]
    # NOTE(review): CENTROID_0 is named "centroid_x" here; if CENTROID_0 is the row (axis-0)
    # coordinate the x/y names may be swapped -- confirm against the settings module.
    centroid_cols = [settings.CENTROID_0, settings.CENTROID_1]
    obsm_dd = obs_dd[centroid_cols].rename(
        columns={settings.CENTROID_0: "centroid_x", settings.CENTROID_1: "centroid_y"}
    )
    obs_dd = obs_dd.drop(columns=centroid_cols)

    # Create the AnnData object
    adata: AnnData = AnnData(X=X_dd, obs=obs_dd, obsm={"spatial": obsm_dd})

    # Convert any extra string labels to categorical if it's beneficial.
    adata.strings_to_categoricals()

    # Build the output path once and reuse it for both the write and the return value.
    zarr_path = pathlib.Path(save_dir, f"{fov_id}.zarr")
    adata.write_zarr(zarr_path, chunks=(1000, 1000))
    return zarr_path.as_posix()


class ConvertToAnnData:
    """ A class which converts the Cell Table `.csv` file to a series of `AnnData` objects,
    one object per FOV.

    The default parameters stored in `.obs` include the cell label, an `"area"` column (the
    cell table's own `"area"` column if it has one, otherwise `settings.CELL_SIZE` renamed to
    `"area"`), and every cell table column located after `settings.POST_CHANNEL_COL`.

    Args:
        cell_table_path (os.PathLike): The path to the cell table.
        markers (list[str], optional): The markers to extract and store in `.X`. Defaults to None,
            which will extract all markers.
        extra_obs_parameters (list[str], optional): Extra parameters to load in `.obs`.
            Defaults to None.
    """

    def __init__(self, cell_table_path: os.PathLike,
                 markers: Optional[list[str]] = None,
                 extra_obs_parameters: Optional[list[str]] = None) -> None:

        io_utils.validate_paths(paths=cell_table_path)

        # Read in the cell table
        cell_table: dd.DataFrame = dd.read_csv(cell_table_path)
        ct_columns = cell_table.columns

        # Get the marker column indices
        marker_index_start: int = ct_columns.get_loc(settings.PRE_CHANNEL_COL) + 1
        marker_index_stop: int = ct_columns.get_loc(settings.POST_CHANNEL_COL)
        obs_index_start: int = ct_columns.get_loc(settings.POST_CHANNEL_COL) + 1

        all_markers: list[str] = ct_columns[marker_index_start:marker_index_stop].to_list()
        all_obs: list[str] = ct_columns[obs_index_start:].to_list()

        if not markers:
            # Default to all markers based on settings Pre and Post channel column values
            markers = all_markers
        else:
            # Verify that the correct markers exist
            misc_utils.verify_in_list(requested_markers=markers, all_markers=all_markers)
        self.var_names = markers

        # Verify extra obs parameters
        if extra_obs_parameters:
            misc_utils.verify_in_list(requested_parameters=extra_obs_parameters,
                                      all_parameters=all_obs)
        else:
            extra_obs_parameters = []

        # The extra parameters are validated against `all_obs`, which is already included
        # below, so drop repeats (keeping first-seen order) to avoid selecting a column twice.
        obs_names = list(dict.fromkeys(
            [settings.CELL_LABEL, settings.CELL_SIZE, *all_obs, *extra_obs_parameters]
        ))

        # Use "area" as the default area id instead of settings.CELL_SIZE to account for
        # non-cellular observations (ez_seg, fiber, etc...)
        if ("area" in obs_names) and (settings.CELL_SIZE in obs_names):
            obs_names.remove(settings.CELL_SIZE)
        elif ("area" not in obs_names) and (settings.CELL_SIZE in obs_names):
            cell_table = cell_table.rename(columns={settings.CELL_SIZE: "area"})
            obs_names.remove(settings.CELL_SIZE)
            obs_names.append("area")

        self.obs_names: list[str] = obs_names
        self.cell_table = cell_table

    def convert_to_adata(
        self,
        save_dir: os.PathLike,
    ) -> dict[str, str]:
        """Converts the cell table to a FOV-level `AnnData` object, and saves the results as
        a `Zarr` store to disk in the `save_dir`.

        Args:
            save_dir (os.PathLike): The directory to save the `AnnData` objects to. It is
                created (including parents) if it does not exist.

        Returns:
            dict[str, str]: A dictionary containing the names of the FOVs and the paths where
            they were saved.
        """
        save_dir = pathlib.Path(save_dir)
        # `exist_ok=True` already covers the directory being present, so no pre-check is needed.
        save_dir.mkdir(parents=True, exist_ok=True)

        with TqdmCallback(desc="Converting to AnnData"):
            g: pd.Series = (
                self.cell_table.groupby(by=settings.FOV_ID, sort=True)
                .apply(
                    _convert_ct_fov_to_adata,
                    var_names=self.var_names,
                    obs_names=self.obs_names,
                    save_dir=save_dir,
                    meta=("anndata_save_results", str),
                )
            ).compute()

        return g.to_dict()


class AnnCollectionKwargs(TypedDict, total=False):
    """Typed description of the keyword arguments forwarded to `AnnCollection`.

    Used with `Unpack` to annotate `**anncollection_kwargs` in `load_anndatas`. Declared with
    `total=False` so that every key is optional for type checkers; with the default totality
    each keyword would be reported as required at every call site.
    """
    join_obs: Optional[Literal["inner", "outer"]]
    join_obsm: Optional[Literal["inner"]]
    join_vars: Optional[Literal["inner"]]
    label: Optional[str]
    keys: Optional[Sequence[str]]
    index_unique: Optional[str]
    convert: Optional[ConvertType]
    harmonize_dtypes: bool
    indices_strict: bool


def load_anndatas(anndata_dir: os.PathLike,
                  **anncollection_kwargs: Unpack[AnnCollectionKwargs]) -> AnnCollection:
    """Lazily loads a directory of `AnnData` objects into an `AnnCollection`. The concatenation
    happens across the `.obs` axis.

    For `AnnCollection` kwargs, see
    https://anndata.readthedocs.io/en/latest/generated/anndata.experimental.AnnCollection.html

    Args:
        anndata_dir (os.PathLike): The directory containing the `AnnData` objects.
        **anncollection_kwargs: Keyword arguments passed through to `AnnCollection`.

    Returns:
        AnnCollection: The `AnnCollection` containing the `AnnData` objects.
    """
    root = anndata_dir if isinstance(anndata_dir, pathlib.Path) else pathlib.Path(anndata_dir)

    # Visit the `*.zarr` stores in natural sort order, keying each one by its file stem.
    stores = {}
    for zarr_path in ns.natsorted(root.glob("*.zarr")):
        stores[zarr_path.stem] = read_zarr(zarr_path)

    return AnnCollection(adatas=stores, **anncollection_kwargs)


class AnnDataIterDataPipe(IterDataPipe):
    """The TorchData Iterable-style DataPipe. Takes an `AnnCollection`
    and makes it iterable by FOV for easy and flexible data pipelines.

    Args:
        fovs (AnnCollection): The `AnnCollection` containing the `AnnData` objects.
    """

    def __init__(self, fovs: AnnCollection):
        # Goes through the `fovs` property setter, which stores the value on `_fovs`.
        self.fovs = fovs

    @property
    def fovs(self) -> AnnCollection:
        return self._fovs

    @fovs.setter
    def fovs(self, value: AnnCollection) -> None:
        self._fovs = value

    def __iter__(self) -> Iterator[AnnData]:
        # Yield each entry of the collection's `adatas` in turn.
        for fov_adata in self.fovs.adatas:
            yield fov_adata
784 changes: 784 additions & 0 deletions templates/anndata_conversion.ipynb

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions tests/analysis/dimensionality_reduction_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

def test_plot_dim_reduced_data():
# this only tests errors, test_dimensionality_reduction tests the meat of this function
random_cell_data = test_utils.make_cell_table(50)
random_cell_data = test_utils.make_cell_table(n_cells=50, n_markers=10)

with pytest.raises(FileNotFoundError):
# trying to save to a non-existant directory
Expand All @@ -34,8 +34,9 @@ def test_plot_dim_reduced_data():


def test_dimensionality_reduction():
random_cell_data = test_utils.make_cell_table(50)
test_cols = test_utils.TEST_MARKERS
n_markers = 4
random_cell_data = test_utils.make_cell_table(n_cells=50, n_markers=n_markers)
test_cols = [f"marker_{i}" for i in range(n_markers)]

test_algorithms = ['PCA', 'tSNE', 'UMAP']

Expand Down
Loading
Loading