Merge pull request #111 from gustaveroussy/dev

Dev
gustaveroussy · Aug 21, 2024 · f152ab1 · f152ab1
2 parents 18da9a2 + d440a10
commit f152ab1
Show file tree

Hide file tree

Showing 11 changed files with 158 additions and 23 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,17 +1,26 @@
 ## [1.x.x] - 2024-xx-xx
 
+## [1.1.4] - 2024-08-21
+
+### Hotfix
+- Fixed Baysor issue on MERSCOPE data with the Vizgen prior
+- Fix patch-maker issue due to new API temporary implementation
+
+
+## [1.1.3] - 2024-08-18
+
 ### Fix
 - Fixed aggregation issue when gene names are `NaN` or `None` (#101)
 - Fix Xenium reader for old Xenium data format (#105)
 
 ### Added
 - Support multipolygons in ROI rasterization
 - Added bins aggregation
-- Added Visium HD reader (tutorial comming soon)
+- Added Visium HD reader (tutorial coming soon)
 
 ### Changed
 - Import submodules in init (segmentation, io, utils)
-- API simplification in progress (new API + tutorial comming soon)
+- API simplification in progress (new API + tutorial coming soon)
 
 ## [1.1.2] - 2024-07-24
 

diff --git a/docs/api/segmentation/aggregate.md b/docs/api/segmentation/aggregate.md
@@ -1,11 +1,11 @@
-::: sopa.segmentation.aggregate.overlay_segmentation
+::: sopa.segmentation.aggregation.overlay_segmentation
 
-::: sopa.segmentation.aggregate.average_channels
+::: sopa.segmentation.aggregation.average_channels
 
-::: sopa.segmentation.aggregate._average_channels_aligned
+::: sopa.segmentation.aggregation._average_channels_aligned
 
-::: sopa.segmentation.aggregate.count_transcripts
+::: sopa.segmentation.aggregation.count_transcripts
 
-::: sopa.segmentation.aggregate._count_transcripts_aligned
+::: sopa.segmentation.aggregation._count_transcripts_aligned
 
-::: sopa.segmentation.aggregate.Aggregator
+::: sopa.segmentation.aggregation.Aggregator
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sopa"
-version = "1.1.3"
+version = "1.1.4"
 description = "Spatial-omics pipeline and analysis"
 documentation = "https://gustaveroussy.github.io/sopa"
 homepage = "https://gustaveroussy.github.io/sopa"

diff --git a/sopa/_constants.py b/sopa/_constants.py
@@ -58,6 +58,7 @@ class ROI:
 
 class SopaFiles:
     SOPA_CACHE_DIR = ".sopa_cache"
+    TRANSCRIPT_TEMP_DIR = "transcript_patches"
     PATCHES_FILE_IMAGE = "patches_file_image"
     PATCHES_DIRS_BAYSOR = "patches_file_baysor"
     PATCHES_DIRS_COMSEG = "patches_file_comseg"

diff --git a/sopa/_sdata.py b/sopa/_sdata.py
@@ -1,18 +1,19 @@
 from __future__ import annotations
 
 import logging
-from typing import Iterator
+from pathlib import Path
+from typing import Any, Iterator
 
 import geopandas as gpd
 import pandas as pd
-import xarray as xr
+from anndata import AnnData
 from datatree import DataTree
 from spatialdata import SpatialData
 from spatialdata.models import SpatialElement
 from spatialdata.transformations import Identity, get_transformation, set_transformation
 from xarray import DataArray
 
-from ._constants import SopaAttrs, SopaKeys
+from ._constants import SopaAttrs, SopaFiles, SopaKeys
 
 log = logging.getLogger(__name__)
 
@@ -112,14 +113,14 @@ def get_intensities(sdata: SpatialData) -> pd.DataFrame | None:
     return adata.to_df()
 
 
-def iter_scales(image: DataTree) -> Iterator[xr.DataArray]:
+def iter_scales(image: DataTree) -> Iterator[DataArray]:
     """Iterates through all the scales of a `DataTree`
 
     Args:
         image: a `DataTree`
 
     Yields:
-        Each scale (as a `xr.DataArray`)
+        Each scale (as a `DataArray`)
     """
     assert isinstance(image, DataTree), f"Multiscale iteration is reserved for type DataTree. Found {type(image)}"
 
@@ -154,22 +155,42 @@ def get_spatial_element(
     if len(element_dict) == 1:
         key = next(iter(element_dict.keys()))
 
-        assert valid_attr is None or element_dict[key].attrs.get(
+        assert valid_attr is None or _get_spatialdata_attrs(element_dict[key]).get(
             valid_attr, True
         ), f"Element {key} is not valid for the attribute {valid_attr}."
 
         return _return_element(element_dict, key, return_key, as_spatial_image)
 
     assert valid_attr is not None, "Multiple elements found. Provide an element key."
 
-    keys = [key for key, element in element_dict.items() if element.attrs.get(valid_attr)]
+    keys = [key for key, element in element_dict.items() if _get_spatialdata_attrs(element).get(valid_attr)]
 
     assert len(keys) > 0, f"No element with the attribute {valid_attr}. Provide an element key."
     assert len(keys) == 1, f"Multiple valid elements found: {keys}. Provide an element key."
 
     return _return_element(element_dict, keys[0], return_key, as_spatial_image)
 
 
+def _get_spatialdata_attrs(element: SpatialElement) -> dict[str, Any]:
+    if isinstance(element, DataTree):
+        element = next(iter(element["scale0"].values()))
+    return element.attrs.get("spatialdata_attrs", {})
+
+
+def _update_spatialdata_attrs(element: SpatialElement, attrs: dict):
+    if isinstance(element, DataTree):
+        for image_scale in iter_scales(element):
+            _update_spatialdata_attrs(image_scale, attrs)
+        return
+
+    old_attrs = element.uns if isinstance(element, AnnData) else element.attrs
+
+    if "spatialdata_attrs" not in old_attrs:
+        old_attrs["spatialdata_attrs"] = {}
+
+    old_attrs["spatialdata_attrs"].update(attrs)
+
+
 def get_spatial_image(
     sdata: SpatialData,
     key: str | None = None,
@@ -205,3 +226,9 @@ def _return_element(
         element = next(iter(element["scale0"].values()))
 
     return (key, element) if return_key else element
+
+
+def get_cache_dir(sdata: SpatialData) -> Path:
+    assert sdata.is_backed(), "SpatialData not saved on-disk. Save the object, or provide a cache directory."
+
+    return sdata.path / SopaFiles.SOPA_CACHE_DIR
diff --git a/sopa/cli/patchify.py b/sopa/cli/patchify.py
@@ -202,7 +202,7 @@ def _patchify_transcripts(
         config or config_path is not None
     ), "Provide '--config-path', the path to a Baysor config file (toml) or comseg file (jsons)"
 
-    df_key = get_spatial_element(sdata.points)
+    df_key, _ = get_spatial_element(sdata.points, return_key=True)
     patches = Patches2D(sdata, df_key, patch_width_microns, patch_overlap_microns)
     valid_indices = patches.patchify_transcripts(
         temp_dir,

diff --git a/sopa/io/reader/visium_hd.py b/sopa/io/reader/visium_hd.py
@@ -7,6 +7,7 @@
 from spatialdata_io.readers.visium_hd import visium_hd as visium_hd_spatialdata_io
 
 from ..._constants import SopaAttrs
+from ..._sdata import _update_spatialdata_attrs
 from ...utils import string_channel_names
 from .utils import _default_image_kwargs
 
@@ -44,14 +45,19 @@ def visium_hd(
 
     string_channel_names(sdata)  # Ensure that channel names are strings
 
+    ### Add Sopa attributes to detect the spatial elements
     for key, image in sdata.images.items():
         if key.endswith("_full_image"):
-            image.attrs[SopaAttrs.CELL_SEGMENTATION] = True
+            _update_spatialdata_attrs(image, {SopaAttrs.CELL_SEGMENTATION: True})
         elif key.endswith("_hires_image"):
-            image.attrs[SopaAttrs.TISSUE_SEGMENTATION] = True
+            _update_spatialdata_attrs(image, {SopaAttrs.TISSUE_SEGMENTATION: True})
 
     for key, geo_df in sdata.shapes.items():
         if key.endswith("_002um"):
-            geo_df.attrs[SopaAttrs.BINS_AGGREGATION] = True
+            _update_spatialdata_attrs(geo_df, {SopaAttrs.BINS_AGGREGATION: True})
+
+    for key, table in sdata.tables.items():
+        if key.endswith("_002um"):
+            _update_spatialdata_attrs(table, {SopaAttrs.BINS_TABLE: True})
 
     return sdata
diff --git a/sopa/io/reader/xenium.py b/sopa/io/reader/xenium.py
@@ -7,6 +7,7 @@
 from spatialdata_io.readers.xenium import xenium as xenium_spatialdata_io
 
 from ..._constants import SopaAttrs
+from ..._sdata import _update_spatialdata_attrs
 from ...utils import string_channel_names
 from .utils import _default_image_kwargs
 
@@ -56,6 +57,6 @@ def xenium(
 
     for key, image in sdata.images.items():
         if key.startswith("morphology"):
-            image.attrs[SopaAttrs.CELL_SEGMENTATION] = True
+            _update_spatialdata_attrs(image, {SopaAttrs.CELL_SEGMENTATION: True})
 
     return sdata
diff --git a/sopa/patches/patches.py b/sopa/patches/patches.py
@@ -20,6 +20,7 @@
 from .._constants import EPS, ROI, SopaFiles, SopaKeys
 from .._sdata import (
     get_boundaries,
+    get_cache_dir,
     get_spatial_element,
     get_spatial_image,
     to_intrinsic,
@@ -435,6 +436,33 @@ def _assign_prior(series: dd.Series, unassigned_value: int | str | None) -> pd.S
     if series.dtype == "int":
         if unassigned_value is None or unassigned_value == 0:
             return series
-        return series.replace(unassigned_value, 0)
+        return series.replace(int(unassigned_value), 0)
 
     raise ValueError(f"Invalid dtype {series.dtype} for prior cell ids. Must be int or string.")
+
+
+def make_image_patches(
+    sdata: SpatialData, patch_width: int = 2000, patch_overlap: int = 50, image_key: str | None = None
+):
+    image_key, _ = get_spatial_image(sdata, key=image_key, return_key=True)
+    patches = Patches2D(sdata, image_key, patch_width=patch_width, patch_overlap=patch_overlap)
+
+    patches.write()
+
+
+def make_transcript_patches(
+    sdata: SpatialData,
+    config: dict = {},
+    patch_width: int = 2000,
+    patch_overlap: int = 50,
+    points_key: str | None = None,
+    cache_dir: str | Path | None = None,
+) -> list[int]:
+    points_key, _ = get_spatial_element(sdata, key=points_key, return_key=True)
+    patches = Patches2D(sdata, points_key, patch_width=patch_width, patch_overlap=patch_overlap)
+
+    cache_dir = Path(cache_dir or get_cache_dir(sdata)) / SopaFiles.TRANSCRIPT_TEMP_DIR
+
+    valid_indices = patches.patchify_transcripts(cache_dir, config=config)
+
+    return valid_indices
diff --git a/sopa/utils/data.py b/sopa/utils/data.py
@@ -37,6 +37,7 @@ def uniform(
     include_image: bool = True,
     apply_blur: bool = True,
     as_output: bool = False,
+    transcript_cell_id_as_merscope: bool = False,
 ) -> SpatialData:
     """Generate a dummy dataset composed of cells generated uniformly in a square. It also has transcripts.
 
@@ -149,7 +150,7 @@ def uniform(
     sdata = SpatialData(images=images, points=points, shapes=shapes)
 
     _map_transcript_to_cell(sdata, "cell_id", sdata["transcripts"], sdata["cells"])
-    sdata["transcripts"]["cell_id"] = sdata["transcripts"]["cell_id"].astype(int)
+    sdata["transcripts"]["cell_id"] = sdata["transcripts"]["cell_id"].astype(int) - int(transcript_cell_id_as_merscope)
 
     if as_output:
         _add_table(sdata)

diff --git a/workflow/config/toy/uniform_baysor_vizgen.yaml b/workflow/config/toy/uniform_baysor_vizgen.yaml
@@ -0,0 +1,62 @@
+# For parameters details, see this commented example: https://github.com/gustaveroussy/sopa/blob/master/workflow/config/example_commented.yaml
+read:
+  technology: uniform
+  kwargs:
+    transcript_cell_id_as_merscope: true
+
+patchify:
+  patch_width_pixel: 1200
+  patch_overlap_pixel: 50
+  patch_width_microns: 400
+  patch_overlap_microns: 20
+
+segmentation:
+  baysor:
+    min_area: 10
+    cell_key: "cell_id"
+    unassigned_value: -1
+
+    config:
+      data:
+        force_2d: true
+        min_molecules_per_cell: 10
+        x: "x"
+        y: "y"
+        z: "z"
+        gene: "genes"
+        min_molecules_per_gene: 0
+        min_molecules_per_segment: 3
+        confidence_nn_id: 6
+
+      segmentation:
+        scale: 3 # typical cell radius
+        scale_std: "25%" # cell radius standard deviation
+        prior_segmentation_confidence: 0
+        estimate_scale_from_centers: false
+        n_clusters: 4
+        iters: 500
+        n_cells_init: 0
+        nuclei_genes: ""
+        cyto_genes: ""
+        new_component_weight: 0.2
+        new_component_fraction: 0.3
+
+aggregate:
+  average_intensities: true
+  min_transcripts: 5 # [optional] cells whose transcript count is below that this threshold are filtered
+
+annotation:
+  method: fluorescence
+  args:
+    marker_cell_dict:
+      CK: Tumoral cell
+      CD3: T cell
+      CD20: B cell
+
+explorer:
+  gene_column: "genes"
+  ram_threshold_gb: 4
+  pixel_size: 0.1
+
+executables:
+  baysor: ~/.julia/bin/baysor