diff --git a/docs/cli.md b/docs/cli.md index 674f1575..30420874 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -347,12 +347,13 @@ $ sopa patchify [OPTIONS] COMMAND [ARGS]... **Commands**: -* `baysor`: Prepare the patches for Baysor segmentation +* `baysor`: Prepare patches for transcript-based... +* `comseg`: Prepare patches for transcript-based... * `image`: Prepare patches for staining-based... #### `sopa patchify baysor` -Prepare the patches for Baysor segmentation +Prepare patches for transcript-based segmentation with Baysor **Usage**: @@ -370,12 +371,37 @@ $ sopa patchify baysor [OPTIONS] SDATA_PATH * `--patch-overlap-microns FLOAT`: Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell [required] * `--baysor-temp-dir TEXT`: Temporary directory where baysor inputs and outputs will be saved. By default, uses `.sopa_cache/baysor_boundaries` * `--config-path TEXT`: Path to the baysor config (you can also directly provide the argument via the `config` option) -* `--config TEXT`: Dictionnary of baysor parameters [default: {}] -* `--cell-key TEXT`: Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation +* `--config TEXT`: Dictionnary of baysor parameters, overwrite the config_path argument if provided [default: {}] +* `--cell-key TEXT`: Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation Default is 'cell' if cell_key=None * `--unassigned-value INTEGER`: If --cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument) * `--use-prior / --no-use-prior`: Whether to use cellpose segmentation as a prior for baysor (if True, make sure to first run Cellpose) [default: no-use-prior] * `--help`: Show this message and exit. +#### `sopa patchify comseg` + +Prepare patches for transcript-based segmentation with ComSeg + +**Usage**: + +```console +$ sopa patchify comseg [OPTIONS] SDATA_PATH +``` + +**Arguments**: + +* `SDATA_PATH`: Path to the SpatialData `.zarr` directory [required] + +**Options**: + +* `--patch-width-microns FLOAT`: Width (and height) of each patch in microns [required] +* `--patch-overlap-microns FLOAT`: Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell [required] +* `--comseg-temp-dir TEXT`: Temporary directory where baysor inputs and outputs will be saved. By default, uses `.sopa_cache/comseg_boundaries` +* `--config-path TEXT`: Path to the ComSeg json config file (you can also directly provide the argument via the `config` option) +* `--config TEXT`: Dictionnary of ComSeg parameters, overwrite the config_path argument if provided [default: {}] +* `--cell-key TEXT`: Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation. Default is cell if cell_key=None +* `--unassigned-value INTEGER`: If --cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument) +* `--help`: Show this message and exit. + #### `sopa patchify image` Prepare patches for staining-based segmentation (including Cellpose) @@ -457,6 +483,7 @@ $ sopa resolve [OPTIONS] COMMAND [ARGS]... * `baysor`: Resolve patches conflicts after baysor... * `cellpose`: Resolve patches conflicts after cellpose... +* `comseg`: Resolve patches conflicts after comseg... * `generic`: Resolve patches conflicts after generic... #### `sopa resolve baysor` @@ -500,6 +527,28 @@ $ sopa resolve cellpose [OPTIONS] SDATA_PATH * `--patch-dir TEXT`: Directory containing the cellpose segmentation on patches (or multiple directories if using multi-step segmentation). By default, uses the `.sopa_cache/cellpose_boundaries` directory * `--help`: Show this message and exit. +#### `sopa resolve comseg` + +Resolve patches conflicts after comseg segmentation. Provide either `--comseg-temp-dir` or `--patches-dirs` + +**Usage**: + +```console +$ sopa resolve comseg [OPTIONS] SDATA_PATH +``` + +**Arguments**: + +* `SDATA_PATH`: Path to the SpatialData `.zarr` directory [required] + +**Options**: + +* `--gene-column TEXT`: Column of the transcripts dataframe containing the genes names [required] +* `--comseg-temp-dir TEXT`: Path to the directory containing all the comseg patches (see `sopa patchify`). By default, uses the `.sopa_cache/comseg_boundaries` directory +* `--min-area FLOAT`: Cells with an area less than this value (in microns^2) will be filtered [default: 0] +* `--patches-dirs TEXT`: List of patches directories inside `comseg_temp_dir` +* `--help`: Show this message and exit. + #### `sopa resolve generic` Resolve patches conflicts after generic segmentation @@ -537,6 +586,7 @@ $ sopa segmentation [OPTIONS] COMMAND [ARGS]... **Commands**: * `cellpose`: Perform cellpose segmentation. +* `comseg`: Perform ComSeg segmentation. * `generic-staining`: Perform generic staining-based segmentation. #### `sopa segmentation cellpose` @@ -575,6 +625,26 @@ $ sopa segmentation cellpose [OPTIONS] SDATA_PATH * `--method-kwargs TEXT`: Kwargs for the cellpose method builder. This should be a dictionnary, in inline string format. [default: {}] * `--help`: Show this message and exit. +#### `sopa segmentation comseg` + +Perform ComSeg segmentation. This can be done on all patches directly, or on one individual patch. + +**Usage**: + +```console +$ sopa segmentation comseg [OPTIONS] SDATA_PATH +``` + +**Arguments**: + +* `SDATA_PATH`: Path to the SpatialData `.zarr` directory [required] + +**Options**: + +* `--patch-index INTEGER`: Index of the patch on which the segmentation method should be run.` +* `--patch-dir TEXT`: Path to the temporary the segmentation method directory inside which we will store each individual patch segmentation. By default, saves into the `.sopa_cache/comseg` directory +* `--help`: Show this message and exit. + #### `sopa segmentation generic-staining` Perform generic staining-based segmentation. This can be done on all patches directly, or on one individual patch. diff --git a/docs/tutorials/cli_other_segmentation.md b/docs/tutorials/cli_other_segmentation.md new file mode 100644 index 00000000..3cb62624 --- /dev/null +++ b/docs/tutorials/cli_other_segmentation.md @@ -0,0 +1,76 @@ + +### Option 3: ComSeg + + +[ComSeg](https://github.com/fish-quant/ComSeg) is a transcript-based segmentation method. It uses a segmentation prior (here, Cellpose) and improves it using the transcripts information. + +#### Run Cellpose to segment nuclei + +``` +sopa patchify image tuto.zarr --patch-width-pixel 1500 --patch-overlap-pixel 50 +sopa segmentation cellpose tuto.zarr --channels DAPI --diameter 35 --min-area 2000 +sopa resolve cellpose tuto.zarr +``` + +#### Save a ComSeg config file as config.jsons +More information on the parameters can be found in the [ComSeg documentation](https://comseg.readthedocs.io/en/latest/userguide/Minimal_example.html). +Below we display a minimal example of a ComSeg config file. + + +```json +{"dict_scale": {"x": 1, "y": 1, "z": 1}, +"mean_cell_diameter": 15, +"max_cell_radius": 50, +"alpha": 0.5, +"min_rna_per_cell": 5, +"gene_column": "genes"} +``` + +#### Run ComSeg with the sopa command line tool + +1) create the ComSeg patches +On the toy dataset, we will generate 4 patches. +``` +sopa patchify comseg tuto.zarr --config-path config.json --patch-width-microns 200 --patch-overlap-microns 50 +``` + +2) run ComSeg on all patches + +!!! tip + Manually running the commands below can involve using many consecutive commands, so we recommend automatizing it. For instance, this can be done using Snakemake or Nextflow. This will help you parallelize it since you can run each task on separate jobs or using multithreading. You can also see how we do it in the [Sopa Snakemake pipeline](https://github.com/gustaveroussy/sopa/blob/master/workflow/Snakefile). + + To automatically get the number of patches, you can open the `tuto.zarr/.sopa_cache/patches_file_comseg` file. This lists the names of the directories inside `tuto.zarr/.sopa_cache/comseg` related to each patch. If you selected an ROI, the excluded patches are effectively not in the `patches_file_comseg` file. + +=== "Patch 0" + ```sh + cd tuto.zarr/.sopa_cache/comseg_boundaries/0 + + # 'comseg' is the official comseg executable. If unavailable, replace it with your path to the executable + comseg run --save-polygons GeoJSON -c config.toml transcripts.csv + ``` +=== "Patch 1" + ```sh + cd tuto.zarr/.sopa_cache/comseg_boundaries/1 + + # 'comseg' is the official comseg executable. If unavailable, replace it with your path to the executable + comseg run --save-polygons GeoJSON -c config.toml transcripts.csv + ``` +=== "Patch 2" + ```sh + cd tuto.zarr/.sopa_cache/comseg_boundaries/2 + + # 'comseg' is the official comseg executable. If unavailable, replace it with your path to the executable + comseg run --save-polygons GeoJSON -c config.toml transcripts.csv + ``` +=== "Patch 3" + ```sh + cd tuto.zarr/.sopa_cache/comseg_boundaries/3 + + # 'comseg' is the official comseg executable. If unavailable, replace it with your path to the executable + comseg run --save-polygons GeoJSON -c config.toml transcripts.csv + ``` + +3) Merge the results +```sh +sopa resolve comseg tuto.zarr --gene-column genes +``` diff --git a/docs/tutorials/comseg.ipynb b/docs/tutorials/comseg.ipynb new file mode 100644 index 00000000..c6da6514 --- /dev/null +++ b/docs/tutorials/comseg.ipynb @@ -0,0 +1,768 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ComSeg segmentation\n", + "\n", + "[ComSeg](https://github.com/fish-quant/ComSeg) is a transcript-based segmentation method that creates a KNN graph of RNA molecules, weighted by the co-expression of RNA species. Initially, this KNN graph is partitioned into communities of RNAs likely to belong to the same cell. ComSeg then merges these RNA communities to compute the final cell segmentation. The method uses nucleus segmentation as prior to initialize the partitioning of the KNN graph of RNA molecules. \n", + "ComSeg only segments cells with their nuclei segmented and does not take into account cells without missing nucleus. \n", + "\n", + "If nucleus segmentation is not available, ComSeg can operate using other staining segmentation or solely nucleus centroids obtained from other sources. For more details, please refer to the [documentation](https://comseg.readthedocs.io/en/latest/).\n", + "\n", + "## Requirements\n", + "\n", + "To use ComSeg, run pip install comseg in your environment. ComSeg optionally uses sctransform normalization. If you want to use sctransform, execute the following commands in your R console:\n", + "```\n", + "install.packages(\"sctransform\")\n", + "install.packages(\"feather\")\n", + "install.packages(\"arrow\")\n", + "```\n", + "Then, choose one the the three use cases (i.e., Snakemake or CLI or API) below.\n", + "\n", + "## Snakemake usage\n", + "You can run ComSeg with snakemake in a similar way than other methods. Example of configuration can be found in the config folder.\n", + "\n", + "```snakemake --config data_path=/path/to/directory --configfile=config/xenium/comseg.yaml --cores 1 --use-conda```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CLI usage\n", + "\n", + "\n", + "#### 1) Save a ComSeg config file as config.jsons\n", + "\n", + "Below we display a minimal example of a ComSeg config.jsons\n", + "\n", + "```json\n", + "{\"dict_scale\": {\"x\": 1, \"y\": 1, \"z\": 1},\n", + "\"mean_cell_diameter\": 15,\n", + "\"max_cell_radius\": 20,\n", + "\"alpha\": 0.5,\n", + "\"min_rna_per_cell\": 5,\n", + "\"norm_vector\": false,\n", + "\"gene_column\": \"genes\"}\n", + "```\n", + "\n", + "\n", + "If you did not install the needed external R packages, set ```\"norm_vector\": false```. More information on the parameters can be found in the [ComSeg documentation](https://comseg.readthedocs.io/en/latest/userguide/Minimal_example.html).\n", + "#### 2) create the ComSeg patches\n", + "On the toy dataset, we will generate 4 patches.\n", + "```\n", + "sopa patchify comseg tuto.zarr --config-path config.json --patch-width-microns 200 --patch-overlap-microns 50 --min-transcripts-per-patch 100 --min-cells-per-patch 1 --shapes-key cellpose_boundaries\n", + "```\n", + "The shapes-key argument is the name of the nuclei boundaries shape in the sdata object that will be used for the prior and centroid. In this example, it is set to cellpose_boundaries, which assumes that the Cellpose segmentation has already been run.\n", + "\n", + "#### 3) run ComSeg \n", + "\n", + "- on a specific patch \n", + "```sopa segmentation comseg tuto.zarr --patch-index ```\n", + "- Our sequentially on all the patches\n", + "```sopa segmentation comseg tuto.zarr```\n", + "\n", + "!!! tip\n", + " Manually running the commands above can involve using many consecutive commands, so we recommend automatizing it. For instance, this can be done using Snakemake or Nextflow. This will help you parallelize it since you can run each task on separate jobs or using multithreading. You can also see how we do it in the [Sopa Snakemake pipeline](https://github.com/gustaveroussy/sopa/blob/master/workflow/Snakefile).\n", + "\n", + "#### 3) Merge the results \n", + "\n", + "```sopa resolve comseg tuto.zarr --gene-column genes```\n", + "\n", + "#### 4) Merge the results \n", + "the aggregation is then the same than for other methods\n", + "\n", + "```sopa aggregate tuto.zarr --gene-column genes --min-transcripts 10```\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API usage\n", + "\n", + "In this tutorial, we first compute the nucleus segmentation prior using Cellpose on the DAPI staining\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import sopa\n", + "import sopa.io\n", + "import sopa.segmentation\n", + "from sopa.segmentation.transcripts import resolve\n", + "from sopa.segmentation.methods import comseg_patch\n", + "import importlib\n", + "import sopa.segmentation.methods as method " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Running Cellpose as a prior\n", + "\n", + "First, we generate the bounding boxes of the patches on which Cellpose will be run. Here, the patches have a width and height of 1500 pixels and an overlap of 50 pixels. We advise bigger sizes for real datasets (see our default parameters in one of our config files). On the toy dataset, this will generate 4 patches." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36;20m[INFO] (sopa.utils.data)\u001b[0m Image of size ((4, 2048, 2048)) with 400 cells and 100 transcripts per cell\n", + "\u001b[36;20m[INFO] (sopa.patches.patches)\u001b[0m 4 patches were saved in sdata['sopa_patches']\n", + "\u001b[33;20m[WARNING] (sopa.segmentation.stainings)\u001b[0m Running segmentation in a sequential manner. This is not recommended on large images because it can be extremely slow (see https://github.com/gustaveroussy/sopa/discussions/36 for more details)\n", + "Run all patches: 0%| | 0/4 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sdata\\\n", + " .pl.render_points(size=0.01, color=\"r\")\\\n", + " .pl.render_images()\\\n", + " .pl.render_shapes(shapes_key, outline=True, fill_alpha=0, outline_color=\"w\")\\\n", + " .pl.show(\"global\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also use the Xenium Explorer:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing table with 5 columns\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing 2 cell categories: region, slide\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.shapes)\u001b[0m Writing 367 cell polygons\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m Writing 40000 transcripts\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m > Level 0: 40000 transcripts\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m > Level 1: 10000 transcripts\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m Writing multiscale image with procedure=semi-lazy (load in memory when possible)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m (Loading image of shape (4, 2048, 2048)) in memory\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 2048, 2048)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 1024, 1024)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 512, 512)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 256, 256)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 128, 128)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 64, 64)\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.converter)\u001b[0m Saved files in the following directory: tuto.explorer\n", + "\u001b[36;20m[INFO] (sopa.io.explorer.converter)\u001b[0m You can open the experiment with 'open tuto.explorer/experiment.xenium'\n" + ] + } + ], + "source": [ + "sopa.io.write(\"tuto.explorer\", sdata, image_key, points_key=\"transcripts\", gene_column=gene_column, shapes_key=\"comseg_boundaries\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sopa", + "language": "python", + "name": "sopa" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/tutorials/other_segmentations.ipynb b/docs/tutorials/other_segmentations.ipynb deleted file mode 100644 index 3dfbcf91..00000000 --- a/docs/tutorials/other_segmentations.ipynb +++ /dev/null @@ -1,706 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import sopa\n", - "import sopa.io\n", - "import sopa.segmentation\n", - "from sopa.segmentation.transcripts import resolve\n", - "from sopa.segmentation.methods import comseg_patch\n", - "import importlib\n", - "import sopa.segmentation.methods as method \n", - "importlib.reload(method)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ComSeg segmentation\n", - "\n", - "[ComSeg](https://github.com/fish-quant/ComSeg) is a transcript-based segmentation method that creates a KNN graph of RNA molecules, weighted by the co-expression of RNA species. Initially, this KNN graph is partitioned into communities of RNAs likely to belong to the same cell. ComSeg then merges these RNA communities to compute the final cell segmentation. The method uses nucleus segmentation as prior to initialize the partitioning of the KNN graph of RNA molecules. \n", - "ComSeg only segments cells with their nuclei segmented and does not take into account cells withou missing nucleus. \n", - "\n", - "If nucleus segmentation is not available, ComSeg can operate using other staining segmentation or solely nucleus centroids obtained from other sources. For more details, please refer to the [documentation](https://comseg.readthedocs.io/en/latest/)\n", - "\n", - "In this tutorial, we first compute the nucleus segmentation prior using Cellpose on the DAPI staining\n", - "\n", - "## 1. Running Cellpose as a prior\n", - "\n", - "First, we generate the bounding boxes of the patches on which Cellpose will be run. Here, the patches have a width and height of 1500 pixels and an overlap of 50 pixels. We advise bigger sizes for real datasets (see our default parameters in one of our config files). On the toy dataset, this will generate 4 patches.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36;20m[INFO] (sopa.utils.data)\u001b[0m Image of size ((4, 2048, 2048)) with 400 cells and 100 transcripts per cell\n", - "\u001b[36;20m[INFO] (sopa.patches.patches)\u001b[0m 4 patches were saved in sdata['sopa_patches']\n", - "\u001b[33;20m[WARNING] (sopa.segmentation.stainings)\u001b[0m Running segmentation in a sequential manner. This is not recommended on large images because it can be extremely slow (see https://github.com/gustaveroussy/sopa/discussions/36 for more details)\n", - "Run all patches: 0%| | 0/4 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sdata\\\n", - " .pl.render_points(size=0.01, color=\"r\")\\\n", - " .pl.render_images()\\\n", - " .pl.render_shapes(shapes_key, outline=True, fill_alpha=0, outline_color=\"w\")\\\n", - " .pl.show(\"global\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can also use the Xenium Explorer:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing table with 5 columns\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.table)\u001b[0m Writing 2 cell categories: region, slide\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.shapes)\u001b[0m Writing 367 cell polygons\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m Writing 40000 transcripts\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m > Level 0: 40000 transcripts\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.points)\u001b[0m > Level 1: 10000 transcripts\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m Writing multiscale image with procedure=semi-lazy (load in memory when possible)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m (Loading image of shape (4, 2048, 2048)) in memory\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 2048, 2048)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 1024, 1024)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 512, 512)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 256, 256)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 128, 128)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.images)\u001b[0m > Image of shape (4, 64, 64)\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.converter)\u001b[0m Saved files in the following directory: tuto.explorer\n", - "\u001b[36;20m[INFO] (sopa.io.explorer.converter)\u001b[0m You can open the experiment with 'open tuto.explorer/experiment.xenium'\n" - ] - } - ], - "source": [ - "sopa.io.write(\"tuto.explorer\", sdata, image_key, points_key=\"transcripts\", gene_column=gene_column, shapes_key=\"comseg_boundaries\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "sopa", - "language": "python", - "name": "sopa" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/mkdocs.yml b/mkdocs.yml index ad13cce5..797d2de2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -20,6 +20,8 @@ nav: - Spatial statistics: tutorials/spatial.ipynb - H&E usage: tutorials/he.ipynb - Advanced segmentation: tutorials/advanced_segmentation.md + - Other segmentation methods: + - ComSeg: tutorials/comseg.ipynb - CLI: cli.md - API: - sopa.spatial: api/spatial.md diff --git a/sopa/_constants.py b/sopa/_constants.py index f82816a9..f6fb3b57 100644 --- a/sopa/_constants.py +++ b/sopa/_constants.py @@ -51,6 +51,7 @@ class SopaFiles: SOPA_CACHE_DIR = ".sopa_cache" PATCHES_FILE_IMAGE = "patches_file_image" PATCHES_DIRS_BAYSOR = "patches_file_baysor" + PATCHES_DIRS_COMSEG = "patches_file_comseg" TRANSCRIPTS_FILE = "transcripts.csv" CENTROIDS_FILE = "centroids.csv" JSON_CONFIG_FILE = "config.json" diff --git a/sopa/cli/patchify.py b/sopa/cli/patchify.py index 0cedded2..5995862a 100644 --- a/sopa/cli/patchify.py +++ b/sopa/cli/patchify.py @@ -4,6 +4,7 @@ import typer +from .._constants import SopaKeys from .utils import SDATA_HELPER app_patchify = typer.Typer() @@ -55,11 +56,12 @@ def baysor( config: str = typer.Option( default={}, callback=ast.literal_eval, - help="Dictionnary of baysor parameters", + help="Dictionnary of baysor parameters, overwrite the config_path argument if provided", ), cell_key: str = typer.Option( None, - help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation", + help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation" + f" Default is '{SopaKeys.DEFAULT_CELL_KEY}' if cell_key=None", ), unassigned_value: int = typer.Option( None, @@ -70,31 +72,163 @@ def baysor( help="Whether to use cellpose segmentation as a prior for baysor (if True, make sure to first run Cellpose)", ), ): - """Prepare the patches for Baysor segmentation""" + """Prepare patches for transcript-based segmentation with Baysor""" from sopa._constants import SopaFiles, SopaKeys + + from .utils import _default_boundary_dir + + if baysor_temp_dir is None: + baysor_temp_dir = _default_boundary_dir(sdata_path, SopaKeys.BAYSOR_BOUNDARIES) + return _patchify_transcripts( + sdata_path=sdata_path, + patch_width_microns=patch_width_microns, + patch_overlap_microns=patch_overlap_microns, + temp_dir=baysor_temp_dir, + filename=SopaFiles.PATCHES_DIRS_BAYSOR, + config_name=SopaFiles.TOML_CONFIG_FILE, + config_path=config_path, + config=config, + cell_key=cell_key, + unassigned_value=unassigned_value, + use_prior=use_prior, + ) + + +@app_patchify.command() +def comseg( + sdata_path: str = typer.Argument(help=SDATA_HELPER), + patch_width_microns: float = typer.Option(help="Width (and height) of each patch in microns"), + patch_overlap_microns: float = typer.Option( + help="Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell" + ), + comseg_temp_dir: str = typer.Option( + None, + help="Temporary directory where baysor inputs and outputs will be saved. By default, uses `.sopa_cache/comseg_boundaries`", + ), + config_path: str = typer.Option( + None, + help="Path to the ComSeg json config file (you can also directly provide the argument via the `config` option)", + ), + config: str = typer.Option( + default={}, + callback=ast.literal_eval, + help="Dictionnary of ComSeg parameters, overwrite the config_path argument if provided", + ), + cell_key: str = typer.Option( + None, + help="Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation." + f" Default is {SopaKeys.DEFAULT_CELL_KEY} if cell_key=None", + ), + unassigned_value: int = typer.Option( + None, + help="If --cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument)", + ), + min_transcripts_per_patch: int = typer.Option( + 1000, help="Minimum number of transcripts per patch" + ), + min_cells_per_patch: int = typer.Option(1, help="Minimum number of cells per patch"), + shapes_key: str = typer.Option( + SopaKeys.CELLPOSE_BOUNDARIES, + help="Name of the nuclei boundaries shape use for the prior and centroid in the sdata object", + ), +): + """Prepare patches for transcript-based segmentation with ComSeg""" + + from sopa._constants import SopaFiles, SopaKeys + + from .utils import _default_boundary_dir + + if comseg_temp_dir is None: + comseg_temp_dir = _default_boundary_dir(sdata_path, SopaKeys.COMSEG_BOUNDARIES) + + return _patchify_transcripts( + sdata_path=sdata_path, + patch_width_microns=patch_width_microns, + patch_overlap_microns=patch_overlap_microns, + temp_dir=comseg_temp_dir, + filename=SopaFiles.PATCHES_DIRS_COMSEG, + config_name=SopaFiles.JSON_CONFIG_FILE, + config_path=config_path, + config=config, + cell_key=cell_key, + unassigned_value=unassigned_value, + use_prior=True, + min_transcripts_per_patch=min_transcripts_per_patch, + min_cells_per_patch=min_cells_per_patch, + shapes_key=shapes_key, + ) + + +def _patchify_transcripts( + sdata_path: str, + patch_width_microns: float, + patch_overlap_microns: float, + temp_dir: str, + filename: str, + config_name: str, + config_path: str, + config: str, + cell_key: str, + use_prior: bool, + unassigned_value: int, + min_transcripts_per_patch: int, + min_cells_per_patch: int, + shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES, +): + """Prepare patches for transcript-based segmentation for the different available methods (Baysor, ComSeg). + + Args: + sdata_path: Path to the SpatialData object + patch_width_microns: Width (and height) of each patch in microns + patch_overlap_microns: Number of overlapping microns between the patches. We advise to choose approximately twice the diameter of a cell + temp_dir: Temporary directory where baysor inputs and outputs will be saved. By default, uses `.sopa_cache/baysor_boundaries` + filename: Name of the file to indicating the patch's index + config_name: Name of the config file created for each patch + config_path: Path to the config file (you can also directly provide the argument via the `config` option) + config: Dictionnary of parameters + cell_key: Optional column of the transcripts dataframe that indicates in which cell-id each transcript is, in order to use prior segmentation. + use_prior: Whether to use cellpose segmentation as a prior for baysor and comseg (if True, make sure to first run Cellpose) + unassigned_value: If cell-key is provided, this is the value given to transcripts that are not inside any cell (if it's already 0, don't provide this argument) + + """ + from sopa._constants import SopaFiles from sopa._sdata import get_key from sopa.io.standardize import read_zarr_standardized, sanity_check from sopa.patches import Patches2D - from .utils import _default_boundary_dir - sdata = read_zarr_standardized(sdata_path) sanity_check(sdata) assert ( config or config_path is not None - ), "Provide '--config-path', the path to a Baysor config file (toml)" - - if baysor_temp_dir is None: - baysor_temp_dir = _default_boundary_dir(sdata_path, SopaKeys.BAYSOR_BOUNDARIES) + ), "Provide '--config-path', the path to a Baysor config file (toml) or comseg file (jsons)" df_key = get_key(sdata, "points") patches = Patches2D(sdata, df_key, patch_width_microns, patch_overlap_microns) valid_indices = patches.patchify_transcripts( - baysor_temp_dir, cell_key, unassigned_value, use_prior, config, config_path + temp_dir, + cell_key, + unassigned_value, + use_prior, + config, + config_path, + config_name=config_name, + min_transcripts_per_patch=min_transcripts_per_patch, + shapes_key=shapes_key, ) - _save_cache(sdata_path, SopaFiles.PATCHES_DIRS_BAYSOR, "\n".join(map(str, valid_indices))) + if filename == SopaFiles.PATCHES_DIRS_COMSEG: + assert ( + use_prior + ), "For ComSeg, you must use the prior segmentation of nuclei or from other staining" + valid_indices_centoid = patches.patchify_centroids( + temp_dir, + shapes_key=shapes_key, + min_cells_per_patch=min_cells_per_patch, + ) + valid_indices = list(set(valid_indices_centoid).intersection(set(valid_indices))) + + _save_cache(sdata_path, filename, "\n".join(map(str, valid_indices))) def _save_cache(sdata_path: str, filename: str, content: str): diff --git a/sopa/cli/resolve.py b/sopa/cli/resolve.py index 25bd6218..e3fbedf9 100644 --- a/sopa/cli/resolve.py +++ b/sopa/cli/resolve.py @@ -96,3 +96,42 @@ def baysor( sdata = read_zarr_standardized(sdata_path) resolve(sdata, baysor_temp_dir, gene_column, patches_dirs, min_area) + + +@app_resolve.command() +def comseg( + sdata_path: str = typer.Argument(help=SDATA_HELPER), + gene_column: str = typer.Option( + help="Column of the transcripts dataframe containing the genes names" + ), + comseg_temp_dir: str = typer.Option( + None, + help="Path to the directory containing all the comseg patches (see `sopa patchify`). By default, uses the `.sopa_cache/comseg_boundaries` directory", + ), + min_area: float = typer.Option( + 0, help="Cells with an area less than this value (in microns^2) will be filtered" + ), + patches_dirs: list[str] = typer.Option( + [], help="List of patches directories inside `comseg_temp_dir`" + ), +): + """Resolve patches conflicts after comseg segmentation. Provide either `--comseg-temp-dir` or `--patches-dirs`""" + from sopa._constants import SopaKeys + from sopa.io.standardize import read_zarr_standardized + from sopa.segmentation.transcripts import resolve + + from .utils import _default_boundary_dir + + if not len(patches_dirs) and comseg_temp_dir is None: + comseg_temp_dir = _default_boundary_dir(sdata_path, SopaKeys.COMSEG_BOUNDARIES) + + sdata = read_zarr_standardized(sdata_path) + + resolve( + sdata, + comseg_temp_dir, + gene_column, + patches_dirs, + min_area, + shapes_key=SopaKeys.COMSEG_BOUNDARIES, + ) diff --git a/sopa/cli/segmentation.py b/sopa/cli/segmentation.py index 9f4e5934..89ae5232 100644 --- a/sopa/cli/segmentation.py +++ b/sopa/cli/segmentation.py @@ -176,3 +176,52 @@ def _run_staining_segmentation( segmentation.write_patches_cells(patch_dir) else: segmentation.write_patch_cells(patch_dir, patch_index) + + +@app_segmentation.command() +def comseg( + sdata_path: str = typer.Argument(help=SDATA_HELPER), + patch_index: int = typer.Option( + default=None, + help="Index of the patch on which the segmentation method should be run.`", + ), + patch_dir: str = typer.Option( + default=None, + help="Path to the temporary the segmentation method directory inside which we will store each individual patch segmentation. By default, saves into the `.sopa_cache/comseg` directory", + ), +): + """Perform ComSeg segmentation. This can be done on all patches directly, or on one individual patch.""" + import json + import logging + from pathlib import Path + + from tqdm import tqdm + + from sopa._constants import SopaFiles, SopaKeys + from sopa.segmentation.methods import comseg_patch + + from .utils import _default_boundary_dir + + log = logging.getLogger(__name__) + + config_name = SopaFiles.JSON_CONFIG_FILE + + if patch_dir is None: + patch_dir = _default_boundary_dir(sdata_path, SopaKeys.COMSEG_BOUNDARIES) + + if patch_index is None: + log.warn( + "Running segmentation in a sequential manner. This is not recommended on large images because it can be extremely slow (see https://github.com/gustaveroussy/sopa/discussions/36 for more details)" + ) + for path_index_folder in tqdm(list(Path(patch_dir).glob("*")), desc="Run all patches"): + patch_index = int(path_index_folder.stem) + config_path = Path(patch_dir) / f"{patch_index}/{config_name}" + with open(config_path, "r") as f: + config = json.load(f) + + comseg_patch(temp_dir=patch_dir, patch_index=int(path_index_folder.stem), config=config) + else: + config_path = Path(patch_dir) / f"{patch_index}/{config_name}" + with open(config_path, "r") as f: + config = json.load(f) + comseg_patch(temp_dir=patch_dir, patch_index=patch_index, config=config) diff --git a/sopa/patches/patches.py b/sopa/patches/patches.py index 101f8277..761c6727 100644 --- a/sopa/patches/patches.py +++ b/sopa/patches/patches.py @@ -228,6 +228,7 @@ def patchify_transcripts( config_name: str = SopaFiles.TOML_CONFIG_FILE, csv_name: str = SopaFiles.TRANSCRIPTS_FILE, min_transcripts_per_patch: int = 4000, + shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES, ) -> list[int]: """Creation of patches for the transcripts. @@ -253,7 +254,7 @@ def patchify_transcripts( """ return TranscriptPatches( self, self.element, config_name, csv_name, min_transcripts_per_patch - ).write(temp_dir, cell_key, unassigned_value, use_prior, config, config_path) + ).write(temp_dir, cell_key, unassigned_value, use_prior, config, config_path, shapes_key) def patchify_centroids( self, @@ -271,7 +272,9 @@ def patchify_centroids( centroids["y"] = centroids.geometry.y centroids["z"] = 0 - TranscriptPatches(self, centroids, None, csv_name, min_cells_per_patch).write(temp_dir) + return TranscriptPatches(self, centroids, None, csv_name, min_cells_per_patch).write( + temp_dir, shapes_key=shapes_key + ) class TranscriptPatches: @@ -299,6 +302,7 @@ def write( use_prior: bool = False, config: dict = {}, config_path: str | None = None, + shapes_key: str = SopaKeys.CELLPOSE_BOUNDARIES, ): from sopa.segmentation.transcripts import copy_segmentation_config @@ -312,10 +316,8 @@ def write( self.df[cell_key] = _assign_prior(self.df[cell_key], unassigned_value) if use_prior: - prior_boundaries = self.sdata[SopaKeys.CELLPOSE_BOUNDARIES] - _map_transcript_to_cell( - self.sdata, SopaKeys.DEFAULT_CELL_KEY, self.df, prior_boundaries - ) + prior_boundaries = self.sdata[shapes_key] + _map_transcript_to_cell(self.sdata, cell_key, self.df, prior_boundaries) self._setup_patches_directory() @@ -376,6 +378,8 @@ def _write_points(self, patches_gdf: gpd.GeoDataFrame, points_gdf: gpd.GeoDataFr def _check_min_lines(path: str, n: int) -> bool: + if not Path(path).exists(): # empty file are not written at all + return False with open(path, "r") as f: for count, _ in enumerate(f): if count + 1 >= n: diff --git a/sopa/segmentation/methods.py b/sopa/segmentation/methods.py index ceb5b4e4..8587e3fd 100644 --- a/sopa/segmentation/methods.py +++ b/sopa/segmentation/methods.py @@ -88,7 +88,7 @@ def comseg_patch(temp_dir: str, patch_index: int, config: dict): from comseg import dictionary except ModuleNotFoundError: raise ModuleNotFoundError( - "Install the ComSeg package (`pip install comseg`) for this method to work" + "Install the comseg (`pip install comseg`) for this method to work" ) path_dataset_folder = Path(temp_dir) / str(patch_index) @@ -101,6 +101,7 @@ def comseg_patch(temp_dir: str, patch_index: int, config: dict): image_csv_files=["transcripts.csv"], centroid_csv_files=["centroids.csv"], path_cell_centroid=path_dataset_folder, + min_nb_rna_patch=config.get("min_nb_rna_patch", 0), ) dataset.compute_edge_weight(config=config) @@ -108,7 +109,7 @@ def comseg_patch(temp_dir: str, patch_index: int, config: dict): Comsegdict = dictionary.ComSegDict( dataset=dataset, mean_cell_diameter=config["mean_cell_diameter"], - prior_name=SopaKeys.DEFAULT_CELL_KEY, + prior_name=config.get("prior_name", SopaKeys.DEFAULT_CELL_KEY), ) Comsegdict.run_all(config=config) diff --git a/workflow/Snakefile b/workflow/Snakefile index a81652d5..1c7d0f3a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -68,6 +68,28 @@ checkpoint patchify_baysor: sopa patchify baysor {paths.sdata_path} {params.args_patchify} {params.args_baysor} {params.arg_prior} """ +checkpoint patchify_comseg: + input: + paths.sdata_zgroup, + output: + patches_file = paths.smk_patches_file_comseg, + smk_comseg_temp_dir = directory(paths.smk_comseg_temp_dir), + params: + args_patchify = str(args["patchify"].where([ + "patch_width_microns", + "patch_overlap_microns", + 'min_transcripts_per_patch', + 'min_cells_per_patch', + 'shapes_key',], + )), + args_comseg = args.dump_comseg_patchify() if args.comseg else "", + conda: + "sopa" + shell: + """ + sopa patchify comseg {paths.sdata_path} {params.args_patchify} {params.args_comseg} + """ + rule patch_segmentation_cellpose: input: paths.smk_patches_file_image, @@ -104,6 +126,22 @@ rule patch_segmentation_baysor: {config[executables][baysor]} run --save-polygons GeoJSON -c config.toml transcripts.csv {params.args_baysor_prior_seg} """ +rule patch_segmentation_comseg: + input: + patches_file = paths.smk_patches_file_comseg, + baysor_patch = paths.smk_comseg_temp_dir / "{index}", + output: + paths.smk_comseg_temp_dir / "{index}" / "segmentation_polygons.json", + paths.smk_comseg_temp_dir / "{index}" / "segmentation_counts.h5ad", + conda: + "sopa" + resources: + mem_mb=128_000, + shell: + """ + sopa segmentation comseg {paths.sdata_path} --patch-dir {paths.smk_comseg_temp_dir} --patch-index {wildcards.index} + """ + def get_input_resolve(name, dirs=False): def _(wilcards): with getattr(checkpoints, f"patchify_{name}").get(**wilcards).output.patches_file.open() as f: @@ -141,9 +179,38 @@ rule resolve_baysor: rm -r {paths.smk_baysor_temp_dir} # cleanup large baysor files """ +rule resolve_comseg: + input: + files = get_input_resolve("comseg"), + dirs = get_input_resolve("comseg",dirs=True), + output: + touch(paths.smk_comseg_boundaries), + touch(paths.smk_table), + conda: + "sopa" + params: + args_patches_dirs=lambda _, input: " ".join(f"--patches-dirs {directory}" for directory in input.dirs), + args_min_area=args.min_area("comseg"), + shell: + """ + sopa resolve comseg {paths.sdata_path} --gene-column {args.gene_column} {params.args_min_area} {params.args_patches_dirs} + + rm -r {paths.smk_comseg_temp_dir} # cleanup large comseg files + """ + +def get_smk_boundaries(args): + if args.baysor: + return paths.smk_baysor_boundaries + elif args.comseg: + return paths.smk_comseg_boundaries + elif args.cellpose: + return paths.smk_cellpose_boundaries + else: + raise ValueError("No segmentation method selected") + rule aggregate: input: - paths.smk_baysor_boundaries if args.baysor else paths.smk_cellpose_boundaries, + get_smk_boundaries(args), output: touch(paths.smk_aggregation), conda: @@ -194,7 +261,7 @@ rule image_write: rule report: input: - paths.smk_baysor_boundaries if args.baysor else paths.smk_cellpose_boundaries, + get_smk_boundaries(args), paths.smk_aggregation, paths.annotations if args.annotate else [], output: @@ -208,7 +275,7 @@ rule report: rule explorer: input: - paths.smk_baysor_boundaries if args.baysor else paths.smk_cellpose_boundaries, + get_smk_boundaries(args), paths.smk_aggregation, paths.annotations if args.annotate else [], output: diff --git a/workflow/config/cosmx/comseg_cellpose.yaml b/workflow/config/cosmx/comseg_cellpose.yaml new file mode 100644 index 00000000..7d2914f2 --- /dev/null +++ b/workflow/config/cosmx/comseg_cellpose.yaml @@ -0,0 +1,50 @@ +# For parameters details, see this commented example: https://github.com/gustaveroussy/sopa/blob/master/workflow/config/example_commented.yaml +read: + technology: cosmx + +patchify: + patch_width_pixel: 6000 + patch_overlap_pixel: 150 + patch_width_microns: 800 + patch_overlap_microns: 50 + +segmentation: + cellpose: + diameter: 60 + channels: ["DNA"] + flow_threshold: 2 + cellprob_threshold: -6 + min_area: 2000 + + comseg: + min_area: 10 + config: + dict_scale: + x: 1 + y: 1 + z: 1 + mean_cell_diameter: 8 + max_cell_radius: 15 + alpha: 0.5 + min_rna_per_cell: 5 + gene_column: "genes" + allow_disconnected_polygon : true + +aggregate: + average_intensities: true + min_transcripts: 10 # [optional] cells whose transcript count is below that this threshold are filtered + +# Comment this out if you want to use tangram --> + +# annotation: +# method: tangram +# args: +# sc_reference_path: "..." +# cell_type_key: ct + +explorer: + gene_column: "target" + ram_threshold_gb: 4 + +executables: + baysor: ~/.julia/bin/baysor # if you run baysor, put here the path to the baysor executable diff --git a/workflow/config/merscope/comseg_cellpose.yaml b/workflow/config/merscope/comseg_cellpose.yaml new file mode 100644 index 00000000..36db7268 --- /dev/null +++ b/workflow/config/merscope/comseg_cellpose.yaml @@ -0,0 +1,52 @@ +# For parameters details, see this commented example: https://github.com/gustaveroussy/sopa/blob/master/workflow/config/example_commented.yaml +read: + technology: merscope + +patchify: + patch_width_pixel: 6000 + patch_overlap_pixel: 150 + patch_width_microns: 1000 + patch_overlap_microns: 20 + +segmentation: + cellpose: + diameter: 60 + channels: ["DAPI"] + flow_threshold: 2 + cellprob_threshold: -6 + min_area: 2000 + + comseg: + min_area: 10 + config: + dict_scale: + x: 1 + y: 1 + z: 1 + mean_cell_diameter: 15 + max_cell_radius: 50 + alpha: 0.5 + min_rna_per_cell: 5 + gene_column: "genes" + allow_disconnected_polygon : true + + +aggregate: + average_intensities: true + min_transcripts: 10 # [optional] cells whose transcript count is below that this threshold are filtered + +# Comment this out if you want to use tangram --> + +# annotation: +# method: tangram +# args: +# sc_reference_path: "..." +# cell_type_key: ct + +explorer: + gene_column: "gene" + ram_threshold_gb: 4 + pixel_size: 0.108 + +executables: + baysor: ~/.julia/bin/baysor # if you run baysor, put here the path to the baysor executable diff --git a/workflow/config/toy/uniform_comseg.yaml b/workflow/config/toy/uniform_comseg.yaml new file mode 100644 index 00000000..662d8f52 --- /dev/null +++ b/workflow/config/toy/uniform_comseg.yaml @@ -0,0 +1,51 @@ +# For parameters details, see this commented example: https://github.com/gustaveroussy/sopa/blob/master/workflow/config/example_commented.yaml +read: + technology: uniform + +patchify: + patch_width_pixel: 1200 + patch_overlap_pixel: 50 + patch_width_microns: 3000 + patch_overlap_microns: 40 + +segmentation: + cellpose: + diameter: 35 + channels: [ "DAPI" ] + flow_threshold: 2 + cellprob_threshold: -6 + min_area: 2500 + + comseg: + min_area: 10 + config: + dict_scale: + x: 1 + y: 1 + z: 1 + mean_cell_diameter: 8 + max_cell_radius: 15 + alpha: 0.5 + min_rna_per_cell: 5 + gene_column: "genes" + + +aggregate: + average_intensities: true + gene_column: genes + +annotation: + method: fluorescence + args: + marker_cell_dict: + CK: Tumoral cell + CD3: T cell + CD20: B cell + +explorer: + gene_column: "genes" + ram_threshold_gb: 4 + pixel_size: 1 + +executables: + baysor: ~/.julia/bin/baysor diff --git a/workflow/config/xenium/comseg.yaml b/workflow/config/xenium/comseg.yaml new file mode 100644 index 00000000..579e4ff0 --- /dev/null +++ b/workflow/config/xenium/comseg.yaml @@ -0,0 +1,40 @@ +# For parameters details, see this commented example: https://github.com/gustaveroussy/sopa/blob/master/workflow/config/example_commented.yaml +read: + technology: uniform + +patchify: + patch_width_microns: 400 + patch_overlap_microns: 50 + min_transcripts_per_patch : 4000 + min_cells_per_patch: 3 + shapes_key: "cell_boundaries" + + + +segmentation: + comseg: + min_area: 10 + config: + dict_scale: + x: 1 + y: 1 + z: 1 + mean_cell_diameter: 15 + max_cell_radius: 15 + k_nearest_neighbors: 5 + alpha: 0.5 + min_rna_per_cell: 10 + gene_column: "feature_name" + allow_disconnected_polygon : true + norm_vector: true # [optional] requires exeternal R package '"sctransform"' "feather" and "arrow" to be installed, otherwise set to false + + +aggregate: + average_intensities: true + min_transcripts: 10 # [optional] cells whose transcript count is below that this threshold are filtered + + +explorer: + gene_column: "feature_name" + ram_threshold_gb: 4 + pixel_size: 0.108 diff --git a/workflow/utils.py b/workflow/utils.py index f19de651..a7f1869e 100644 --- a/workflow/utils.py +++ b/workflow/utils.py @@ -61,10 +61,13 @@ def __init__(self, config: dict) -> None: self.smk_patches = self.sopa_cache / "patches" self.smk_patches_file_image = self.sopa_cache / "patches_file_image" self.smk_patches_file_baysor = self.sopa_cache / "patches_file_baysor" + self.smk_patches_file_comseg = self.sopa_cache / "patches_file_comseg" self.smk_cellpose_temp_dir = self.sopa_cache / "cellpose_boundaries" self.smk_baysor_temp_dir = self.sopa_cache / "baysor_boundaries" + self.smk_comseg_temp_dir = self.sopa_cache / "comseg_boundaries" self.smk_cellpose_boundaries = self.sopa_cache / "cellpose_boundaries_done" self.smk_baysor_boundaries = self.sopa_cache / "baysor_boundaries_done" + self.smk_comseg_boundaries = self.sopa_cache / "comseg_boundaries_done" self.smk_aggregation = self.sopa_cache / "aggregation" # annotation files @@ -106,6 +109,17 @@ def cells_paths(self, file_content: str, name, dirs: bool = False): for i in indices for file in BAYSOR_FILES ] + if name == "comseg": + indices = map(int, file_content.split()) + COMSEG_FILES = ["segmentation_polygons.json", "segmentation_counts.h5ad"] + + if dirs: + return [str(self.smk_comseg_temp_dir / str(i)) for i in indices] + return [ + str(self.smk_comseg_temp_dir / str(i) / file) + for i in indices + for file in COMSEG_FILES + ] class Args: @@ -123,6 +137,11 @@ def __init__(self, paths: WorkflowPaths, config: dict): # which segmentation method(s) is/are used self.cellpose = self.segmentation and "cellpose" in self.config["segmentation"] self.baysor = self.segmentation and "baysor" in self.config["segmentation"] + self.comseg = ( + self.segmentation + and "comseg" in self.config["segmentation"] + and "cellpose" in self.config["segmentation"] + ) # whether to run annotation self.annotate = "annotation" in self.config and "method" in self.config["annotation"] @@ -197,7 +216,17 @@ def baysor_prior_seg(self): @property def gene_column(self): - return self.config["segmentation"]["baysor"]["config"]["data"]["gene"] + if "baysor" in self.config["segmentation"]: + return self.config["segmentation"]["baysor"]["config"]["data"]["gene"] + elif "comseg" in self.config["segmentation"]: + return self.config["segmentation"]["comseg"]["config"]["gene_column"] + else: + raise ValueError("No gene column found in the config") + + ### ComSeg related methods + + def dump_comseg_patchify(self): + return f'--comseg-temp-dir {self.paths.smk_comseg_temp_dir} {self["segmentation"]["comseg"].where(keys=["cell_key", "unassigned_value", "config"])}' def stringify_for_cli(value) -> str: