-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add dataloader for vizgen's merscope datasets
- Loading branch information
Showing
3 changed files
with
366 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
name: vizgen_merscope | ||
namespace: datasets/loaders | ||
|
||
argument_groups: | ||
- name: Inputs | ||
arguments: | ||
- type: string | ||
name: --gcloud_bucket | ||
default: "vz-ffpe-showcase" | ||
description: "Google cloud bucket name" | ||
- type: string | ||
name: --dataset_bucket_name | ||
default: "HumanBreastCancerPatient1" | ||
description: "The name of the dataset base directory in the vizgen google cloud bucket" | ||
- type: string | ||
name: --segmentation_id | ||
default: ["cell"] | ||
description: The segmentation identifier | ||
multiple: true | ||
- name: Metadata | ||
arguments: | ||
- type: string | ||
name: --id | ||
description: "A unique identifier for the dataset" | ||
required: true | ||
- name: --dataset_name | ||
type: string | ||
description: Nicely formatted name. | ||
required: true | ||
- type: string | ||
name: --dataset_url | ||
description: Link to the original source of the dataset. | ||
required: false | ||
- name: --dataset_reference | ||
type: string | ||
description: Bibtex reference of the paper in which the dataset was published. | ||
required: false | ||
- name: --dataset_summary | ||
type: string | ||
description: Short description of the dataset. | ||
required: true | ||
- name: --dataset_description | ||
type: string | ||
description: Long description of the dataset. | ||
required: true | ||
- name: --dataset_organism | ||
type: string | ||
description: The organism of the sample in the dataset. | ||
required: false | ||
- name: Outputs | ||
arguments: | ||
- name: "--output" | ||
__merge__: /src/api/file_common_ist.yaml | ||
direction: output | ||
required: true | ||
|
||
resources: | ||
- type: python_script | ||
path: script.py | ||
- path: download.sh | ||
|
||
engines: | ||
- type: docker | ||
image: openproblems/base_python:1.0.0 | ||
__merge__: | ||
- /src/base/setup_spatialdata_partial.yaml | ||
setup: | ||
- type: python | ||
pypi: | ||
- spatialdata-io | ||
- type: native | ||
|
||
runners: | ||
- type: executable | ||
- type: nextflow | ||
directives: | ||
label: [highmem, midcpu, midtime] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#!/bin/bash | ||
|
||
###################################################################### | ||
# Requirements: | ||
# - Install google-cloud-sdk, e.g. mamba install -c conda-forge google-cloud-sdk | ||
# - Authenticate with your Google account: gcloud auth login --no-browser | ||
# | ||
# Usage: merfish_download.sh <BUCKET_NAME> <DATASET> <OUT_DIR> <DRY_RUN=true/false> | ||
# - DATASET refers to the naming in the Vizgen download page | ||
# | ||
# Vizgen cancer panel datasets download page: https://console.cloud.google.com/storage/browser/vz-ffpe-showcase | ||
# The script should also work for other Vizgen style data buckets. | ||
|
||
BUCKET_NAME=$1 | ||
DATASET=$2 | ||
OUT_DIR=$3 | ||
DRY_RUN=$4 | ||
|
||
###################################################################### | ||
|
||
# Suppress BrokenPipeError messages from Python | ||
export PYTHONWARNINGS="ignore:BrokenPipeError" | ||
|
||
LIST_LIMIT=10 # Maximum number of files to list in dry-run mode | ||
|
||
# List of files or directories to download | ||
FILES=( | ||
"cell_by_gene.csv" | ||
"cell_metadata.csv" | ||
"detected_transcripts.csv" | ||
"cell_boundaries/" # This is a directory | ||
"cell_boundaries.parquet" | ||
"images/mosaic_DAPI_z3.tif" # If you want all images, change this to "images/" | ||
) | ||
|
||
# Check if gsutil is installed | ||
if ! command -v gsutil &> /dev/null; then | ||
echo "ERROR: gsutil is not installed. Please install the Google Cloud SDK." | ||
fi | ||
|
||
# Check if user is authenticated | ||
if ! gcloud auth list 2>&1 | grep -q ACTIVE; then | ||
echo "ERROR: No authenticated Google Cloud account found. Please authenticate using 'gcloud auth login'." | ||
fi | ||
|
||
|
||
# Loop over each file or directory to download | ||
for FILE in "${FILES[@]}"; do | ||
# Get the directory from the file path (remove the file name) | ||
DIR=$(dirname "$FILE") | ||
|
||
# Create the full target directory path | ||
TARGET_DIR="$OUT_DIR/$DIR" | ||
|
||
# If DIR is '.', set TARGET_DIR to OUT_DIR | ||
if [ "$DIR" = "." ]; then | ||
TARGET_DIR="$OUT_DIR" | ||
fi | ||
|
||
# Create the directory if it doesn't exist | ||
mkdir -p "$TARGET_DIR" | ||
|
||
# Check if the file or directory exists in the bucket | ||
if gsutil ls -d "gs://$BUCKET_NAME/$DATASET/$FILE" > /dev/null 2>&1; then | ||
if [ "$DRY_RUN" = true ]; then | ||
# Dry-run: list files instead of downloading | ||
echo -e "\nSimulating download for gs://$BUCKET_NAME/$DATASET/$FILE" | ||
echo -e "Download to $TARGET_DIR" | ||
|
||
if [[ "$FILE" == */ ]]; then | ||
echo "Listing first $LIST_LIMIT files from gs://$BUCKET_NAME/$DATASET/$FILE" | ||
gsutil ls "gs://$BUCKET_NAME/$DATASET/$FILE" 2>/dev/null | head -n $LIST_LIMIT | ||
else | ||
gsutil ls "gs://$BUCKET_NAME/$DATASET/$FILE" | ||
fi | ||
else | ||
# If it's a directory, download recursively | ||
if [[ "$FILE" == */ ]]; then | ||
echo "Downloading directory gs://$BUCKET_NAME/$DATASET/$FILE to $TARGET_DIR/" | ||
gsutil -m cp -r "gs://$BUCKET_NAME/$DATASET/$FILE" "$TARGET_DIR/" | ||
else | ||
# Download the file, maintaining directory structure under OUT_DIR | ||
echo "Downloading file gs://$BUCKET_NAME/$DATASET/$FILE to $TARGET_DIR/" | ||
gsutil cp "gs://$BUCKET_NAME/$DATASET/$FILE" "$TARGET_DIR/" | ||
fi | ||
fi | ||
else | ||
# Skip download if file or directory does not exist | ||
echo "Skipping $FILE: does not exist in gs://$BUCKET_NAME/$DATASET/" | ||
fi | ||
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
# https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard | ||
|
||
import os | ||
import stat | ||
from pathlib import Path | ||
import subprocess | ||
from datetime import datetime | ||
import spatialdata as sd | ||
import spatialdata_io as sdio | ||
|
||
## VIASH START | ||
par = { | ||
"gcloud_bucket": "vz-ffpe-showcase", | ||
"dataset_bucket_name": "HumanBreastCancerPatient1", | ||
"segmentation_id": ["cell"], | ||
"output": "output.zarr", | ||
"id": "vizgen_merscope/2022_vizgen_human_breast_cancer_merfish/rep1", | ||
#"dataset_id": "2022Vizgen_human_breast_cancer_Patient1", | ||
"dataset_name": "value", | ||
"dataset_url": "https://vizgen.com/data-release-program/", | ||
"dataset_reference": "value", | ||
"dataset_summary": "value", | ||
"dataset_description": "value", | ||
"dataset_organism": "human", | ||
} | ||
|
||
|
||
## VIASH END | ||
|
||
assert ("cell" in par["segmentation_id"]) and (len(par["segmentation_id"]) == 1), "Currently cell labels are definitely assigned in this script. And merscope does not provide other segmentations." | ||
|
||
|
||
t0 = datetime.now() | ||
|
||
# Download raw data | ||
print(datetime.now() - t0, "Download raw data", flush=True) | ||
download_script = f"{meta['resources_dir']}/download.sh" #https://viash.io/guide/component/variables.html#meta-variables-in-meta | ||
BUCKET_NAME = par['gcloud_bucket'] | ||
DATASET = par["dataset_bucket_name"] | ||
OUT_DIR = meta["temp_dir"] | ||
DRY_RUN = "false" | ||
|
||
# Change permissions of the download script to make it executable | ||
os.chmod(download_script, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) # Grant read, write, and execute permissions to the user | ||
|
||
# Run the download script | ||
out = subprocess.run([download_script, BUCKET_NAME, DATASET, OUT_DIR, DRY_RUN], check=True, capture_output=True, text=True) | ||
print(out.stdout, flush=True) | ||
|
||
# If the cell polygons are in the old format (cell_boundaries/*.hdf5 instead of cell_boundaries.parquet) the raw data | ||
# needs to be modified for the spatialdata-io loader | ||
# (see: https://github.com/scverse/spatialdata-io/issues/71#issuecomment-1741995582) | ||
import h5py | ||
import pandas as pd | ||
import geopandas as gpd | ||
from shapely.geometry import Polygon | ||
from shapely import MultiPolygon | ||
import os | ||
|
||
def read_boundary_hdf5(folder): | ||
print("Convert boundary hdf5 to parquet", flush=True) | ||
all_boundaries = {} | ||
boundaries = None | ||
hdf5_files = os.listdir(folder + '/cell_boundaries/') | ||
n_files = len(hdf5_files) | ||
incr = n_files // 15 | ||
for _,i in enumerate(hdf5_files): | ||
if (_ % incr) == 0: | ||
print(f"\tProcessed {_}/{n_files}", flush=True) | ||
with h5py.File(folder + '/cell_boundaries/' + i, "r") as f: | ||
for key in f['featuredata'].keys(): | ||
if boundaries is not None: | ||
boundaries.loc[key] = MultiPolygon([Polygon(f['featuredata'][key]['zIndex_3']['p_0']['coordinates'][()][0])]) # doesn't matter which zIndex we use, MultiPolygon to work with read function in spatialdata-io | ||
else: | ||
#boundaries = gpd.GeoDataFrame(index=[key], geometry=MultiPolygon([Polygon(f['featuredata'][key]['zIndex_3']['p_0']['coordinates'][()][0])])) | ||
boundaries = gpd.GeoDataFrame(geometry=gpd.GeoSeries(MultiPolygon([Polygon(f['featuredata'][key]['zIndex_3']['p_0']['coordinates'][()][0])]),index=[key])) | ||
all_boundaries[i] = boundaries | ||
boundaries = None | ||
all_concat = pd.concat(list(all_boundaries.values())) | ||
all_concat = all_concat[~all_concat.index.duplicated(keep='first')] # hdf5 can contain duplicates with same cell_id and position, removing those | ||
all_concat.rename_geometry('geometry_renamed', inplace=True) # renaming to make it compatible with spatialdata-io | ||
all_concat["EntityID"] = all_concat.index # renaming to make it compatible with spatialdata-io | ||
all_concat['ZIndex'] = 0 # adding to make it compatible with spatialdata-io | ||
all_concat.to_parquet(folder + '/cell_boundaries.parquet') | ||
|
||
count_path = f"{folder}/cell_by_gene.csv" | ||
obs_path = f"{folder}/cell_metadata.csv" | ||
|
||
data = pd.read_csv(count_path, index_col=0) | ||
obs = pd.read_csv(obs_path, index_col=0) | ||
|
||
data.index = obs.index.astype(str) # data index in old software is range(n_obs) | ||
data.index.name = "cell" # renaming to make it compatible with spatialdata-io | ||
obs.index.name = 'EntityID' # renaming to make it compatible with spatialdata-io | ||
data.to_csv(count_path) | ||
obs.to_csv(obs_path) | ||
|
||
if not (Path(OUT_DIR) / "cell_boundaries.parquet").exists(): | ||
read_boundary_hdf5(str(OUT_DIR)) | ||
|
||
|
||
# Generate spatialdata.zarr | ||
RAW_DATA_DIR = Path(OUT_DIR) | ||
|
||
######################################### | ||
# Convert raw files to spatialdata zarr # | ||
######################################### | ||
print(datetime.now() - t0, "Convert raw files to spatialdata zarr", flush=True) | ||
|
||
slide_name = "slide" | ||
|
||
sdata = sdio.merscope( | ||
RAW_DATA_DIR, | ||
vpt_outputs=None, | ||
z_layers=3, | ||
region_name=None, | ||
slide_name=slide_name, | ||
backend=None, | ||
transcripts=True, | ||
cells_boundaries=True, | ||
cells_table=True, | ||
mosaic_images=True, | ||
#imread_kwargs=mappingproxy({}), | ||
#image_models_kwargs=mappingproxy({}) | ||
) | ||
|
||
############### | ||
# Rename keys # | ||
############### | ||
print(datetime.now() - t0, "Rename keys", flush=True) | ||
|
||
name = slide_name + "_" + RAW_DATA_DIR.name | ||
|
||
elements_renaming_map = { | ||
f"{name}_z3" : "morphology_mip", #TODO: that is actually not the morphology_mip, i.e. either we should rename the label later, or we should actually project over z. But we also want to have 3d at some point anyway | ||
f"{name}_transcripts" : "transcripts", | ||
f"{name}_polygons" : "cell_polygons", | ||
f"table" : "metadata", | ||
} | ||
|
||
for old_key, new_key in elements_renaming_map.items(): | ||
sdata[new_key] = sdata[old_key] | ||
del sdata[old_key] | ||
|
||
# Rename transcript column | ||
sdata['transcripts'] = sdata['transcripts'].rename(columns={"global_z":"z"}) | ||
|
||
######################################### | ||
# Throw out all channels except of DAPI # | ||
######################################### | ||
print(datetime.now() - t0, "Throw out all channels except of DAPI", flush=True) | ||
|
||
# TODO: in the future we want to keep PolyT and Cellbound1/2/3 stains. Note however, that somehow saving or plotting the sdata fails when | ||
# these channels aren't excluded, not sure why... | ||
sdata["morphology_mip"] = sdata["morphology_mip"].sel(c=["DAPI"]) | ||
|
||
################################# | ||
# Get cell labels from polygons # | ||
################################# | ||
print(datetime.now() - t0, "Get cell labels from polygons", flush=True) | ||
|
||
img_extent = sd.get_extent(sdata['morphology_mip']) | ||
# TODO: Just note that currently the rasterize function has a bug, this error is small though with the given spatial resolution. | ||
# Check https://github.com/scverse/spatialdata/issues/165 for updates on this bug. | ||
sdata["cell_labels"] = sd.rasterize( | ||
sdata["cell_polygons"], | ||
["x", "y"], | ||
min_coordinate=[int(img_extent["x"][0]), int(img_extent["y"][0])], | ||
max_coordinate=[int(img_extent["x"][1]), int(img_extent["y"][1])], | ||
target_coordinate_system="global", | ||
target_unit_to_pixels=1, | ||
return_regions_as_labels=True, | ||
) | ||
del sdata["cell_labels"].attrs['label_index_to_category'] | ||
|
||
############################## | ||
# Add info to metadata table # | ||
############################## | ||
print(datetime.now() - t0, "Add info to metadata table", flush=True) | ||
|
||
#TODO: values as input variables | ||
sdata["metadata"].uns["dataset_id"] = par["id"] | ||
sdata["metadata"].uns["dataset_name"] = par["dataset_name"] | ||
sdata["metadata"].uns["dataset_url"] = par["dataset_url"] | ||
sdata["metadata"].uns["dataset_reference"] = par["dataset_reference"] | ||
sdata["metadata"].uns["dataset_summary"] = par["dataset_summary"] | ||
sdata["metadata"].uns["dataset_description"] = par["dataset_description"] | ||
sdata["metadata"].uns["dataset_organism"] = par["dataset_organism"] | ||
sdata["metadata"].uns["segmentation_id"] = par["segmentation_id"] | ||
|
||
######### | ||
# Write # | ||
######### | ||
print(datetime.now() - t0, f"Writing to {par['output']}", flush=True) | ||
|
||
sdata.write(par["output"]) | ||
|
||
print(datetime.now() - t0, "Done", flush=True) |