-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from openproblems-bio/add_pciseq
Add pciseq
- Loading branch information
Showing
6 changed files
with
331 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,4 +18,6 @@ singularity_container/ | |
/resources | ||
/.vscode | ||
/.nextflow* | ||
/work | ||
/work | ||
|
||
.DS_STORE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,4 +50,4 @@ info: | |
- type: string | ||
name: region | ||
description: Region | ||
required: true | ||
required: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
165 changes: 165 additions & 0 deletions
165
src/methods_transcript_assignment/pciSeq_transcript_assignment/config.vsh.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
__merge__: /src/api/comp_method_transcript_assignment.yaml | ||
|
||
name: pciseq_transcript_assignment | ||
label: "pciSeq Transcript Assignment" | ||
summary: "Assign transcripts to cells using the pciSeq method from Qian et al. (2020)" | ||
description: "Uses a reference sc-RNAseq dataset to probabilistically assign cell types and transcripts to cells." | ||
links: | ||
documentation: "https://github.com/acycliq/pciSeq" | ||
repository: "https://github.com/acycliq/pciSeq" | ||
references: | ||
doi: "10.1038/s41592-019-0631-4" | ||
|
||
arguments: | ||
- name: --transcripts_key | ||
type: string | ||
description: The key of the transcripts within the points of the spatial data | ||
default: transcripts | ||
- name: --coordinate_system | ||
type: string | ||
description: The key of the pixel space coordinate system within the spatial data | ||
default: global | ||
# - name: --sc_cell_type_key | ||
# type: string | ||
# default: cell_type | ||
# required: true | ||
# direction: input | ||
# description: The name of column in the SC-RNAseq AnnData .obs with the cell type of each cell | ||
|
||
# - name: --exclude_genes | ||
# type: string | ||
# required: false | ||
# description: "list of genes to be excluded during cell-typing, e.g ['Aldoc', 'Id2'] to exclude all spots from Aldoc and Id2" | ||
# direction: input | ||
# default: None | ||
|
||
- name: --max_iter | ||
type: integer | ||
required: false | ||
description: "Maximum number of loops allowed for the Variational Bayes to run" | ||
direction: input | ||
default: 1000 | ||
|
||
- name: --CellCallTolerance | ||
type: double | ||
required: false | ||
description: "Convergence achieved if assignment probabilities between two successive loops is less than the tolerance" | ||
direction: input | ||
default: 0.02 | ||
|
||
- name: --rGene | ||
type: double | ||
required: false | ||
description: | | ||
"A gamma distribution expresses the efficiency of the in-situ sequencing for each gene. It tries to capture | ||
the ratio of the observed over the theoretical counts for a given gene. rGene controls the variance and | ||
Inefficiency is the average of this assumed Gamma distribution" | ||
direction: input | ||
default: 20 | ||
|
||
- name: --Inefficiency | ||
type: double | ||
required: false | ||
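description: "Average of the assumed Gamma distribution that expresses the per-gene efficiency of the in-situ sequencing (see rGene, which controls its variance)" | ||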
direction: input | ||
default: 0.2 | ||
|
||
- name: --InsideCellBonus | ||
type: double | ||
required: false | ||
description: | | ||
"If a spot is inside the cell boundaries this bonus will give the likelihood an extra boost | ||
in order to make the spot more probable to get assigned to the cell than another spot positioned | ||
outside the cell boundaries" | ||
direction: input | ||
default: 2 | ||
|
||
- name: --MisreadDensity | ||
type: double | ||
required: false | ||
description: | | ||
"To account for spots far from the soma a uniform distribution is introduced to describe those misreads. | ||
By default this uniform distribution has a density of 1e-5 misreads per pixel." | ||
direction: input | ||
default: 0.00001 | ||
|
||
- name: --SpotReg | ||
type: double | ||
required: false | ||
description: | | ||
"Gene detection might come with irregularities due to technical errors. A small value is introduced | ||
here to account for these errors. It is an additive factor, applied to the single cell expression | ||
counts when the mean counts per class and per gene are calculated." | ||
direction: input | ||
default: 0.1 | ||
|
||
- name: --nNeighbors | ||
type: integer | ||
required: false | ||
description: | | ||
"By default only the 3 nearest cells will be considered as possible parent cells for any given spot. | ||
There is also one extra 'super-neighbor', which is always a neighbor to the spots so we can assign | ||
the misreads to. Could be seen as the background. Hence, by default the algorithm examines | ||
whether any of the 3 nearest cells is a possible parent cell to a given spot or whether the spot is | ||
a misread" | ||
direction: input | ||
default: 3 | ||
|
||
# | ||
- name: --rSpot | ||
type: double | ||
required: false | ||
description: | | ||
"A gamma distributed variate from Gamma(rSpot, 1) is applied to the mean expression, hence the counts | ||
are distributed according to a Negative Binomial distribution. | ||
The value for rSpot will control the variance/dispersion of the counts" | ||
direction: input | ||
default: 2 | ||
|
||
- name: --save_data | ||
type: boolean | ||
required: false | ||
description: "Boolean, if True the output will be saved as tsv files in a folder named 'pciSeq' in your system's temp dir." | ||
direction: input | ||
default: False | ||
|
||
# output directory 'default' will save to temp location | ||
# - name: output_path | ||
# default: ['default'] | ||
|
||
# | ||
# - name: --dtype | ||
# type: string | ||
# required: false | ||
# description: | | ||
# "Use either np.float16 or np.float32 to reduce memory usage. In most cases RAM consumption shouldn't | ||
# need more than 32Gb RAM. If you have a dataset from a full coronal mouse slice with a high number of | ||
# segmented cells (around 150,000) a gene panel of more than 250 genes and 100 or more different | ||
# cell types (aka clusters, aka classes) in the single cell data then you might need at least 64GB on | ||
# your machine. Changing the datatype to a float16 or float32 will help keeping RAM usage to a lower | ||
# level" | ||
# direction: input | ||
# default: np.float64 | ||
|
||
|
||
|
||
resources: | ||
- type: python_script | ||
path: script.py | ||
|
||
engines: | ||
- type: docker | ||
image: openproblems/base_python:1.0.0 | ||
__merge__: | ||
- /src/base/setup_spatialdata_partial.yaml | ||
- /src/base/setup_txsim_partial.yaml | ||
setup: | ||
- type: python | ||
pypi: [pciseq] | ||
- type: native | ||
|
||
runners: | ||
- type: executable | ||
- type: nextflow | ||
directives: | ||
label: [ midtime, midcpu, midmem ] |
143 changes: 143 additions & 0 deletions
143
src/methods_transcript_assignment/pciSeq_transcript_assignment/script.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
# pciSeq transcript assignment component.
#
# Reads the iST SpatialData object and the segmentation, passes the transcript
# positions, the label image and the scRNA-seq reference to
# `tx.preprocessing.run_pciSeq`, and writes a SpatialData object containing
#   - the transcripts, with an added `cell_id` column, and
#   - a table with one row per cell (`cell_id`, `cell_type`, `cell_type_prob`).
import numpy as np
import dask
import spatialdata as sd
import txsim as tx
import anndata as ad
import os
import shutil

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
    'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
    'transcripts_key': 'transcripts',
    'coordinate_system': 'global',
    'output': '../pciSeq_assigned_transcripts.zarr',

    'input_scrnaseq': 'resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad',
    'sc_cell_type_key': 'cell_type',

    'exclude_genes': None,
    'max_iter': 1000,
    'CellCallTolerance': 0.02,
    'rGene': 20,
    'Inefficiency': 0.2,
    'InsideCellBonus': 2,
    'MisreadDensity': 0.00001,
    'SpotReg': 0.1,
    'nNeighbors': 3,
    'rSpot': 2,
    'save_data': False,
    'dtype': np.float64
}
meta = {
    'name': 'pciSeq_transcript_assignment'
}
## VIASH END

# Read input
print('Reading input files', flush=True)
sdata = sd.read_zarr(par['input_ist'])
sdata_segm = sd.read_zarr(par['input_segmentation'])

# The points element holding the transcripts; looked up once and reused below.
points = sdata[par['transcripts_key']]

# Check if coordinate system is available in input data.
# Explicit raises (rather than `assert`) so the checks are not stripped under `python -O`.
transcripts_coord_systems = sd.transformations.get_transformation(points, get_all=True).keys()
if par['coordinate_system'] not in transcripts_coord_systems:
    raise ValueError(f"Coordinate system '{par['coordinate_system']}' not found in input data.")
segmentation_transformations = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)
if par['coordinate_system'] not in segmentation_transformations.keys():
    raise ValueError(f"Coordinate system '{par['coordinate_system']}' not found in input data.")

# Transform transcript coordinates to the coordinate system
print('Transforming transcripts coordinates', flush=True)
transcripts = sd.transform(points, to_coordinate_system=par['coordinate_system'])

# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
trans = segmentation_transformations[par['coordinate_system']].inverse()
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])

# Assign cell ids to transcripts
print('Assigning transcripts to cell ids', flush=True)
y_coords = transcripts.y.compute().to_numpy()
x_coords = transcripts.x.compute().to_numpy()

# Added for pciSeq: build the spots table (gene name + transformed x/y).
# Only the gene column is selected *before* computing, so the remaining point
# columns are never materialised and x/y are added to a freshly computed frame.
# TODO this will immediately break when the name of the gene isn't feature_name
transcripts_dataframe = points[['feature_name']].compute()
transcripts_dataframe['x'] = x_coords
transcripts_dataframe['y'] = y_coords

# same as before
label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy()  # TODO: maybe this line needs generalization (DataTree vs DataArray)

# Grab all the pciSeq parameters
opts_keys = [
    # 'exclude_genes',
    'max_iter',
    'CellCallTolerance',
    'rGene',
    'Inefficiency',
    'InsideCellBonus',
    'MisreadDensity',
    'SpotReg',
    'nNeighbors',
    'rSpot',
    'save_data',
]

opts = {k: par[k] for k in opts_keys}

# The reference is handed to pciSeq with the 'counts' layer as its expression matrix.
input_scrnaseq = ad.read_h5ad(par['input_scrnaseq'])
input_scrnaseq.X = input_scrnaseq.layers['counts']

assignments, cell_types = tx.preprocessing.run_pciSeq(
    transcripts_dataframe,
    label_image,
    input_scrnaseq,
    par['sc_cell_type_key'],
    opts
)

# assign transcript -> cell
# The chunks mirror the partition lengths of the points element so the new
# column lines up with its existing partitions.
# NOTE(review): `dask.array` / `dask.dataframe` are reached as attributes of
# `dask`; presumably they are already loaded as a side effect of importing
# spatialdata - confirm before reordering or removing imports.
partition_lengths = tuple(points.map_partitions(len).compute())
cell_id_dask_series = dask.dataframe.from_dask_array(
    dask.array.from_array(
        assignments['cell'].to_numpy(), chunks=partition_lengths
    ),
    index=points.index
)

sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series

# create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
cell_types['type'] = cell_types['type'].replace({'None': 'None_sp'})
cell_types.insert(0, 'cell_id', cell_types.index)
cell_types.rename(columns={'type': 'cell_type', 'prob': 'cell_type_prob'}, inplace=True)

# Compare the column *values*: `0 in series` would test the index labels instead.
if (cell_types['cell_id'] == 0).any():
    raise ValueError("Found '0' in cell_id column of assingment output cell matrix")

output_table = ad.AnnData(
    obs=cell_types[['cell_id', 'cell_type', 'cell_type_prob']],
    var=sdata.tables["table"].var[[]]
)

# TODO: Also take care of the following cases:
# - segmentation 3D, transcripts 3D
# - segmentation 3D, transcripts 2D
# - segmentation 2D, transcripts 3D

# Subset sdata to transcripts with cell ids

print('Subsetting to transcripts cell id and cell type data', flush=True)
sdata_transcripts_only = sd.SpatialData(
    points={
        "transcripts": sdata[par['transcripts_key']]
    },
    tables={
        "table": output_table
    }
)

print('Write transcripts with cell ids and cell types', flush=True)
# Remove any previous output first so the write starts from a clean path.
if os.path.exists(par["output"]):
    shutil.rmtree(par["output"])
sdata_transcripts_only.write(par['output'])
|
||
|