Standardize conversion workflow #369

Draft · wants to merge 23 commits into base: dev
21 changes: 12 additions & 9 deletions conf/modules.config
@@ -32,25 +32,28 @@ process {
     }

     if (!params.skip_emptydrops) {
-        withName: EMPTYDROPS_CELL_CALLING {
+        withName: 'CELLBENDER_REMOVEBACKGROUND' {
             publishDir = [
-                path: { "${params.outdir}/${params.aligner}" },
-                mode: params.publish_dir_mode,
-                saveAs: { filename ->
-                    if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}"
-                    else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}"
-                    else "${meta.id}/${filename}"
-                }
+                path: { "${params.outdir}/${params.aligner}/${meta.id}/emptydrops_filter" },
+                mode: params.publish_dir_mode
             ]
         }
+        withName: 'ADATA_BARCODES' {
+            ext.prefix = { "${meta.id}_custom_emptydrops_filter_matrix" }
+            publishDir = [
+                path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" },
+                mode: params.publish_dir_mode
+            ]
+        }
     }

-    withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' {
+    withName: 'MTX_TO_H5AD*|CONCAT_H5AD|ANNDATAR_CONVERT' {
         publishDir = [
             path: { "${params.outdir}/${params.aligner}/mtx_conversions" },
             mode: params.publish_dir_mode
         ]
     }

     withName: 'GTF_GENE_FILTER' {
         publishDir = [
             path: { "${params.outdir}/gtf_filter" },
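For orientation (not part of the diff): the publishDir rules above imply the output layout sketched below. The aligner and sample names are made up for illustration.

from pathlib import Path

# hypothetical values, for illustration only
outdir, aligner, sample = Path("results"), "star", "sample_1"

print(outdir / aligner / sample / "emptydrops_filter")   # CELLBENDER_REMOVEBACKGROUND results
print(outdir / aligner / "mtx_conversions" / sample)     # ADATA_BARCODES results
print(outdir / aligner / "mtx_conversions")              # MTX_TO_H5AD* / CONCAT_H5AD / ANNDATAR_CONVERT results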
5 changes: 5 additions & 0 deletions modules.json
@@ -5,6 +5,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "cellbender/removebackground": {
+                        "branch": "master",
+                        "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
+                        "installed_by": ["modules"]
+                    },
                     "cellranger/count": {
                         "branch": "master",
                         "git_sha": "90dad5491658049282ceb287a3d7732c1ce39837",
23 changes: 23 additions & 0 deletions modules/local/adata_barcodes.nf
@@ -0,0 +1,23 @@
process ADATA_BARCODES {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'oras://community.wave.seqera.io/library/anndata:0.10.7--e9840a94592528c8':
        'community.wave.seqera.io/library/anndata:0.10.7--336c6c1921a0632b' }"

    input:
    tuple val(meta), path(h5ad), path(barcodes_csv)

    output:
    tuple val(meta), path("*.h5ad"), emit: h5ad
    path "versions.yml"            , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    prefix = task.ext.prefix ?: "${meta.id}"
    template 'barcodes.py'
}
24 changes: 24 additions & 0 deletions modules/local/anndatar_convert.nf
@@ -0,0 +1,24 @@
process ANNDATAR_CONVERT {
    tag "${meta.id}"

    label 'process_medium'

    container "fmalmeida/anndatar:dev" // TODO: Fix

    input:
    tuple val(meta), path(h5ad)

    output:
    tuple val(meta), path("${meta.id}_standardized.Rds"), emit: rds

    when:
    task.ext.when == null || task.ext.when

    script:
    template 'anndatar_convert.R'

    stub:
    """
    touch ${meta.id}_standardized.Rds
    """
}
17 changes: 6 additions & 11 deletions modules/local/concat_h5ad.nf
@@ -1,13 +1,13 @@
 process CONCAT_H5AD {
+    tag "${meta.id}"
+
     label 'process_medium'

-    conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' :
-        'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }"
+    conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg"
+    container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538"

     input:
-    tuple val(input_type), path(h5ad)
+    tuple val(meta), path(h5ad)
     path samplesheet

     output:
@@ -17,12 +17,7 @@ process CONCAT_H5AD {
     task.ext.when == null || task.ext.when

     script:
-    """
-    concat_h5ad.py \\
-        --input $samplesheet \\
-        --out combined_${input_type}_matrix.h5ad \\
-        --suffix "_matrix.h5ad"
-    """
+    template 'concat_h5ad.py'

     stub:
     """
35 changes: 35 additions & 0 deletions modules/local/mtx_to_h5ad_star.nf
@@ -0,0 +1,35 @@
process MTX_TO_H5AD_STAR {
    tag "$meta.id"
    label 'process_medium'

    conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg"
    container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538"

    input:
    tuple val(meta), path(inputs)
    path star_index

    output:
    tuple val(meta2), path("${meta.id}/*h5ad"), emit: h5ad
    path "versions.yml"                       , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Get a file to check the input type. Some aligners pass arrays instead of a single file.
    def input_to_check = (inputs instanceof String) ? inputs : inputs[0]

    // check whether the inputs are raw or filtered matrices
    input_type = (input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered'
    meta2 = meta + [input_type: input_type]

    template 'mtx_to_h5ad_star.py'

    stub:
    """
    mkdir ${meta.id}
    touch ${meta.id}/${meta.id}_matrix.h5ad
    touch versions.yml
    """
}
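As an aside (not part of the PR), the raw/filtered detection above simply keys off the substring 'raw' in the staged path. A minimal Python equivalent, using made-up STARsolo paths:

def classify_input(path: str) -> str:
    # mirror of the Groovy check: 'raw' anywhere in the path means the raw matrix
    return "raw" if "raw" in path else "filtered"

# hypothetical STARsolo output paths
assert classify_input("sample_1/Solo.out/Gene/raw/matrix.mtx.gz") == "raw"
assert classify_input("sample_1/Solo.out/Gene/filtered/matrix.mtx.gz") == "filtered"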
15 changes: 15 additions & 0 deletions modules/local/templates/anndatar_convert.R
@@ -0,0 +1,15 @@
#!/usr/bin/env Rscript

# Nextflow variables are interpolated directly into this template, e.g. "${meta.id}"

# load libraries
library(anndataR)

# read input
adata <- read_h5ad("${h5ad}")

# convert to Rds
obj <- adata\$to_Seurat()

# save files
saveRDS(obj, file = "${meta.id}_standardized.Rds")
44 changes: 44 additions & 0 deletions modules/local/templates/barcodes.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

import platform
import anndata as ad
import pandas as pd

def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Formats a dictionary to a YAML-like string.

    Args:
        data (dict): The dictionary to format.
        indent (int): The current indentation level.

    Returns:
        str: A string formatted as YAML.
    """
    yaml_str = ""
    for key, value in data.items():
        spaces = "  " * indent
        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str

df = pd.read_csv("${barcodes_csv}", header=None)
adata = ad.read_h5ad("${h5ad}")

adata = adata[df[0].values]

adata.write_h5ad("${prefix}.h5ad")

# Versions

versions = {
    "${task.process}": {
        "python": platform.python_version(),
        "anndata": ad.__version__,
        "pandas": pd.__version__
    }
}

with open("versions.yml", "w") as f:
    f.write(format_yaml_like(versions))
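A minimal sketch (not from the PR) of what the subsetting above does to an AnnData object, using a toy matrix and made-up barcodes:

import io

import anndata as ad
import numpy as np
import pandas as pd

# three cells, two genes
adata = ad.AnnData(
    X=np.ones((3, 2)),
    obs=pd.DataFrame(index=["AAACCCA-1", "AAACCCG-1", "AAACCCT-1"]),
)

# header-less barcode list, standing in for the CSV written by the empty-drops filtering step
barcodes = pd.read_csv(io.StringIO("AAACCCA-1\nAAACCCT-1\n"), header=None)

# keep only the listed barcodes, in the order given
filtered = adata[barcodes[0].values]
print(filtered.obs_names.tolist())  # ['AAACCCA-1', 'AAACCCT-1']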
25 changes: 6 additions & 19 deletions bin/concat_h5ad.py → modules/local/templates/concat_h5ad.py
@@ -7,7 +7,6 @@

 import scanpy as sc, anndata as ad, pandas as pd
 from pathlib import Path
-import argparse


 def read_samplesheet(samplesheet):
@@ -17,36 +16,24 @@ def read_samplesheet(samplesheet):
     # samplesheet may contain replicates, when it has,
     # group information from replicates and collapse with commas
     # only keep unique values using set()
-    df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column)))
+    df = df.groupby(["sample"]).agg(lambda column: ",".join(set(str(column))))

     return df


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Concatenates h5ad files and merge metadata from samplesheet")
-
-    parser.add_argument("-i", "--input", dest="input", help="Path to samplesheet.csv")
-    parser.add_argument("-o", "--out", dest="out", help="Output path.")
-    parser.add_argument(
-        "-s",
-        "--suffix",
-        dest="suffix",
-        help="Suffix of matrices to remove and get sample name",
-    )
-
-    args = vars(parser.parse_args())

     # Open samplesheet as dataframe
-    df_samplesheet = read_samplesheet(args["input"])
+    df_samplesheet = read_samplesheet("${samplesheet}")

     # find all h5ad and append to dict
-    dict_of_h5ad = {str(path).replace(args["suffix"], ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")}
+    dict_of_h5ad = {str(path).replace("_matrix.h5ad", ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")}

     # concat h5ad files
     adata = ad.concat(dict_of_h5ad, label="sample", merge="unique", index_unique="_")

     # merge with data.frame, on sample information
-    adata.obs = adata.obs.join(df_samplesheet, on="sample")
-    adata.write_h5ad(args["out"], compression="gzip")
+    adata.obs = adata.obs.join(df_samplesheet, on="sample").astype(str)
+    adata.write_h5ad("combined_${meta.input_type}_matrix.h5ad", compression="gzip")

-    print("Wrote h5ad file to {}".format(args["out"]))
+    print("Wrote h5ad file to {}".format("combined_${meta.input_type}_matrix.h5ad"))
92 changes: 92 additions & 0 deletions modules/local/templates/mtx_to_h5ad_star.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python

# Set numba cache dir to current working directory (which is a writable mount also in containers)
import os

os.environ["NUMBA_CACHE_DIR"] = "."

import scanpy as sc
import pandas as pd
import argparse
from anndata import AnnData
import platform

def _mtx_to_adata(
    input: str,
    sample: str,
):
    adata = sc.read_10x_mtx(input)
    adata.obs["sample"] = sample

    return adata


def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Formats a dictionary to a YAML-like string.
    Args:
        data (dict): The dictionary to format.
        indent (int): The current indentation level.
    Returns:
        str: A string formatted as YAML.
    """
    yaml_str = ""
    for key, value in data.items():
        spaces = "  " * indent
        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str

def dump_versions():
    versions = {
        "${task.process}": {
            "python": platform.python_version(),
            "scanpy": sc.__version__,
            "pandas": pd.__version__
        }
    }

    with open("versions.yml", "w") as f:
        f.write(format_yaml_like(versions))

def input_to_adata(
    input_data: str,
    output: str,
    sample: str,
):
    print(f"Reading in {input_data}")

    # open main data
    adata = _mtx_to_adata(input_data, sample)

    # standard format:
    # the index holds gene IDs; gene symbols are kept as a column
    adata.var["gene_symbol"] = adata.var.index
    adata.var['gene_versions'] = adata.var["gene_ids"]
    adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0]
    adata.var.index = adata.var["gene_ids"].values
    adata.var = adata.var.drop("gene_ids", axis=1)

    # write results
    adata.write_h5ad(f"{output}", compression="gzip")
    print(f"Wrote h5ad file to {output}")

    # dump versions
    dump_versions()

    return adata

#
# Run main script
#

# create the directory with the sample name
os.makedirs("${meta.id}", exist_ok=True)

# input_type comes from NF module
adata = input_to_adata(
    input_data="${input_type}",
    output="${meta.id}/${meta.id}_${input_type}_matrix.h5ad",
    sample="${meta.id}"
)
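A small illustration (not part of the template) of the gene-ID standardisation performed above; the Ensembl version suffixes are made up:

import pandas as pd

var = pd.DataFrame(
    {"gene_ids": ["ENSG00000141510.17", "ENSG00000012048.23"]},  # made-up versions
    index=["TP53", "BRCA1"],
)

var["gene_symbol"] = var.index                                # keep the symbols as a column
var["gene_versions"] = var["gene_ids"]                        # keep the versioned IDs
var["gene_ids"] = var["gene_versions"].str.split(".").str[0]  # strip the version suffix
var.index = var["gene_ids"].values                            # re-index by unversioned Ensembl ID
var = var.drop("gene_ids", axis=1)

print(var)
#                 gene_symbol       gene_versions
# ENSG00000141510        TP53  ENSG00000141510.17
# ENSG00000012048       BRCA1  ENSG00000012048.23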
5 changes: 5 additions & 0 deletions modules/nf-core/cellbender/removebackground/environment.yml
