Standardize conversion workflow #369

Draft · wants to merge 23 commits into base: dev
21 changes: 12 additions & 9 deletions conf/modules.config
@@ -32,25 +32,28 @@ process {
     }

     if (!params.skip_emptydrops) {
-        withName: EMPTYDROPS_CELL_CALLING {
+        withName: 'CELLBENDER_REMOVEBACKGROUND' {
             publishDir = [
-                path: { "${params.outdir}/${params.aligner}" },
-                mode: params.publish_dir_mode,
-                saveAs: { filename ->
-                    if ( params.aligner == 'cellranger' ) "count/${meta.id}/${filename}"
-                    else if ( params.aligner == 'kallisto' ) "${meta.id}.count/${filename}"
-                    else "${meta.id}/${filename}"
-                }
+                path: { "${params.outdir}/${params.aligner}/${meta.id}/emptydrops_filter" },
+                mode: params.publish_dir_mode
             ]
         }
+        withName: 'ADATA_BARCODES' {
+            ext.prefix = { "${meta.id}_custom_emptydrops_filter_matrix" }
+            publishDir = [
+                path: { "${params.outdir}/${params.aligner}/mtx_conversions/${meta.id}" },
+                mode: params.publish_dir_mode
+            ]
+        }
     }

-    withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' {
+    withName: 'MTX_TO_H5AD*|CONCAT_H5AD|ANNDATAR_CONVERT' {
         publishDir = [
             path: { "${params.outdir}/${params.aligner}/mtx_conversions" },
             mode: params.publish_dir_mode
         ]
     }

     withName: 'GTF_GENE_FILTER' {
         publishDir = [
             path: { "${params.outdir}/gtf_filter" },
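For orientation (not part of the diff): the publishDir rules above imply the output layout sketched below. The aligner and sample names are made up for illustration.

from pathlib import Path

# hypothetical values, for illustration only
outdir, aligner, sample = Path("results"), "star", "sample_1"

print(outdir / aligner / sample / "emptydrops_filter")   # CELLBENDER_REMOVEBACKGROUND results
print(outdir / aligner / "mtx_conversions" / sample)     # ADATA_BARCODES results
print(outdir / aligner / "mtx_conversions")              # MTX_TO_H5AD* / CONCAT_H5AD / ANNDATAR_CONVERT results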
5 changes: 5 additions & 0 deletions modules.json
@@ -5,6 +5,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "cellbender/removebackground": {
+                        "branch": "master",
+                        "git_sha": "06c8865e36741e05ad32ef70ab3fac127486af48",
+                        "installed_by": ["modules"]
+                    },
                     "cellranger/count": {
                         "branch": "master",
                         "git_sha": "90dad5491658049282ceb287a3d7732c1ce39837",
23 changes: 23 additions & 0 deletions modules/local/adata_barcodes.nf
@@ -0,0 +1,23 @@
process ADATA_BARCODES {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'oras://community.wave.seqera.io/library/anndata:0.10.7--e9840a94592528c8':
        'community.wave.seqera.io/library/anndata:0.10.7--336c6c1921a0632b' }"

    input:
    tuple val(meta), path(h5ad), path(barcodes_csv)

    output:
    tuple val(meta), path("*.h5ad"), emit: h5ad
    path "versions.yml"            , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    prefix = task.ext.prefix ?: "${meta.id}"
    template 'barcodes.py'
}
24 changes: 24 additions & 0 deletions modules/local/anndatar_convert.nf
@@ -0,0 +1,24 @@
process ANNDATAR_CONVERT {
    tag "${meta.id}"

    label 'process_medium'

    container "fmalmeida/anndatar:dev" // TODO: Fix

    input:
    tuple val(meta), path(h5ad)

    output:
    tuple val(meta), path("${meta.id}_standardized.Rds"), emit: rds

    when:
    task.ext.when == null || task.ext.when

    script:
    template 'anndatar_convert.R'

    stub:
    """
    touch ${meta.id}_standardized.Rds
    """
}
17 changes: 6 additions & 11 deletions modules/local/concat_h5ad.nf
@@ -1,13 +1,13 @@
 process CONCAT_H5AD {
+    tag "${meta.id}"
+
     label 'process_medium'

-    conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' :
-        'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }"
+    conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg"
+    container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538"

     input:
-    tuple val(input_type), path(h5ad)
+    tuple val(meta), path(h5ad)
     path samplesheet

     output:
@@ -17,12 +17,7 @@ process CONCAT_H5AD {
     task.ext.when == null || task.ext.when

     script:
-    """
-    concat_h5ad.py \\
-        --input $samplesheet \\
-        --out combined_${input_type}_matrix.h5ad \\
-        --suffix "_matrix.h5ad"
-    """
+    template 'concat_h5ad.py'

     stub:
     """
35 changes: 35 additions & 0 deletions modules/local/mtx_to_h5ad_star.nf
@@ -0,0 +1,35 @@
process MTX_TO_H5AD_STAR {
    tag "$meta.id"
    label 'process_medium'

    conda "conda-forge::scanpy==1.10.2 conda-forge::python-igraph conda-forge::leidenalg"
    container "community.wave.seqera.io/library/scanpy:1.10.2--e83da2205b92a538"

    input:
    tuple val(meta), path(inputs)
    path star_index

    output:
    tuple val(meta2), path("${meta.id}/*h5ad"), emit: h5ad
    path "versions.yml"                       , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    // Get a file to check the input type. Some aligners pass arrays instead of a single file.
    def input_to_check = (inputs instanceof String) ? inputs : inputs[0]

    // check whether the inputs are raw or filtered matrices
    input_type = (input_to_check.toUriString().contains('raw')) ? 'raw' : 'filtered'
    meta2 = meta + [input_type: input_type]

    template 'mtx_to_h5ad_star.py'

    stub:
    """
    mkdir ${meta.id}
    touch ${meta.id}/${meta.id}_matrix.h5ad
    touch versions.yml
    """
}
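As an aside (not part of the PR), the raw/filtered detection above simply keys off the substring 'raw' in the staged path. A minimal Python equivalent, using made-up STARsolo paths:

def classify_input(path: str) -> str:
    # mirror of the Groovy check: 'raw' anywhere in the path means the raw matrix
    return "raw" if "raw" in path else "filtered"

# hypothetical STARsolo output paths
assert classify_input("sample_1/Solo.out/Gene/raw/matrix.mtx.gz") == "raw"
assert classify_input("sample_1/Solo.out/Gene/filtered/matrix.mtx.gz") == "filtered"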
15 changes: 15 additions & 0 deletions modules/local/templates/anndatar_convert.R
@@ -0,0 +1,15 @@
#!/usr/bin/env Rscript

# Nextflow variables are interpolated directly into this template, e.g. "${meta.id}"

# load libraries
library(anndataR)

# read input
adata <- read_h5ad("${h5ad}")

# convert to Rds
obj <- adata\$to_Seurat()

# save files
saveRDS(obj, file = "${meta.id}_standardized.Rds")
44 changes: 44 additions & 0 deletions modules/local/templates/barcodes.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python3

import platform
import anndata as ad
import pandas as pd

def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Formats a dictionary to a YAML-like string.

    Args:
        data (dict): The dictionary to format.
        indent (int): The current indentation level.

    Returns:
        str: A string formatted as YAML.
    """
    yaml_str = ""
    for key, value in data.items():
        spaces = "  " * indent
        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str

df = pd.read_csv("${barcodes_csv}", header=None)
adata = ad.read_h5ad("${h5ad}")

adata = adata[df[0].values]

adata.write_h5ad("${prefix}.h5ad")

# Versions

versions = {
    "${task.process}": {
        "python": platform.python_version(),
        "anndata": ad.__version__,
        "pandas": pd.__version__
    }
}

with open("versions.yml", "w") as f:
    f.write(format_yaml_like(versions))
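A minimal sketch (not from the PR) of what the subsetting above does to an AnnData object, using a toy matrix and made-up barcodes:

import io

import anndata as ad
import numpy as np
import pandas as pd

# three cells, two genes
adata = ad.AnnData(
    X=np.ones((3, 2)),
    obs=pd.DataFrame(index=["AAACCCA-1", "AAACCCG-1", "AAACCCT-1"]),
)

# header-less barcode list, standing in for the CSV written by the empty-drops filtering step
barcodes = pd.read_csv(io.StringIO("AAACCCA-1\nAAACCCT-1\n"), header=None)

# keep only the listed barcodes, in the order given
filtered = adata[barcodes[0].values]
print(filtered.obs_names.tolist())  # ['AAACCCA-1', 'AAACCCT-1']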
25 changes: 6 additions & 19 deletions bin/concat_h5ad.py → modules/local/templates/concat_h5ad.py
@@ -7,7 +7,6 @@

 import scanpy as sc, anndata as ad, pandas as pd
 from pathlib import Path
-import argparse


 def read_samplesheet(samplesheet):
@@ -17,36 +16,24 @@ def read_samplesheet(samplesheet):
     # samplesheet may contain replicates, when it has,
     # group information from replicates and collapse with commas
     # only keep unique values using set()
-    df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column)))
+    df = df.groupby(["sample"]).agg(lambda column: ",".join(set(str(column))))

     return df


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Concatenates h5ad files and merge metadata from samplesheet")
-
-    parser.add_argument("-i", "--input", dest="input", help="Path to samplesheet.csv")
-    parser.add_argument("-o", "--out", dest="out", help="Output path.")
-    parser.add_argument(
-        "-s",
-        "--suffix",
-        dest="suffix",
-        help="Suffix of matrices to remove and get sample name",
-    )
-
-    args = vars(parser.parse_args())

     # Open samplesheet as dataframe
-    df_samplesheet = read_samplesheet(args["input"])
+    df_samplesheet = read_samplesheet("${samplesheet}")

     # find all h5ad and append to dict
-    dict_of_h5ad = {str(path).replace(args["suffix"], ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")}
+    dict_of_h5ad = {str(path).replace("_matrix.h5ad", ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")}

     # concat h5ad files
     adata = ad.concat(dict_of_h5ad, label="sample", merge="unique", index_unique="_")

     # merge with data.frame, on sample information
-    adata.obs = adata.obs.join(df_samplesheet, on="sample")
-    adata.write_h5ad(args["out"], compression="gzip")
+    adata.obs = adata.obs.join(df_samplesheet, on="sample").astype(str)
+    adata.write_h5ad("combined_${meta.input_type}_matrix.h5ad", compression="gzip")

-    print("Wrote h5ad file to {}".format(args["out"]))
+    print("Wrote h5ad file to {}".format("combined_${meta.input_type}_matrix.h5ad"))
92 changes: 92 additions & 0 deletions modules/local/templates/mtx_to_h5ad_star.py
@@ -0,0 +1,92 @@
#!/usr/bin/env python

# Set numba cache dir to current working directory (which is a writable mount also in containers)
import os

os.environ["NUMBA_CACHE_DIR"] = "."

import scanpy as sc
import pandas as pd
import argparse
from anndata import AnnData
import platform

def _mtx_to_adata(
    input: str,
    sample: str,
):
    adata = sc.read_10x_mtx(input)
    adata.obs["sample"] = sample

    return adata


def format_yaml_like(data: dict, indent: int = 0) -> str:
    """Formats a dictionary to a YAML-like string.
    Args:
        data (dict): The dictionary to format.
        indent (int): The current indentation level.
    Returns:
        str: A string formatted as YAML.
    """
    yaml_str = ""
    for key, value in data.items():
        spaces = "  " * indent
        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str

def dump_versions():
    versions = {
        "${task.process}": {
            "python": platform.python_version(),
            "scanpy": sc.__version__,
            "pandas": pd.__version__
        }
    }

    with open("versions.yml", "w") as f:
        f.write(format_yaml_like(versions))

def input_to_adata(
    input_data: str,
    output: str,
    sample: str,
):
    print(f"Reading in {input_data}")

    # open main data
    adata = _mtx_to_adata(input_data, sample)

    # standard format:
    # the index holds gene IDs; gene symbols are kept as a column
    adata.var["gene_symbol"] = adata.var.index
    adata.var['gene_versions'] = adata.var["gene_ids"]
    adata.var['gene_ids'] = adata.var['gene_versions'].str.split('.').str[0]
    adata.var.index = adata.var["gene_ids"].values
    adata.var = adata.var.drop("gene_ids", axis=1)

    # write results
    adata.write_h5ad(f"{output}", compression="gzip")
    print(f"Wrote h5ad file to {output}")

    # dump versions
    dump_versions()

    return adata

#
# Run main script
#

# create the directory with the sample name
os.makedirs("${meta.id}", exist_ok=True)

# input_type comes from NF module
adata = input_to_adata(
    input_data="${input_type}",
    output="${meta.id}/${meta.id}_${input_type}_matrix.h5ad",
    sample="${meta.id}"
)
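A small illustration (not part of the template) of the gene-ID standardisation performed above; the Ensembl version suffixes are made up:

import pandas as pd

var = pd.DataFrame(
    {"gene_ids": ["ENSG00000141510.17", "ENSG00000012048.23"]},  # made-up versions
    index=["TP53", "BRCA1"],
)

var["gene_symbol"] = var.index                                # keep the symbols as a column
var["gene_versions"] = var["gene_ids"]                        # keep the versioned IDs
var["gene_ids"] = var["gene_versions"].str.split(".").str[0]  # strip the version suffix
var.index = var["gene_ids"].values                            # re-index by unversioned Ensembl ID
var = var.drop("gene_ids", axis=1)

print(var)
#                 gene_symbol       gene_versions
# ENSG00000141510        TP53  ENSG00000141510.17
# ENSG00000012048       BRCA1  ENSG00000012048.23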
5 changes: 5 additions & 0 deletions modules/nf-core/cellbender/removebackground/environment.yml
