Merge pull request #100 from vib-singlecell-nf/develop

Develop Former-commit-id: cc19ce9
vib-singlecell-nf · Jan 22, 2020 · f56f454 · f56f454
2 parents 29ab89d + ba5e9a6
commit f56f454
Show file tree

Hide file tree

Showing 29 changed files with 455 additions and 174 deletions.
diff --git a/.gitignore b/.gitignore
@@ -24,3 +24,5 @@ work/
 out/
 tests/
 debug/
+*.swp
+*.swo
diff --git a/README.md b/README.md
diff --git a/data/README.md b/data/README.md
@@ -24,7 +24,7 @@ tar -xzvf pbmc_1k_v3_filtered_feature_bc_matrix.tar.gz -C data/10x/1k_pbmc/1k_pb
 
 Download the small meta data to annotate the samples:
 ```
-wget https://raw.githubusercontent.com/aertslab/SingleCellTxBenchmark/master/data/10x/1k_pbmc/metadata.tsv -O data/10x/1k_pbmc/metadata.tsv
+wget https://raw.githubusercontent.com/vib-singlecell-nf/vsn-pipelines/master/data/10x/1k_pbmc/metadata.tsv -O data/10x/1k_pbmc/metadata.tsv
 ```
 
 If these links do not appear to work, you can always download them from https://support.10xgenomics.com/single-cell-gene-expression/datasets.

diff --git a/main.nf b/main.nf
@@ -33,7 +33,7 @@ workflow bbknn_scenic {
 // run single_sample, output a scope loom file
 workflow single_sample {
 
-    include single_sample as SINGLE_SAMPLE from './workflows/single_sample' params(params)
+    include single_sample_standalone as SINGLE_SAMPLE from './workflows/single_sample' params(params)
     SINGLE_SAMPLE()
 
 }
@@ -43,7 +43,7 @@ workflow single_sample {
 workflow single_sample_scenic {
 
     include SCENIC_append from './src/scenic/main.nf' params(params)
-    include single_sample as SINGLE_SAMPLE from './workflows/single_sample' params(params)
+    include single_sample_standalone as SINGLE_SAMPLE from './workflows/single_sample' params(params)
     SINGLE_SAMPLE()
     SCENIC_append( SINGLE_SAMPLE.out.filteredloom, SINGLE_SAMPLE.out.scopeloom )
 
@@ -75,7 +75,19 @@ workflow cellranger {
 // runs mkfastq, CellRanger count, then single_sample:
 workflow single_sample_cellranger {
 
-    cellranger | single_sample
+    include single_sample as SINGLE_SAMPLE from './workflows/single_sample' params(params)
+    cellranger | SINGLE_SAMPLE
+
+}
+
+// Runs single_sample on existing .h5ad files:
+// the files are selected by params.data.h5ad.file_paths and the sample
+// names are derived from the file names using params.data.h5ad.suffix.
+workflow h5ad_single_sample {
+
+    include getChannel as getH5ADChannel from './src/channels/h5ad' params(params)
+    include single_sample as SINGLE_SAMPLE from './workflows/single_sample' params(params)
+    // view() prints every item emitted by the channel before it is piped
+    // into SINGLE_SAMPLE.
+    // NOTE(review): `data` is not referenced again inside this workflow.
+    data = getH5ADChannel( 
+        params.data.h5ad.file_paths,
+        params.data.h5ad.suffix
+    ).view() | SINGLE_SAMPLE
 
 }
 

diff --git a/nextflow.config b/nextflow.config
@@ -1,9 +1,9 @@
 
 manifest {
-    name = 'vib-singlecell-nf/vib-singlecell-nf'
+    name = 'vib-singlecell-nf/vsn-pipelines'
     description = 'A repository of pipelines for single-cell data in Nextflow DSL2'
-    homePage = 'https://github.com/vib-singlecell-nf/vib-singlecell-nf'
-    version = '0.6.1'
+    homePage = 'https://github.com/vib-singlecell-nf/vsn-pipelines'
+    version = '0.8.0'
     mainScript = 'main.nf'
     defaultBranch = 'master'
     nextflowVersion = '!19.12.0-edge' // with ! prefix, stop execution if current version does not match required version.
@@ -110,6 +110,9 @@ profiles {
     tenx {
         includeConfig 'src/channels/conf/tenx.config'
     }
+    h5ad {
+        includeConfig 'src/channels/conf/h5ad.config'
+    }
     sra {
         includeConfig 'src/channels/conf/sra.config'
         includeConfig 'src/utils/conf/sra_metadata.config'

diff --git a/src/cellranger b/src/cellranger
diff --git a/src/channels/conf/h5ad.config b/src/channels/conf/h5ad.config
@@ -0,0 +1,8 @@
+// Input settings for the h5ad channel (see src/channels/h5ad.nf).
+params {
+    data {
+        h5ad {
+            // Path or glob of the .h5ad files to load; several globs can be
+            // given as one comma-separated string.
+            file_paths = ''
+            // Part of the file name that follows the sample name, extension
+            // included; it is stripped to obtain the sample name.
+            suffix = ''
+        }
+    }
+}
diff --git a/src/channels/h5ad.nf b/src/channels/h5ad.nf
@@ -0,0 +1,33 @@
+nextflow.preview.dsl=2
+
+/**
+ * Derive the sample name from the path of a .h5ad file.
+ *
+ * The sample name is the part of the file name located between the last
+ * directory separator and the given suffix.
+ *
+ * @param path   Path of the file, as a string; it must end with ".h5ad".
+ * @param suffix Text that follows the sample name in the file name. Dots are
+ *               matched literally; any other regular-expression
+ *               metacharacter keeps its regular-expression meaning.
+ * @return The extracted sample name.
+ * @throws Exception if the path does not end with ".h5ad", or if no sample
+ *                   name can be extracted with the given suffix.
+ */
+def extractSample(path, suffix) {
+    if(!path.endsWith(".h5ad"))
+        throw new Exception("Wrong channel used for data: "+ path)
+    // Escape the dots so that e.g. ".h5ad" does not match "xh5ad"
+    def escapedSuffix = suffix.replace(".","\\.")
+    // Declared with `def` so that nothing leaks into the script binding,
+    // where it would be shared between calls
+    def matcher = (path =~ /(.+)\/(.+)${escapedSuffix}/)
+    // Fail with an explicit message instead of an index error on no match
+    if(!matcher.find())
+        throw new Exception("Cannot extract a sample name from '" + path + "' using the suffix '" + suffix + "'")
+    // Group 1 is the parent directory, group 2 the sample name
+    return matcher.group(2)
+}
+
+// Builds a channel of (sample name, file) tuples from the .h5ad files
+// matching the given path(s); the sample name is computed by extractSample.
+workflow getChannel {
+
+    take:
+        glob // Path or glob as a string; several can be given, separated by commas
+        sampleSuffixWithExtension // Suffix after the sample name in the file paths
+
+    main:
+        // Check whether multiple globs are provided
+        // NOTE(review): the pieces are not trimmed, so whitespace around a
+        // comma stays part of the path.
+        if(glob.contains(',')) {
+            glob = Arrays.asList(glob.split(',')); 
+        }
+        channel = Channel
+            .fromPath(glob, checkIfExists: true)
+            .map {
+                // Pair each file with the sample name extracted from its path
+                path -> tuple(extractSample( "${path}", sampleSuffixWithExtension ), file("${path}"))
+            }
+
+    emit:
+        channel
+
+}
diff --git a/src/channels/singleend.nf b/src/channels/singleend.nf
@@ -3,7 +3,6 @@ nextflow.preview.dsl=2
 def extractSample(path) {
     pattern = /(.+)\/(.+)_R[1-2](.*)\.fastq(\.gz)?/
     (full, parentDir, id, whateverSuffix, compressionExtension) = (path =~ pattern)[0]
-
     return id
 }
 
@@ -20,7 +19,7 @@ workflow getChannel {
         channel = Channel
             .fromPath(glob, checkIfExists: true)
             .map {
-                path -> tuple(extractSample( "${path}" ), path("${path}"))
+                path -> tuple(extractSample( "${path}" ), file("${path}"))
             }
 
     emit:

diff --git a/src/dropletutils b/src/dropletutils
diff --git a/src/fastp b/src/fastp
diff --git a/src/flybaser b/src/flybaser
diff --git a/src/pcacv b/src/pcacv
diff --git a/src/picard b/src/picard
diff --git a/src/scanpy b/src/scanpy
diff --git a/src/scenic b/src/scenic
diff --git a/src/sratoolkit b/src/sratoolkit
diff --git a/src/utils/Dockerfile b/src/utils/Dockerfile
@@ -12,7 +12,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 
 RUN python3 -m pip install ipykernel && \
     pip install --no-cache-dir papermill && \
-    pip install --no-cache-dir pysradb
+    pip install --no-cache-dir pysradb==0.9.9
 
 FROM python:3.6.8-slim-stretch AS build-image
 RUN apt-get -y update && \

diff --git a/src/utils/bin/sc_file_converter.py b/src/utils/bin/sc_file_converter.py
@@ -46,6 +46,15 @@
     action='store'
 )
 
+parser.add_argument(
+    "-t", "--tag-cell-with-sample-id",
+    action="store_true",
+    dest="tag_cell_with_sample_id",
+    default=False,
+    help="Tag each cell with the given sample_id."
+)
+
+
 parser.add_argument(
     "-o", "--output-format",
     action="store",  # optional because action defaults to "store"
@@ -83,6 +92,12 @@ def check_10x_cellranger_mex_path(path):
         )
 
 
+def add_sample_id(adata, args):
+    # Annotate the file with the sample ID
+    adata.obs["sample_id"] = args.sample_id
+    return adata
+
+
 if INPUT_FORMAT == '10x_cellranger_mex' and OUTPUT_FORMAT == 'h5ad':
     check_10x_cellranger_mex_path(path=FILE_PATH_IN)
     # Convert
@@ -92,8 +107,12 @@ def check_10x_cellranger_mex_path(path):
         var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
         cache=False
     )
-    # If is sample_id is given, add the sample ID as suffix
-    if args.sample_id is not None:
+    adata = add_sample_id(
+        adata=adata,
+        args=args
+    )
+    # If tag_cell_with_sample_id is set, replace the numeric suffix of each cell name with the sample ID
+    if args.tag_cell_with_sample_id:
         adata.obs.index = map(lambda x: re.sub('-[0-9]+', f"-{args.sample_id}", x), adata.obs.index)
     print("Writing 10x data to h5ad...")
     adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME))
@@ -106,8 +125,12 @@ def check_10x_cellranger_mex_path(path):
     adata = sc.read_10x_h5(
         FILE_PATH_IN
     )
-    # If is sample_id is given, add the sample ID as suffix
-    if args.sample_id is not None:
+    adata = add_sample_id(
+        adata=adata,
+        args=args
+    )
+    # If tag_cell_with_sample_id is set, replace the numeric suffix of each cell name with the sample ID
+    if args.tag_cell_with_sample_id:
         adata.obs.index = map(lambda x: re.sub('-[0-9]+', f"-{args.sample_id}", x), adata.obs.index)
     print("Writing 10x data to h5ad...")
     adata.write_h5ad(filename="{}.h5ad".format(FILE_PATH_OUT_BASENAME))

diff --git a/src/utils/bin/sra_to_metadata.py b/src/utils/bin/sra_to_metadata.py
@@ -76,15 +76,15 @@
 # Get the metadata
 #
 
-if "sra_db" in args:
+if args.sra_db is not None:
     db = SRAdb(args.sra_db.name)
     print(f"Using local SRA SQLite database to query...")
 else:
     print(f"Using NCBi's esearch and esummary interface to query...")
     db = SRAweb()
 
 metadata = db.sra_metadata(
-    acc=args.sra_project_id,
+    args.sra_project_id,
     detailed=True,
     expand_sample_attributes=True,
     sample_attribute=True
@@ -99,7 +99,7 @@
     [
         metadata,
         metadata["experiment_title"].str.extract(
-            r'^(.*): ([a-zA-Z0-9_-]*); (.*); (.*)$', expand=True
+            r'^(.*): ([a-zA-Z0-9\s,_-]*); (.*); (.*)$', expand=True
         ).rename(
             columns={
                 0: 'geo_accession',

diff --git a/src/utils/conf/base.config b/src/utils/conf/base.config
@@ -1,6 +1,6 @@
 params {
     utils {
-        container = 'dweemx/sctx-utils:0.2.0'
+        container = 'vibsinglecellnf/utils:0.2.1'
         workflow_configuration {
             report_ipynb = "/src/utils/bin/reports/workflow_configuration_template.ipynb"
         }
@@ -22,4 +22,4 @@ params {
             off = 'tsv'
         }
     }
-}
+}
diff --git a/src/utils/conf/sra_metadata.config b/src/utils/conf/sra_metadata.config
@@ -3,11 +3,12 @@ params {
         outdir = 'out'
     }
     utils {
-        container = 'dweemx/sctx-utils:0.0.1'
+        container = 'vibsinglecellnf/utils:0.2.1'
         sra_metadata {
-            sraDb = ''
-            sraDbForceDownload = false
-            sraDbOutDir = '/ddn1/vol1/staging/leuven/stg_00002/lcb/dwmax/documents/resources/sra'
+            mode = 'web' // or db
+            // sraDb = ''
+            // sraDbForceDownload = false
+            // sraDbOutDir = ''
         }
     }
 }
diff --git a/src/utils/conf/test.config b/src/utils/conf/test.config
@@ -1,7 +1,7 @@
 params {
     sc {
         scanpy {
-            container = 'aertslab/sctx-scanpy:0.5.0'
+            container = 'vibsinglecellnf/scanpy:0.5.0'
         }
         file_converter {
             iff = '10x_cellranger_mex'

diff --git a/src/utils/conf/test/downloadSRARunCellRanger.test.config b/src/utils/conf/test/downloadSRARunCellRanger.test.config
@@ -2,6 +2,15 @@ params {
     global {
         outdir = 'out'
     }
+    data {
+        // Based on SRA Project Identifiers
+        sra = [
+            [
+                id: '',
+                samples: [""] // Use Unix globbing
+            ]
+        ]
+    }
     sratoolkit {
         container = 'dweemx/sctx-sratoolkit:2.9.4-1.1.0'
         downloadFastqs {
@@ -30,12 +39,16 @@ params {
                 // indicies = ''
             }
         }
-        utils {
-            sra_metadata {
-                sraDb = ''
-                sraDbForceDownload = false
-                sraDbOutDir = '/ddn1/vol1/staging/leuven/stg_00002/lcb/dwmax/documents/resources/sra'
-            }
+    }
+
+    utils {
+        container = 'vibsinglecellnf/utils:0.2.1'
+
+        sra_metadata {
+            mode = 'web' // or db
+            // sraDb = ''
+            // sraDbForceDownload = false
+            // sraDbOutDir = ''
         }
     }
 }

diff --git a/src/utils/main.test.nf b/src/utils/main.test.nf
@@ -77,14 +77,22 @@ workflow {
                 db = file(params.utils.sra_metadata.sraDbOutDir + "/SRAmetadb.sqlite")
                 SRA_TO_METADATA( sra, db )
             break;
+            case "GET_METADATA_FROM_SRA_WEB":
+                // Imports
+                include getChannel as getSRAChannel from './../channels/sra' params(params)
+                include SRA_TO_METADATA from './processes/sra' params(params)
+                // Run
+                sra = getSRAChannel( params.data.sra )
+                SRA_TO_METADATA( sra, file('NO_FILE') )
+            break;
             case "DOWNLOAD_FROM_SRA":
                 // Imports
                 include DOWNLOAD_FROM_SRA from './workflows/downloadFromSRA' params(params)
                 include SC__CELLRANGER__PREPARE_FOLDER from './../cellranger/processes/utils.nf'
                 include SC__CELLRANGER__COUNT   from './../cellranger/processes/count'    params(params)
                 // Run 
                 DOWNLOAD_FROM_SRA(
-                    tuple('SRP125768', ["w1118_15d_*"]) //["DGRP-551_*d_r*","w1118_*d_r*"]
+                    tuple('SRP162698', ["10x, sample 1", "10x, sample 2"])
                 )
             break;
             case "DOWNLOAD_FROM_SRA_AND_RUN_CELLRANGER":

diff --git a/src/utils/processes/sra.nf b/src/utils/processes/sra.nf
@@ -42,12 +42,20 @@ process SRA_TO_METADATA {
         file "${sraId}_metadata.tsv"
 
     script:
-        if(sraDb.name != 'NO_FILE') {
-            sraDbAsArgument = "--sra-db ${sraDb}"
+        if(processParams.mode == 'db') {
+            if(sraDb.name != 'NO_FILE') {
+                sraDbAsArgument = "--sra-db ${sraDb}"
+            } else {
+                if(!processParams.containsKey('sraDb') || processParams.sraDb == '')
+                    throw new Exception("The db modue requires sraDb to be specified")
+                sraDbAsArgument = '--sra-db ' + processParams.sraDb
+            }
+        } else if(processParams.mode == 'web') {
+            sraDbAsArgument = ''
         } else {
-            sraDbAsArgument = (processParams.containsKey('sraDb') && processParams.sraDb != '') ? '--sra-db ' + processParams.sraDb : ''
+            throw new Exception("The "+ processParams.mode +" mode does not exist. Choose one of: web, db.")
         }
-        def sampleFiltersAsArguments = sampleFilters.collect({ '--sample-filter' + ' ' + it }).join(' ')
+        def sampleFiltersAsArguments = sampleFilters.collect({ '--sample-filter' + ' "' + it + '"'}).join(' ')
         """
         ${binDir}sra_to_metadata.py \
             ${sraId} \
-Original file line number
+Diff line change
@@ Expand Up / @@ -24,3 +24,5 @@ work/ @@
     out/
     tests/
     debug/
+    *.swp
+    *.swo
+43 −22		bin/reports/sc_bbknn_report.ipynb
+5 −5		bin/reports/sc_clustering_report.ipynb
+1 −1		conf/base.config
+12 −2		processes/reports.nf
+3 −2		workflows/bec_bbknn.nf
+1 −1		workflows/dim_reduction_pca.nf