update

xiaoli-dong · Jan 30, 2024 · b2bc2e0 · b2bc2e0
1 parent 23cb2a7
commit b2bc2e0
Show file tree

Hide file tree

Showing 26 changed files with 1,053 additions and 244 deletions.
diff --git a/bin/GBS-SBG.pl b/bin/GBS-SBG.pl
diff --git a/bin/combine_xml.py b/bin/combine_xml.py
@@ -21,7 +21,8 @@ def main():
         xml_element_tree = None
         for xml_file in xml_files:
             data = ET.tostring(ET.parse(xml_file).getroot()).decode("utf-8")
-            fout.write(data)    
+            fout.write(data)
+            fout.write('\n')    
     fout.close()
 
 

diff --git a/bin/xml2csv.py b/bin/xml2csv.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+
+import csv
+import argparse
+import xml.etree.ElementTree as ET
+
+def xml_to_csv(element, csv_writer):
+    """Write one CSV row per XML element, parents before their children.
+
+    Each row holds the element's tag, its text, and then the values of its
+    attributes. ``element`` itself is written first, followed by every
+    descendant.
+    """
+    # Element.iter() yields the element and then all of its descendants in
+    # document order, which is the same parent-first order as a recursive walk.
+    for node in element.iter():
+        csv_writer.writerow([node.tag, node.text, *node.attrib.values()])
+
+def main():
+    """Convert a single XML file into a flat CSV file.
+
+    Command-line arguments:
+        -i/--input   path of the XML file to convert
+        -o/--output  path of the CSV file to write
+
+    The XML tree is parsed and handed to ``xml_to_csv``, which writes one
+    row per element.
+    """
+    description = "Convert an xml file into a csv file"
+    parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument("-i", "--input", required=True, help="Input xml file name")
+    # A required option never falls back to a default, so none is declared.
+    parser.add_argument("-o", "--output", required=True, help="Output csv file name")
+
+    args = parser.parse_args()
+
+    root = ET.parse(args.input).getroot()
+
+    # newline="" leaves line-ending handling to the csv module; the explicit
+    # encoding keeps non-ASCII XML text writable regardless of the locale.
+    with open(args.output, "w", newline="", encoding="utf-8") as csv_file:
+        xml_to_csv(root, csv.writer(csv_file))
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/conf/modules.config b/conf/modules.config
@@ -72,6 +72,14 @@ if(!params.skip_illumina_reads_qc){
                 saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
             ]
         }
+        // Publish the output of CSVTK_CONCAT_STATS_NOT_ASSEMBLED under <outdir>/report.
+        withName: CSVTK_CONCAT_STATS_NOT_ASSEMBLED{
+            publishDir = [
+                path: { "${params.outdir}/report" },
+                mode: params.publish_dir_mode,
+                // versions.yml is mapped to null; every other file keeps its own name
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+
+            ]
+        }
     }
 
     if(params.illumina_reads_qc_tool == 'bbduk'){
@@ -276,7 +284,7 @@ if(! params.skip_tbprofiler){
 if(! params.skip_pneumocat){
     process{
         withName: PNEUMOCAT {
-            ext.prefix = { "${meta.id}.pneumocat" }
+            //ext.prefix = { "${meta.id}.pneumocat" }
             ext.args = '--cleanup'
             publishDir = [
                 path: { "${params.outdir}/${meta.id}/pneumocat/illumina" },
@@ -628,7 +636,32 @@ if(! params.skip_emmtyper){
 
     }
 }
+// Settings for the GBS_SBG process and the CSVTK_CONCAT_GBSSBG process;
+// applied only when params.skip_gbssbg is not set.
+if(! params.skip_gbssbg){
+    process{
+        withName: GBS_SBG {
+            // Output prefix is "<sample id>.gbssbg"
+            ext.prefix = { "${meta.id}.gbssbg" }
+
+            // Per-sample results are published under <outdir>/<sample id>/gbssbg
+            publishDir = [
+                path: { "${params.outdir}/${meta.id}/gbssbg" },
+                mode: params.publish_dir_mode,
+                // versions.yml is mapped to null; every other file keeps its own name
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                enabled: true
+            ]
 
+        }
+
+        withName: CSVTK_CONCAT_GBSSBG{
+            // NOTE(review): presumably csvtk's comment-char ('$'), ignore-illegal-row
+            // and ignore-empty-row flags -- confirm against the csvtk usage page.
+            ext.args = '-C \'$\' -I -E '
+            // The combined table is published under <outdir>/report
+            publishDir = [
+                path: { "${params.outdir}/report" },
+                mode: params.publish_dir_mode,
+                saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+                enabled: true
+            ]
+        }
+
+    }
+}
 
 if (params.platform == 'nanopore') {
     includeConfig 'modules_nanopore.config'

diff --git a/conf/modules_illumina.config b/conf/modules_illumina.config
@@ -12,6 +12,8 @@
 
 if(!params.skip_illumina_reads_assembly){
     process {
+
+
         withName: SKESA {
             publishDir = [
                 path: { "${params.outdir}/${meta.id}/assembly/illumina/skesa" },

diff --git a/modules/local/csvtk/concat/main.nf b/modules/local/csvtk/concat/main.nf
@@ -0,0 +1,40 @@
+// Concatenate the staged CSV/TSV files with `csvtk concat` into a single
+// file named ${prefix}.${out_extension}, and record the csvtk version.
+process CSVTK_CONCAT {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::csvtk=0.23.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' :
+        'biocontainers/csvtk:0.23.0--h9ee0642_0' }"
+
+    input:
+    tuple val(meta), path(csv)
+    // in_format is declared but not referenced anywhere in the script below
+    val in_format
+    // out_format only selects the output file extension (see out_extension)
+    val out_format
+
+    output:
+    tuple val(meta), path("${prefix}.${out_extension}"), emit: csv
+    path "versions.yml"                                , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args   ?: ''
+    // prefix and out_extension are assigned without `def`; presumably so the
+    // output declaration above can resolve them -- confirm before adding `def`.
+    prefix   = task.ext.prefix ?: "${meta.id}"
+
+    // "tsv" yields a .tsv file name; any other out_format value yields .csv.
+    // No delimiter option is derived from in_format/out_format here; any
+    // delimiter handling has to come in through task.ext.args.
+    out_extension = out_format == "tsv" ? 'tsv' : 'csv'
+    """
+    csvtk \\
+        concat \\
+        $args \\
+        --num-cpus $task.cpus \\
+        --out-file ${prefix}.${out_extension} \\
+        $csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" ))
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/csvtk/concat/meta.yml b/modules/local/csvtk/concat/meta.yml
@@ -0,0 +1,51 @@
+name: csvtk_concat
+description: Concatenate two or more CSV (or TSV) tables into a single table
+keywords:
+  - concatenate
+  - tsv
+  - csv
+tools:
+  - csvtk:
+      description: A cross-platform, efficient, practical CSV/TSV toolkit
+      homepage: http://bioinf.shenwei.me/csvtk
+      documentation: http://bioinf.shenwei.me/csvtk
+      tool_dev_url: https://github.com/shenwei356/csvtk
+
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - csv:
+      type: file
+      description: CSV/TSV formatted files
+      pattern: "*.{csv,tsv}"
+  - in_format:
+      type: string
+      description: Input format (csv, tab, or a delimiting character)
+      pattern: "*"
+  - out_format:
+      type: string
+      description: Output format (csv, tab, or a delimiting character)
+      pattern: "*"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - csv:
+      type: file
+      description: Concatenated CSV/TSV file
+      pattern: "*.{csv,tsv}"
+
+authors:
+  - "@rpetit3"
diff --git a/modules/local/fastp/main.nf b/modules/local/fastp/main.nf
@@ -14,7 +14,7 @@ process FASTP {
     val   save_merged
 
     output:
-    tuple val(meta), path('*fastp.*fastq.gz') , optional:true, emit: reads
+    tuple val(meta), path('*fastp*fastq.gz') , optional:true, emit: reads
     tuple val(meta), path('*.json')           , emit: json
     tuple val(meta), path('*.html')           , emit: html
     tuple val(meta), path('*.log')            , emit: log
@@ -81,8 +81,8 @@ process FASTP {
         fastp \\
             --in1 ${prefix}_1.fastq.gz \\
             --in2 ${prefix}_2.fastq.gz \\
-            --out1 ${prefix}_fastp.R1.fastq.gz \\
-            --out2 ${prefix}_fastp.R2.fastq.gz \\
+            --out1 ${prefix}_fastp_1.fastq.gz \\
+            --out2 ${prefix}_fastp_2.fastq.gz \\
             --json ${prefix}.fastp.json \\
             --html ${prefix}.fastp.html \\
             $adapter_list \\

diff --git a/modules/local/gbs/sbg/main.nf b/modules/local/gbs/sbg/main.nf
@@ -0,0 +1,43 @@
+// Run GBS-SBG.pl on an assembly: the contigs are streamed to the script on
+// stdin and its table is captured in ${prefix}.tsv.
+process GBS_SBG {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "bioconda::blast=2.15.0"
+    // The Docker reference must use a literal ':' before the tag; the
+    // URL-encoded '%3A' form is only meaningful inside the download URL.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/blast%3A2.15.0--pl5321h6f7f691_1':
+        'biocontainers/blast:2.15.0--pl5321h6f7f691_1' }"
+
+
+    input:
+    tuple val(meta), path(fasta) //contigs
+    path(ref)                    // optional reference passed via -ref; omitted when empty
+
+    output:
+    // Example of the produced table:
+    //# Name  Serotype        Uncertainty
+    //S18     NT      MaxCov:0;MaxID:0
+    //S17     GBS-SBG:Ia
+    tuple val(meta), path("*.tsv"), emit: tsv
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Gzipped assemblies are decompressed on the fly; plain files are streamed as-is.
+    def gzipped = fasta.toString().endsWith('.gz')
+    def cmd_input = gzipped ? "zcat ${fasta}" : "cat ${fasta}"
+    def cmd_refdb = ref ? "-ref ${ref}" : ""
+
+    """
+    ${cmd_input} | GBS-SBG.pl \\
+        -name ${meta.id} \\
+        ${cmd_refdb} \\
+        ${args} \\
+        > ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        blast: \$(blastn -version 2>&1 | sed 's/^.*blastn: //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/hostile/main.nf b/modules/local/hostile/main.nf
@@ -46,8 +46,8 @@ process HOSTILE {
         then
             mv ${simplename}*.clean.fastq.gz ${prefix}.dehost.fastq.gz
         else
-            mv ${simplename}*.clean_1.fastq.gz ${prefix}.dehost.R1.fastq.gz
-            mv ${simplename}*.clean_2.fastq.gz ${prefix}.dehost.R2.fastq.gz
+            mv *clean_1.fastq.gz ${prefix}.dehost_1.fastq.gz
+            mv *clean_2.fastq.gz ${prefix}.dehost_2.fastq.gz
         fi
 
         cat <<-END_VERSIONS > versions.yml
@@ -71,8 +71,8 @@ process HOSTILE {
         then
             mv ${simplename}*.clean.fastq.gz ${prefix}.dehost.fastq.gz
         else
-            mv ${simplename}*.clean_1.fastq.gz ${prefix}.dehost.R1.fastq.gz
-            mv ${simplename}*.clean_2.fastq.gz ${prefix}.dehost.R2.fastq.gz
+            mv ${simplename}*.clean_1.fastq.gz ${prefix}.dehost_1.fastq.gz
+            mv ${simplename}*.clean_2.fastq.gz ${prefix}.dehost_2.fastq.gz
         fi
 
         cat <<-END_VERSIONS > versions.yml

diff --git a/modules/nf-core/pneumocat/environment.yml → modules/local/pneumocat/environment.yml b/modules/nf-core/pneumocat/environment.yml → modules/local/pneumocat/environment.yml
diff --git a/modules/local/pneumocat/main.nf b/modules/local/pneumocat/main.nf
@@ -0,0 +1,73 @@
+VERSION = '1.2.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
+// Run PneumoCaT on the staged reads and expose its final result XML as
+// ${prefix}.final_results.xml together with coverage_summary.txt.
+process PNEUMOCAT {
+    tag "$meta.id"
+    label 'process_low'
+    errorStrategy 'ignore'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/pneumocat:1.2.1--0':
+        'biocontainers/pneumocat:1.2.1--0' }"
+
+    input:
+    tuple val(meta), path(reads)
+
+    output:
+    tuple val(meta), path("*.final_results.xml"), emit: results
+    tuple val(meta), path("coverage_summary.txt"), emit: coverage
+    path "versions.yml"           , emit: versions
+    /*
+    If only one capsular type is matched with more than 90% coverage
+    then the report from step 1 contained in this xml file is considered
+    the final result (result type="Serotype") and no further folders
+    will appear within the PneumoCaT output folder. If more than one
+    capsular type is matched with more than 90% coverage then the
+    software moves to step two and a SNP_based_serotyping folder is
+    created containing a second XML file with the final result
+    - see STEP 2- VARIANT-BASED APPROACH.
+    Note that the output XML file from step 1 only reports two capsular types,
+    when actually more could be matched and all will pass to step 2 for
+    further distinction. Further information on mapped serotypes in
+    stage 1 can be found in "Coverage_summary.txt". If the top hit
+    coverage is < 90% then no serotypes are reported and 'Failed'
+    appears instead.
+    */
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    PneumoCaT.py \\
+        --input_directory ./ \\
+        $args \\
+        --threads $task.cpus \\
+        --output_dir ./
+
+    # Prefer the step-2 (SNP based) result when that folder exists,
+    # otherwise the step-1 result is the final one.
+    if [ -d "SNP_based_serotyping" ]
+    then
+        cp SNP_based_serotyping/${prefix}.results.xml ${prefix}.final_results.xml
+    else
+        cp ${prefix}.results.xml ${prefix}.final_results.xml
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pneumocat: $VERSION
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // The stub has to create exactly the files declared in the output block.
+    """
+    touch ${prefix}.final_results.xml
+    touch coverage_summary.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pneumocat: $VERSION
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/pneumocat/meta.yml → modules/local/pneumocat/meta.yml b/modules/nf-core/pneumocat/meta.yml → modules/local/pneumocat/meta.yml