From 21883f11be21aa27fcdd55795b4731ff815fc52a Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Tue, 30 Jan 2024 16:54:50 -0500 Subject: [PATCH 1/6] update stage_input fixes --- assets/tests/csv/sample_sheet.csv | 8 +- conf/modules.config | 6 + conf/test_rdpc_qa.config | 2 +- modules.json | 41 ++-- modules/icgc-argo-workflows/cleanup/main.nf | 7 +- .../prep/sample/resources/usr/bin/main.py | 28 ++- .../score/download/main.nf | 6 +- .../icgc-argo-workflows/score/upload/main.nf | 5 +- modules/icgc-argo-workflows/song/get/main.nf | 4 +- .../icgc-argo-workflows/song/manifest/main.nf | 5 +- .../icgc-argo-workflows/song/publish/main.nf | 5 +- .../icgc-argo-workflows/song/submit/main.nf | 4 +- .../custom/dumpsoftwareversions/main.nf | 6 +- .../custom/dumpsoftwareversions/meta.yml | 7 +- .../templates/dumpsoftwareversions.py | 3 +- modules/nf-core/cutadapt/main.nf | 19 +- modules/nf-core/cutadapt/meta.yml | 5 +- modules/nf-core/fastqc/main.nf | 18 +- modules/nf-core/fastqc/meta.yml | 5 + modules/nf-core/multiqc/main.nf | 10 +- modules/nf-core/multiqc/meta.yml | 11 +- nextflow.config | 5 +- .../icgc-argo-workflows/stage_input/main.nf | 217 +++++++++++------- .../icgc-argo-workflows/stage_input/meta.yml | 5 +- workflows/prealnqc.nf | 122 +++++----- 25 files changed, 323 insertions(+), 231 deletions(-) diff --git a/assets/tests/csv/sample_sheet.csv b/assets/tests/csv/sample_sheet.csv index 2401860..c90c857 100644 --- a/assets/tests/csv/sample_sheet.csv +++ b/assets/tests/csv/sample_sheet.csv @@ -1,4 +1,4 @@ -sample,lane,fastq_1,fastq_2 -TEST,C0HVY.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r2.fq.gz -TEST,D0RE2.1,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r2.fq.gz -TEST,D0RH0.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r2.fq.gz \ No newline at end of file +sample,lane,fastq_1,fastq_2,read_group_count,single_end +TEST,C0HVY.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r2.fq.gz,3,False +TEST,D0RE2.1,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r2.fq.gz,3,False +TEST,D0RH0.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r2.fq.gz,3,False \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index c48f648..c595d00 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,5 +32,11 @@ process { ] } + withName: 'SONG.*|SCORE.*' { + ext.prefix = "" + ext.api_download_token = params.api_download_token ?: params.api_token + ext.api_upload_token = params.api_upload_token ?: params.api_token + } + } diff --git a/conf/test_rdpc_qa.config b/conf/test_rdpc_qa.config index d175d31..1d9d487 100644 
--- a/conf/test_rdpc_qa.config +++ b/conf/test_rdpc_qa.config @@ -22,6 +22,6 @@ params { // Input data for rdpc mode local_mode = false study_id = "TEST-QA" - analysis_ids = "875ef550-e536-4456-9ef5-50e5362456df" + analysis_ids = "875ef550-e536-4456-9ef5-50e5362456df,9bb63c49-86c8-44e2-b63c-4986c804e274" } diff --git a/modules.json b/modules.json index 5487e51..dd1f6b2 100644 --- a/modules.json +++ b/modules.json @@ -5,9 +5,14 @@ "https://github.com/icgc-argo-workflows/argo-modules.git": { "modules": { "icgc-argo-workflows": { + "checkinput": { + "branch": "stage_input_fixB", + "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", + "installed_by": ["stage_input"] + }, "cleanup": { "branch": "main", - "git_sha": "517b0a82ed697564891e18ff5dba35a70f9da225", + "git_sha": "8d014598ef81d65bece3684bd67aef7afae2cda9", "installed_by": ["modules"] }, "payload/qcmetrics": { @@ -21,38 +26,38 @@ "installed_by": ["modules"] }, "prep/sample": { - "branch": "main", - "git_sha": "4ae27e792724f69f7211db10bcd9e3373abc1837", + "branch": "stage_input_fixB", + "git_sha": "f253d1e6d4dc5f6ac0e6440041ee7e55b8203e35", "installed_by": ["stage_input"] }, "score/download": { - "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "branch": "stage_input_fixB", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, "score/upload": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] }, "song/get": { - "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "branch": "stage_input_fixB", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, "song/manifest": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] }, "song/publish": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] }, "song/submit": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] } } @@ -60,7 +65,7 @@ "subworkflows": { "icgc-argo-workflows": { "song_score_download": { - "branch": "main", + "branch": "stage_input_fixB", "git_sha": "92aa620385099e94401c22b8633cc55ed34ca10e", "installed_by": ["stage_input"] }, @@ -70,8 +75,8 @@ "installed_by": ["subworkflows"] }, "stage_input": { - "branch": "main", - "git_sha": "4ae27e792724f69f7211db10bcd9e3373abc1837", + "branch": "stage_input_fixB", + "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", "installed_by": ["subworkflows"] } } @@ -82,22 +87,22 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", "installed_by": ["modules"] }, "cutadapt": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "07881e42938b4f0070e864b45d424b01745bc3a4", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + 
"git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", "installed_by": ["modules"] } } diff --git a/modules/icgc-argo-workflows/cleanup/main.nf b/modules/icgc-argo-workflows/cleanup/main.nf index 2f505c9..815f1a7 100644 --- a/modules/icgc-argo-workflows/cleanup/main.nf +++ b/modules/icgc-argo-workflows/cleanup/main.nf @@ -3,9 +3,10 @@ process CLEANUP { label 'process_low' conda "conda-forge::coreutils=9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + container "${ workflow.containerEngine == 'singularity' ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'docker.io/ubuntu:20.04'}" + //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + // 'ubuntu:20.04' }" input: path files_to_delete // more accurately, other non-hidden files in the same folder will be deleted as well diff --git a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py index 7dc42e7..c13d3dc 100755 --- a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py +++ b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py @@ -262,9 +262,16 @@ def main(): specimen_type = song_analysis['samples'][0]['specimen']['specimenType'] tumour_normal_designation = song_analysis['samples'][0]['specimen']['tumourNormalDesignation'] status = '0' if tumour_normal_designation == 'Normal' else '1' + + if song_analysis.get('workflow'): + genome_build = song_analysis['workflow']["genome_build"] + else: + genome_build = None + analysis_type = song_analysis['analysisType']['name'] output_sample_sheet = f'{args.outdir}/{sample_id}_{analysis_type}_sample_sheet.csv' - + experiment=song_analysis['experiment']['experimental_strategy'] + sample_sheet = dict() if analysis_type == 'sequencing_experiment': @@ -317,13 +324,13 @@ def main(): if rgs_missed_lane: # throw error here if that happens sys.exit("Error: no lane BAM has been generated for some read groups: '%s'. " "Please make sure supplied sequencing files and metadata are correct." 
% "', '".join(rgs_missed_lane)) - + with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','lane','fastq_1','fastq_2','read_group','single_end','read_group_count','analysis_json']) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','lane','fastq_1','fastq_2','read_group','single_end','read_group_count',"experiment", 'analysis_json']) for k,v in sample_sheet.items(): single_end = True if v['file_r2'] == 'No_File' else False - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, k, v['file_r1'], v['file_r2'], v['read_group'], single_end, read_group_count, metadata_json]) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, k, v['file_r1'], v['file_r2'], v['read_group'], single_end, read_group_count,experiment, metadata_json]) elif analysis_type == 'sequencing_alignment': for fp in args.input_files: @@ -337,8 +344,8 @@ def main(): sys.exit("Error: not supported input file format") with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','cram','crai', 'analysis_json']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, cram, crai, metadata_json]) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','cram','crai',"genome_build",'experiment', 'analysis_json']) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, cram, crai, genome_build,experiment, metadata_json]) elif analysis_type == 'variant_calling': for fp in song_analysis['files']: @@ -355,13 +362,13 @@ def main(): sys.exit("Error: not supported input file format") with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','sample','variantcaller','vcf','tbi', 'analysis_json']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, sample_id, variantcaller, vcf, tbi, metadata_json]) + csvwriter.writerow(['analysis_type','study_id','patient','sex','sample','variantcaller','vcf','tbi',"genome_build",'experiment', 'analysis_json']) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, sample_id, variantcaller, vcf, tbi ,genome_build,experiment, metadata_json]) elif analysis_type == 'qc_metrics': with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','qc_tools','qc_file', 'analysis_json']) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','qc_tools','qc_file',"genome_build", 'experiment','analysis_json']) for fp in args.input_files: for fq in song_analysis['files']: @@ -370,8 +377,7 @@ def main(): os.symlink(os.path.abspath(fp), qc_file) qc_tools = ','.join(fq['info']['analysis_tools']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, qc_tools, qc_file, metadata_json]) - + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, qc_tools, qc_file, genome_build, experiment, metadata_json]) if __name__ == "__main__": main() diff --git a/modules/icgc-argo-workflows/score/download/main.nf b/modules/icgc-argo-workflows/score/download/main.nf index 87d0c50..792bc31 100644 --- 
a/modules/icgc-argo-workflows/score/download/main.nf +++ b/modules/icgc-argo-workflows/score/download/main.nf @@ -1,5 +1,3 @@ - - process SCORE_DOWNLOAD { tag "${analysis_id}" label 'process_medium' @@ -32,7 +30,7 @@ process SCORE_DOWNLOAD { def score_url = params.score_url_download ?: params.score_url def transport_parallel = params.transport_parallel ?: task.cpus def transport_mem = params.transport_mem ?: "2" - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_download_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.score_container_version ?: '5.8.1' """ export METADATA_URL=${song_url} @@ -48,4 +46,4 @@ process SCORE_DOWNLOAD { score-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/score/upload/main.nf b/modules/icgc-argo-workflows/score/upload/main.nf index b4aa4a4..db22f3e 100644 --- a/modules/icgc-argo-workflows/score/upload/main.nf +++ b/modules/icgc-argo-workflows/score/upload/main.nf @@ -1,5 +1,4 @@ - process SCORE_UPLOAD { tag "${analysis_id}" label 'process_medium' @@ -29,7 +28,7 @@ process SCORE_UPLOAD { def score_url = params.score_url_upload ?: params.score_url def transport_parallel = params.transport_parallel ?: task.cpus def transport_mem = params.transport_mem ?: "2" - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.score_container_version ?: '5.8.1' """ export METADATA_URL=${song_url} @@ -45,4 +44,4 @@ process SCORE_UPLOAD { score-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/get/main.nf b/modules/icgc-argo-workflows/song/get/main.nf index 22e0cb3..b476dd0 100644 --- a/modules/icgc-argo-workflows/song/get/main.nf +++ b/modules/icgc-argo-workflows/song/get/main.nf @@ -26,7 +26,7 @@ process SONG_GET { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${analysis_id}" def song_url = params.song_url_download ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_download_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.song_container_version ?: '5.0.2' """ export CLIENT_SERVER_URL=${song_url} @@ -40,4 +40,4 @@ process SONG_GET { song-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/manifest/main.nf b/modules/icgc-argo-workflows/song/manifest/main.nf index 3691416..49a209f 100644 --- a/modules/icgc-argo-workflows/song/manifest/main.nf +++ b/modules/icgc-argo-workflows/song/manifest/main.nf @@ -1,4 +1,3 @@ - process SONG_MANIFEST { tag "${analysis_id}" label 'process_single' @@ -26,7 +25,7 @@ process SONG_MANIFEST { script: def args = task.ext.args ?: '' def song_url = params.song_url_upload ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.song_container_version ?: '5.0.2' def study_id = "${meta.study_id}" """ @@ -41,4 +40,4 @@ process SONG_MANIFEST { song-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/publish/main.nf b/modules/icgc-argo-workflows/song/publish/main.nf index 903564a..ef92c7d 100644 --- a/modules/icgc-argo-workflows/song/publish/main.nf +++ 
b/modules/icgc-argo-workflows/song/publish/main.nf @@ -1,4 +1,3 @@ - process SONG_PUBLISH { tag "${analysis_id}" label 'process_single' @@ -25,7 +24,7 @@ process SONG_PUBLISH { script: def args = task.ext.args ?: '' def song_url = params.song_url_upload ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def study_id = "${meta.study_id}" def VERSION = params.song_container_version ?: '5.0.2' """ @@ -40,4 +39,4 @@ process SONG_PUBLISH { song-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/submit/main.nf b/modules/icgc-argo-workflows/song/submit/main.nf index c948338..4d63d92 100644 --- a/modules/icgc-argo-workflows/song/submit/main.nf +++ b/modules/icgc-argo-workflows/song/submit/main.nf @@ -27,7 +27,7 @@ process SONG_SUBMIT { script: def args = task.ext.args ?: '' def song_url = params.song_url_upload ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.song_container_version ?: '5.0.2' def study_id = "${meta.study_id}" """ @@ -44,4 +44,4 @@ process SONG_SUBMIT { END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df2176..f218761 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a..5f15a5f 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: @@ -14,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -28,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index e55b8d4..da03340 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,11 +4,10 @@ """Provide functions to merge multiple versions.yml files.""" +import yaml import platform from textwrap import dedent -import yaml - def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" diff --git a/modules/nf-core/cutadapt/main.nf b/modules/nf-core/cutadapt/main.nf index dd030a6..e232a70 100644 --- a/modules/nf-core/cutadapt/main.nf +++ b/modules/nf-core/cutadapt/main.nf @@ -1,11 +1,11 @@ process CUTADAPT { tag "$meta.id" - label 'process_low' + label 'process_medium' - conda "bioconda::cutadapt=3.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/cutadapt:3.4--py39h38f01e4_1' : - 'quay.io/biocontainers/cutadapt:3.4--py39h38f01e4_1' }" + 'biocontainers/cutadapt:3.4--py39h38f01e4_1' }" input: tuple val(meta), path(reads) @@ -34,4 +34,17 @@ process CUTADAPT { cutadapt: \$(cutadapt --version) END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def trimmed = meta.single_end ? "${prefix}.trim.fastq.gz" : "${prefix}_1.trim.fastq.gz ${prefix}_2.trim.fastq.gz" + """ + touch ${prefix}.cutadapt.log + touch ${trimmed} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ } diff --git a/modules/nf-core/cutadapt/meta.yml b/modules/nf-core/cutadapt/meta.yml index bcfe291..5ecfe27 100644 --- a/modules/nf-core/cutadapt/meta.yml +++ b/modules/nf-core/cutadapt/meta.yml @@ -10,7 +10,7 @@ tools: description: | Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. 
documentation: https://cutadapt.readthedocs.io/en/stable/index.html - doi: DOI:10.14806/ej.17.1.200 + doi: 10.14806/ej.17.1.200 licence: ["MIT"] input: - meta: @@ -44,3 +44,6 @@ output: authors: - "@drpatelh" - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 0699836..9e19a74 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -1,11 +1,11 @@ process FASTQC { tag "$meta.id" - label 'process_low' + label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'quay.io/biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) @@ -29,11 +29,15 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ @@ -45,7 +49,7 @@ process FASTQC { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ } diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5..ee5507e 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 68f66be..1b9f7c4 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.13" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" @@ -25,12 +25,14 @@ process MULTIQC { def args = task.ext.args ?: '' def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' """ multiqc \\ --force \\ $args \\ $config \\ $extra_config \\ + $logo \\ . 
cat <<-END_VERSIONS > versions.yml @@ -41,7 +43,7 @@ process MULTIQC { stub: """ - touch multiqc_data + mkdir multiqc_data touch multiqc_plots touch multiqc_report.html diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ebc29b2..45a9bc3 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,4 +1,4 @@ -name: MultiQC +name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC @@ -12,7 +12,6 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] - input: - multiqc_files: type: file @@ -30,14 +29,13 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" - output: - report: type: file description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: @@ -53,3 +51,8 @@ authors: - "@bunop" - "@drpatelh" - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index cf6cf9b..0c74f62 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { local_mode = false study_id = null analysis_ids = null + cleanup = true // rdpc data staging tempdir = null @@ -22,14 +23,16 @@ params { score_url = null song_url_download = null score_url_download = null + api_download_token = null song_url_upload = null score_url_upload = null + api_upload_token = null transport_parallel = null transport_mem = null song_container = "ghcr.io/overture-stack/song-client" song_container_version = "5.0.2" score_container = "ghcr.io/overture-stack/score" - score_container_version = "5.9.0" + score_container_version = "5.10.0" // Boilerplate options outdir = null diff --git a/subworkflows/icgc-argo-workflows/stage_input/main.nf b/subworkflows/icgc-argo-workflows/stage_input/main.nf index d351874..b34f760 100644 --- a/subworkflows/icgc-argo-workflows/stage_input/main.nf +++ b/subworkflows/icgc-argo-workflows/stage_input/main.nf @@ -1,130 +1,183 @@ include { SONG_SCORE_DOWNLOAD } from '../../icgc-argo-workflows/song_score_download/main' -include { PREP_SAMPLE } from '../../../modules/icgc-argo-workflows/prep/sample/main.nf' +include { PREP_SAMPLE } from '../../../modules/icgc-argo-workflows/prep/sample/main' +include { CHECKINPUT } from '../../../modules/icgc-argo-workflows/checkinput/main' workflow STAGE_INPUT { take: - study_analysis // channel: study_id, analysis_id - + study_id // channel: study_id + analysis_ids // channel: analysis_ids + samplesheet // channel: samplesheet + main: ch_versions = Channel.empty() - SONG_SCORE_DOWNLOAD( study_analysis ) - ch_versions = ch_versions.mix(SONG_SCORE_DOWNLOAD.out.versions) + //If local_mode is specified do not upload To RDPC + if (params.local_mode){ + upRdpc_flag=false + } else { + //Otherwise only upload to RDPC is API_Token is present + if (params.api_token || params.api_upload_token){ + upRdpc_flag=true + } else { + upRdpc_flag=false + } + } + + //Apply appropriate action if API_TOKEN is supplied + if (params.api_token || params.api_download_token){ + //If IDs are present proceed with download otherwise exit + if (study_id && analysis_ids){ + + Channel.from(analysis_ids.split(",")) + .map{analysis_id -> tuple([study_id,analysis_id])} + .set{ch_study_analysis} + + SONG_SCORE_DOWNLOAD( ch_study_analysis ) + ch_versions = ch_versions.mix(SONG_SCORE_DOWNLOAD.out.versions) 
- PREP_SAMPLE ( SONG_SCORE_DOWNLOAD.out.analysis_files ) - ch_versions = ch_versions.mix(PREP_SAMPLE.out.versions) + PREP_SAMPLE ( SONG_SCORE_DOWNLOAD.out.analysis_files ) + ch_versions = ch_versions.mix(PREP_SAMPLE.out.versions) - PREP_SAMPLE.out.sample_sheet_csv + analysis_input = PREP_SAMPLE.out.sample_sheet_csv + } else { + exit 1, "Using using API_Token, both a study_id and analysis_ids must be specified." + } + } else { + //If no API_Token, check for local samplesheet + if (samplesheet){ + CHECKINPUT(file(samplesheet,checkIfExists: true),workflow.Manifest.name) + ch_versions = ch_versions.mix(CHECKINPUT.out.versions) + + analysis_input = CHECKINPUT.out.csv + } else { + exit 1, "When no API_TOKEN is provided, a local samplesheet must be provided." + } + } + //Collect meta,data files and analysis_json + //Two channels for meta,files and meta,analysis_json will be refined afterwards + analysis_input .collectFile(keepHeader: true, name: 'sample_sheet.csv') .splitCsv(header:true) .map{ row -> - if (row.analysis_type == "sequencing_experiment") { - tuple([ - id:"${row.sample}-${row.lane}".toString(), - study_id:row.study_id, - patient:row.patient, - sex:row.sex, - status:row.status.toInteger(), - sample:row.sample, - read_group:row.read_group.toString(), - data_type:'fastq', - size:1, - numLanes:row.read_group_count], - [file(row.fastq_1), file(row.fastq_2)]) - } - else if (row.analysis_type == "sequencing_alignment") { + if (row.analysis_type == "sequencing_experiment" && row.single_end.toLowerCase() == 'false') { + tuple([ + analysis_type : row.analysis_type, + id:"${row.sample}-${row.lane}".toString(), + study_id:row.study_id, + patient:row.patient, + sex:row.sex, + status:row.status.toInteger(), + sample:row.sample, + read_group:row.read_group.toString(), + data_type:'fastq', + numLanes:row.read_group_count, + experiment:row.experiment, + single_end : row.single_end.toBoolean() + ], + [file(row.fastq_1), file(row.fastq_2)], + row.analysis_json + ) + } else if (row.analysis_type == "sequencing_experiment" && row.single_end.toLowerCase() == 'true') { + tuple([ + analysis_type : row.analysis_type, + id:"${row.sample}-${row.lane}".toString(), + study_id:row.study_id, + patient:row.patient, + sex:row.sex, + status:row.status.toInteger(), + sample:row.sample, + read_group:row.read_group.toString(), + data_type:'fastq', + numLanes:row.read_group_count, + experiment:row.experiment, + single_end : row.single_end.toBoolean() + ], + [file(row.fastq_1)], + row.analysis_json + ) + } else if (row.analysis_type == "sequencing_alignment") { tuple([ + analysis_type : row.analysis_type, id:"${row.sample}".toString(), study_id:row.study_id, patient:row.patient, sample:row.sample, sex:row.sex, - status:row.status.toInteger(), + status:row.status.toInteger(), + genome_build:row.genome_build, + experiment:row.experiment, data_type:'cram'], - file(row.cram), file(row.crai)) + [file(row.cram), file(row.crai)], + row.analysis_json + ) } else if (row.analysis_type == "variant_calling") { tuple([ + analysis_type : row.analysis_type, id:"${row.sample}".toString(), study_id:row.study_id, patient:row.patient, - sample:row.sample, + sample:row.sample, + sex:row.sex, + status:row.status.toInteger(), variantcaller:row.variantcaller, - data_type:'vcf'], file(row.vcf), file(row.tbi)) + genome_build:row.genome_build, + experiment:row.experiment, + data_type:'vcf'], + [file(row.vcf), file(row.tbi)], + row.analysis_json + ) } else if (row.analysis_type == "qc_metrics") { tuple([ + analysis_type : row.analysis_type, 
id:"${row.sample}".toString(), study_id:row.study_id, patient:row.patient, sample:row.sample, sex:row.sex, status:row.status.toInteger(), - qc_tools:row.qc_tools, - data_type:'tgz'], file(row.qc_file)) + qc_tools:row.qc_tools, + genome_build:row.genome_build, + experiment:row.experiment, + data_type:'tgz'], + [file(row.qc_file)], + row.analysis_json + ) } } .set { ch_input_sample } - PREP_SAMPLE.out.sample_sheet_csv - .collectFile(keepHeader: true) - .splitCsv(header:true) - .map{ row -> - if (row.analysis_type == "sequencing_experiment") { - tuple([ - id:"${row.sample}-${row.lane}".toString(), - study_id:row.study_id, - patient:row.patient, - sex:row.sex, - status:row.status.toInteger(), - sample:row.sample, - read_group:row.read_group.toString(), - data_type:'json', - size:1, - numLanes:row.read_group_count], - file(row.analysis_json)) - } - else if (row.analysis_type == "sequencing_alignment") { - tuple([ - id:"${row.sample}".toString(), - study_id:row.study_id, - patient:row.patient, - sample:row.sample, - sex:row.sex, - status:row.status.toInteger(), - data_type:'json'], - file(row.analysis_json)) - } - else if (row.analysis_type == "variant_calling") { - tuple([ - id:"${row.sample}".toString(), - study_id:row.study_id, - patient:row.patient, - sample:row.sample, - variantcaller:row.variantcaller, - data_type:'json'], file(row.analysis_json)) - } - else if (row.analysis_type == "qc_metrics") { - tuple([ - id:"${row.sample}".toString(), - study_id:row.study_id, - patient:row.patient, - sample:row.sample, - sex:row.sex, - status:row.status.toInteger(), - qc_tools:row.qc_tools, - data_type:'json'], file(row.analysis_json)) + //We want to still have meta when analysis_json doesn't exist + ch_input_sample.map{ meta,files,analysis -> + if (analysis){ + tuple([meta,file(analysis)]) + } else { + tuple([meta,null]) } } - .set { ch_meta_analysis } + .unique{it[1]} + .set{ ch_meta_analysis } + + //Reorganize files as "sequencing_experiment expected input is tuple while other types are flat" + ch_input_sample.map{ meta,files,analysis -> + if (meta.analysis_type == "sequencing_experiment"){ + tuple([meta,files]) + } else if (meta.analysis_type == "sequencing_alignment") { + tuple([meta,files[0],files[1]]) + } else if (meta.analysis_type == "variant_calling") { + tuple([meta,files[0],files[1]]) + } else if (meta.analysis_type == "qc_metrics") { + tuple([meta,files[0]]) + } + }.set{ch_meta_files} emit: - analysis_json = SONG_SCORE_DOWNLOAD.out.analysis_json // channel: [ analysis_json ] - meta_analysis = ch_meta_analysis // channel: [ val(meta), analysis_json] - sample_files = ch_input_sample // channel: [ val(meta), [ files ] ] - input_files = SONG_SCORE_DOWNLOAD.out.files // channel: [files] + meta_analysis = ch_meta_analysis // channel: [ val(meta), analysis_json] + meta_files = ch_meta_files // channel: [ val(meta), [ files ] ] + upRdpc = upRdpc_flag versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file diff --git a/subworkflows/icgc-argo-workflows/stage_input/meta.yml b/subworkflows/icgc-argo-workflows/stage_input/meta.yml index 9c565cc..792a5ea 100644 --- a/subworkflows/icgc-argo-workflows/stage_input/meta.yml +++ b/subworkflows/icgc-argo-workflows/stage_input/meta.yml @@ -9,7 +9,8 @@ keywords: modules: - song/get - score/download - - prep_sample + - prep/sample + - checkinput input: - study_id: type: string @@ -38,4 +39,4 @@ output: pattern: "versions.yml" authors: - "@lindaxiang" - \ No newline at end of file + diff --git a/workflows/prealnqc.nf 
b/workflows/prealnqc.nf index 75ae34b..d5f1cca 100644 --- a/workflows/prealnqc.nf +++ b/workflows/prealnqc.nf @@ -28,7 +28,7 @@ ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.mu IMPORT LOCAL MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { INPUT_CHECK } from '../subworkflows/local/input_check' +// include { INPUT_CHECK } from '../subworkflows/local/input_check' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,32 +61,16 @@ workflow PREALNQC { ch_versions = Channel.empty() - // Read in samplesheet, validate and stage input files - if (params.local_mode) { - if (params.input) { - ch_input = Channel.fromPath(params.input) - ch_input_sample = INPUT_CHECK (ch_input).reads - } - else { exit 1, 'Input samplesheet must be specified for local mode!' } - } else if (params.study_id && params.analysis_ids) { - ch_study = Channel.of(params.study_id) - ch_analysis_ids = Channel.fromList(params.analysis_ids.split(',') as List) - ch_input = ch_study.combine(ch_analysis_ids) - - STAGE_INPUT(ch_input) - ch_input_sample = STAGE_INPUT.out.sample_files - ch_metadata = STAGE_INPUT.out.meta_analysis - ch_versions = ch_versions.mix(STAGE_INPUT.out.versions) - - } else { exit 1, 'study_id & analysis_ids must be specified for rdpc mode!' } - + // Stage input files + STAGE_INPUT(params.study_id, params.analysis_ids, params.input) + ch_versions = ch_versions.mix(STAGE_INPUT.out.versions) // MODULE: Run FastQC - FASTQC( ch_input_sample ) + FASTQC( STAGE_INPUT.out.meta_files ) ch_versions = ch_versions.mix(FASTQC.out.versions) - // MODULE: Perform cutadpat - CUTADAPT( ch_input_sample ) + // // MODULE: Perform cutadpat + CUTADAPT( STAGE_INPUT.out.meta_files ) ch_versions = ch_versions.mix(CUTADAPT.out.versions) // Gather QC files @@ -109,7 +93,7 @@ workflow PREALNQC { // Group the QC files by sampleId ch_qc_files .transpose() - .map { meta, files -> [[id: meta.sample], files] } + .map { meta, files -> [[id: meta.sample, study_id: meta.study_id], files] } .groupTuple() .set{ ch_meta_qcfiles } @@ -118,49 +102,55 @@ workflow PREALNQC { // Collect Software Versions CUSTOM_DUMPSOFTWAREVERSIONS (ch_versions.unique{ it.text }.collectFile(name: 'collated_versions.yml')) - - // upload QC files and metadata to song/score - if (!params.local_mode) { - // make metadata and files match - ch_metadata.map { meta, metadata -> [[id: meta.sample], metadata]} - .unique().set{ ch_meta_metadata } - - ch_meta_metadata.join(ch_meta_qcfiles).join(PREP_METRICS.out.metrics_json) - .set { ch_metadata_upload } - - // // generate payload - PAYLOAD_QCMETRICS( - ch_metadata_upload, CUSTOM_DUMPSOFTWAREVERSIONS.out.yml.collect()) - - // SONG_SCORE_UPLOAD(PAYLOAD_QCMETRICS.out.payload_files) - - // // cleanup - // // Gather files to remove - // ch_files = Channel.empty() - // ch_files = ch_files.mix(STAGE_INPUT.out.sample_files) - // ch_files = ch_files.mix(STAGE_INPUT.out.analysis_meta) - // ch_files = ch_files.mix(FASTQC.out.zip) - // ch_files = ch_files.mix(FASTQC.out.html) - // ch_files = ch_files.mix(CUTADAPT.out.log) - // ch_files = ch_files.mix(CUTADAPT.out.reads) - // ch_files.map{ meta, files -> files} - // .unique() - // .set { ch_files_to_remove1 } - - // PAYLOAD_QCMETRICS.out.payload_files - // .map {meta, payload, files -> files} - // .unique() - // .set { ch_files_to_remove2 } - - // ch_files_to_remove = Channel.empty() - // ch_files_to_remove = ch_files_to_remove.mix(STAGE_INPUT.out.input_files) - 
// ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.report) - // ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.data) - // ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove1) - // ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove2) - // CLEANUP(ch_files_to_remove.unique().collect(), SONG_SCORE_UPLOAD.out.analysis_id) + + // Combine channels to determine upload status and payload creation + // make metadata and files match + STAGE_INPUT.out.meta_analysis.map { meta, metadata -> [[id: meta.sample, study_id: meta.study_id], metadata]} + .unique().set{ ch_meta_metadata } + + ch_meta_metadata.join(ch_meta_qcfiles).join(PREP_METRICS.out.metrics_json) + .set { ch_metadata_files } + + STAGE_INPUT.out.upRdpc.combine(ch_metadata_files) + .map{upRdpc, meta, metadata, files, metrics -> + [[id: meta.id, study_id: meta.study_id, upRdpc: upRdpc], + metadata, files, metrics]} + .branch{ + upload: it[0].upRdpc + }.set{ch_metadata_files_status} + + // generate payload + PAYLOAD_QCMETRICS( + ch_metadata_files_status.upload, CUSTOM_DUMPSOFTWAREVERSIONS.out.yml.collect()) + + SONG_SCORE_UPLOAD(PAYLOAD_QCMETRICS.out.payload_files) + + if (params.cleanup) { + // cleanup + // Gather files to remove + ch_files = Channel.empty() + ch_files = ch_files.mix(STAGE_INPUT.out.meta_files) + ch_files = ch_files.mix(STAGE_INPUT.out.meta_analysis) + ch_files = ch_files.mix(FASTQC.out.zip) + ch_files = ch_files.mix(FASTQC.out.html) + ch_files = ch_files.mix(CUTADAPT.out.log) + ch_files = ch_files.mix(CUTADAPT.out.reads) + ch_files.map{ meta, files -> files} + .unique() + .set { ch_files_to_remove1 } + + PAYLOAD_QCMETRICS.out.payload_files + .map {meta, payload, files -> files} + .unique() + .set { ch_files_to_remove2 } + + ch_files_to_remove = Channel.empty() + ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.report) + ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.data) + ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove1) + ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove2) + CLEANUP(ch_files_to_remove.unique().collect(), SONG_SCORE_UPLOAD.out.analysis_id) } - } /* From a4bfeb5dcab2ce811f2524bad042ddca1ce7d28f Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Sat, 24 Feb 2024 11:05:20 -0500 Subject: [PATCH 2/6] update README.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9ca5e4b..14704d2 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,20 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 2. Install [`Docker`](https://docs.docker.com/engine/installation/). -3. Download the pipeline and test it on a minimal dataset with a single command: +3. Test the workflow running in `Local` mode on a minimal dataset with a single command: ```bash nextflow run icgc-argo-workflows/prealnqc -profile test,standard ``` -4. Start running your own analysis! +4. Test the workflow running in `RDPC` mode with a single command if you have access to `RDPC-QA` env and have your valid api_token available: ```bash - nextflow run icgc-argo-workflows/prealnqc --input samplesheet.csv --outdir -profile standard + nextflow run icgc-argo-workflows/prealnqc -profile rdpc_qa,test_rdpc_qa,standard --api_token + ``` + +5. Start running your own analysis! 
+ ```bash + nextflow run icgc-argo-workflows/prealnqc -profile standard --input samplesheet.csv --outdir ``` ## Pipeline summary From e20d5c42f63ed75d0730449154c024964f70b554 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Thu, 21 Mar 2024 08:28:30 -0400 Subject: [PATCH 3/6] add updated modules --- .../icgc-argo-workflows/checkinput/main.nf | 56 +++ .../icgc-argo-workflows/checkinput/meta.yml | 43 ++ .../checkinput/resources/usr/bin/dnaaln.py | 476 ++++++++++++++++++ .../checkinput/resources/usr/bin/dnaalnqc.py | 356 +++++++++++++ .../resources/usr/bin/germlinevar.py | 352 +++++++++++++ .../checkinput/resources/usr/bin/prealnqc.py | 416 +++++++++++++++ .../dumpsoftwareversions/environment.yml | 7 + modules/nf-core/cutadapt/environment.yml | 7 + modules/nf-core/fastqc/environment.yml | 7 + modules/nf-core/multiqc/environment.yml | 7 + 10 files changed, 1727 insertions(+) create mode 100644 modules/icgc-argo-workflows/checkinput/main.nf create mode 100644 modules/icgc-argo-workflows/checkinput/meta.yml create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py create mode 100644 modules/nf-core/custom/dumpsoftwareversions/environment.yml create mode 100644 modules/nf-core/cutadapt/environment.yml create mode 100644 modules/nf-core/fastqc/environment.yml create mode 100644 modules/nf-core/multiqc/environment.yml diff --git a/modules/icgc-argo-workflows/checkinput/main.nf b/modules/icgc-argo-workflows/checkinput/main.nf new file mode 100644 index 0000000..45eff91 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/main.nf @@ -0,0 +1,56 @@ +process CHECKINPUT { + tag "$samplesheet" + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + + input: + path samplesheet + val workflow_name + + output: + path 'samplesheet.valid.csv', emit: csv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + case '$workflow_name' in + 'Pre Alignment QC') + echo $workflow_name detected; + prealnqc.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + 'DNA Alignment QC') + dnaalnqc.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + 'DNA Alignment') + dnaaln.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + 'Germline Variant Call') + germlinevar.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + *) + echo -n "Unknown workflow" + exit 1 + ;; + esac + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/meta.yml b/modules/icgc-argo-workflows/checkinput/meta.yml new file mode 100644 index 0000000..f19a5f7 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/meta.yml @@ -0,0 +1,43 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkinput" +## TODO nf-core: Add a description of the module and list keywords +description: write your description here +keywords: + - sort + - example + - genomics +tools: + - "checkinput": + ## TODO nf-core: Add a description and other details for the software below + description: "" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: "" + +## TODO nf-core: Add a description of all of the variables used as input +input: + # + ## TODO nf-core: Delete / customise this example input + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + +## TODO nf-core: Add a description of all of the variables used as output +output: + # + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + ## TODO nf-core: Delete / customise this example output + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + +authors: + - "@edsu7" diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py new file mode 100755 index 0000000..e18de9b --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. 
+ + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + lane_col = 'lane', + fastq_1_col = 'fastq_1', + fastq_2_col = 'fastq_2', + library_name_col = 'library_name', + platform_unit_col = 'platform_unit', + platform_col = 'platform', + sequencing_center_col = 'sequencing_center', + sequencing_date_col = 'sequencing_date', + platform_model_col = 'platform_model', + single_end_col = 'single_end', + read_group_count_col = 'read_group_count', + experiment_col = 'experiment', + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). + + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._lane_col = lane_col + self._fastq_1_col = fastq_1_col + self._fastq_2_col = fastq_2_col + self._library_name_col = library_name_col + self._platform_unit_col = platform_unit_col + self._platform_col = platform_col + self._sequencing_center_col = sequencing_center_col + self._sequencing_date_col = sequencing_date_col + self._platform_model_col = platform_model_col + self._single_end_col = single_end_col + self._read_group_count_col = read_group_count_col + self._experiment_col = experiment_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). 
+ + """ + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_lane(row) + self._validate_single_end(row) + self._validate_fastq_1(row) + self._validate_fastq_2(row) + self._validate_library_name(row) + self._validate_platform_unit(row) + self._validate_platform_col(row) if row.get(self._platform_col) else "" + self._validate_sequencing_center_col(row) if row.get(self._sequencing_center_col) else "" + self._validate_sequencing_date_col(row) if row.get(self._sequencing_date_col) else "" + self._validate_platform_model_col(row) if row.get(self._platform_model_col) else "" + self._validate_read_group_count(row) + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_experiment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "lane" : row[self._lane_col], + "fastq_1" : row[self._fastq_1_col], + "fastq_2" : row[self._fastq_2_col] if row.get(self._fastq_2_col) else "NO_FILE", + "single_end" : row[self._single_end_col].lower(), + "read_group_count" : row[self._read_group_count_col], + "experiment" : row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + read_group_info=[] + description=[] + + for col in [ + 'experiment', + 'study_id', + 'experiment', + 'patient', + 'sample', + 'status' + ]: + if tmp_dict.get(col): + if col=='status': + if tmp_dict['status']==1: + description.append("Tumour") + else: + description.append("Normal") + continue + description.append(tmp_dict[col]) + + for col,id in zip( + [ + self._lane_col, + self._sample_col, + self._library_name_col, + self._platform_unit_col, + self._sequencing_center_col, + self._platform_col, + self._platform_model_col, + self._sequencing_date_col + ], + ["ID","SM","LB","PU","CN","PL","PM","DT"]): + if row.get(col): + read_group_info.append("%s:%s" % (id,row[col])) + + tmp_dict['read_group']="'@RG\\t%s\\tDS:%s'" % ("\\t".join(read_group_info),"|".join(description)) + + self._seen.append(row) + self.modified.append(tmp_dict) + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_experiment": + raise AssertionError("analysis_type for \"DNA Alignment\" should be \"sequencing_experiment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient 
is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + + def _validate_lane(self, row): + """Assert that expected lane is correct.""" + if len(row[self._lane_col]) <= 0: + raise AssertionError("'lane' input is required.") + + + def _validate_fastq_1(self, row): + """Assert that expected fastq_1 is correct.""" + if len(row[self._fastq_1_col]) <= 0: + raise AssertionError("'fastq_1' input is required.") + if not ( + row[self._fastq_1_col].endswith(".fq.gz") or + row[self._fastq_1_col].endswith(".fastq.gz") or + row[self._fastq_1_col].endswith(".bam") + ): + raise AssertionError("'fastq_1' incorrect format detected.") + + + def _validate_fastq_2(self, row): + """Assert that expected fastq_2 is correct.""" + if row[self._single_end_col].lower()=="true": + return + + if len(row[self._fastq_2_col]) <= 0: + raise AssertionError("'fastq_2' input is required.") + if row[self._fastq_2_col].endswith(".fastq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fastq.gz","").replace("r2.fastq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fastq.gz","").replace("r1.fastq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".fq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fq.gz","").replace("r2.fq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fq.gz","").replace("r1.fq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".bam"): + if row[self._fastq_2_col]!=row[self._fastq_1_col]: + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + + def _validate_single_end(self, row): + """Assert that expected single_end is correct.""" + if len(row[self._single_end_col]) <= 0: + raise AssertionError("'single_end' input is required.") + if row[self._single_end_col].lower()!="true" and row[self._single_end_col].lower()!="false": + raise AssertionError("'single_end' should be specifed as \"True\" or \"False\".") + + + def _validate_read_group_count(self, row): + """Assert that expected read_group_count is correct.""" + if len(row[self._read_group_count_col]) <= 0: + raise AssertionError("'read_group_count' input is required.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: 
\"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def _validate_library_name(self, row): + """Assert that expected library_name is correct.""" + if len(row[self._library_name_col]) <= 0: + raise AssertionError("'library_name' input is required.") + + def _validate_platform_unit(self, row): + """Assert that expected platform_unit is correct.""" + if len(row[self._platform_unit_col]) <= 0: + raise AssertionError("'platform_unit' input is required.") + + def _validate_platform_col(self, row): + """Assert that expected platform is correct.""" + if len(row[self._platform_col]) <= 0: + raise AssertionError("'platform' input is required.") + + def _validate_sequencing_center_col(self, row): + """Assert that expected sequencing_center is correct.""" + if len(row[self._sequencing_center_col]) <= 0: + raise AssertionError("'sequencing_center' input is required.") + + def _validate_sequencing_date_col(self, row): + """Assert that expected sequencing_date is correct.""" + if len(row[self._sequencing_date_col]) <= 0: + raise AssertionError("'sequencing_date' input is required.") + + def _validate_platform_model_col(self, row): + """Assert that expected platform_model is correct.""" + if len(row[self._platform_model_col]) <= 0: + raise AssertionError("'platform_model' input is required.") + + def validate_unique_fastq(self): + """ + Assert that the combination of FASTQ filename is unique. + """ + tmp=[z['fastq_1'] for z in self.modified]+[z['fastq_2'] for z in self.modified] + + for iter in range(0,len(tmp)): + current_val=tmp.pop(0) + if current_val.endswith(".fastq.gz"): + continue + if current_val.endswith(".fq.gz"): + continue + if current_val=='NO_FILE': + continue + if current_val in tmp: + raise AssertionError("Errors multiple instances of file '%s' detected" % (current_val)) + sys.exit(1) + else: + raise AssertionError("Unexpected file format detected for '%s'" % (current_val)) + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. 
_text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + required_columns = {"sample","lane","fastq_1","fastq_2","single_end","read_group_count","library_name","platform_unit"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json","platform","sequencing_center","sequencing_date","platform_model"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_fastq() + for col in["sample","study_id","sex","patient","experiment","read_group_count","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["lane"]: + checker.validate_unique_values(col) + + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
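Usage sketch (illustrative file names; in the pipeline the wrapping checkinput
module normally supplies both paths):

    python <this_script>.py sample_sheet.csv sample_sheet.checked.csv -l INFO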
+ +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,C0HVY.2,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R1.fq.gz,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R2.fq.gz,'@RG\\tID:C0HVY.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8a\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RE2.1,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R2.fq.gz,'@RG\\tID:D0RE2.1\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8b\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RH0.2,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R2.fq.gz,'@RG\\tID:D0RH0.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8c\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json +''', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py new file mode 100755 index 0000000..4e111cd --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. 
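    Example:
        A minimal, illustrative use on one parsed row (``row`` is a dict from
        ``csv.DictReader`` holding the expected columns); defaults such as the
        genome build are filled in during the transform::

            checker = RowChecker()
            checker.validate_and_transform(row)
            checker.modified[-1]["genome_build"]   # "GRCh38" unless provided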
+ + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + #sample_col="sample", + #first_col="bam_cram", + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + cram_col = 'cram', + crai_col = 'crai', + experiment_col = 'experiment', + genome_build_col = "genome_build", + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). + + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._cram_col = cram_col + self._crai_col = crai_col + self._experiment_col = experiment_col + self._genome_build_col = genome_build_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). 
+ + """ + #{"analysis_type","study_id","patient","sex","status","sample","cram","crai","analysis_json"} + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_cram(row) + self._validate_crai(row) + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_genome_build(row) if row.get(self._genome_build_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_alignment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "cram" : row[self._cram_col], + "crai" : row[self._crai_col], + "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + self._seen.append(row) + self.modified.append(tmp_dict) + + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_alignment": + raise AssertionError("analysis_type for \"DNA Alignment QC\" should be \"sequencing_alignment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + def _validate_cram(self, row): + """Assert that expected cram is correct.""" + if len(row[self._cram_col]) <= 0: + raise AssertionError("'cram' input is required.") + if not row[self._cram_col].endswith(".cram"): + raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + + def 
_validate_crai(self, row): + """Assert that expected crai is correct.""" + if len(row[self._crai_col]) <= 0: + raise AssertionError("'crai' input is required.") + if not row[self._crai_col].endswith(".crai"): + raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") + if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): + raise AssertionError("'cram' and 'crai' file name bodies do not match.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: \"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def _validate_genome_build(self, row): + """Assert that expected genome_build is correct.""" + if len(row[self._genome_build_col]) <= 0: + raise AssertionError("'genome_build' input is required.") + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
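    Note:
        Missing conditional columns are filled with defaults before the output is
        written: study_id -> "LOCAL", patient -> the sample name, sex -> "NA",
        status -> "0", experiment -> "WGS", genome_build -> "GRCh38",
        analysis_json -> None.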
+ + Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + """ + required_columns = {"sample","cram","crai"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + + for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["cram","crai"]: + checker.validate_unique_values(col) + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
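A programmatic sketch equivalent to running this script (illustrative paths):

    from pathlib import Path
    check_samplesheet(Path("samplesheet.csv"), Path("samplesheet.checked.csv"))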
+ +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + ''', + + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py new file mode 100755 index 0000000..caccb36 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + cram_col = 'cram', + crai_col = 'crai', + experiment_col = 'experiment', + genome_build_col = "genome_build", + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). 
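        Example:
            Constructing the checker with explicit column names (illustrative; these
            are already the defaults)::

                checker = RowChecker(cram_col="cram", crai_col="crai")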
+ + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._cram_col = cram_col + self._crai_col = crai_col + self._experiment_col = experiment_col + self._genome_build_col = genome_build_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + #{"analysis_type","study_id","patient","sex","status","sample","cram","crai","analysis_json"} + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_cram(row) + self._validate_crai(row) + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_genome_build(row) if row.get(self._genome_build_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_alignment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "cram" : row[self._cram_col], + "crai" : row[self._crai_col], + "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + self._seen.append(row) + self.modified.append(tmp_dict) + + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_alignment": + raise AssertionError("analysis_type for \"Germline Variant Call\" should be \"sequencing_alignment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + 
raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + def _validate_cram(self, row): + """Assert that expected cram is correct.""" + if len(row[self._cram_col]) <= 0: + raise AssertionError("'cram' input is required.") + if not row[self._cram_col].endswith(".cram"): + raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + + def _validate_crai(self, row): + """Assert that expected crai is correct.""" + if len(row[self._crai_col]) <= 0: + raise AssertionError("'crai' input is required.") + if not row[self._crai_col].endswith(".crai"): + raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") + if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): + raise AssertionError("'cram' and 'crai' file name bodies do not match.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: \"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def _validate_genome_build(self, row): + """Assert that expected genome_build is correct.""" + if len(row[self._genome_build_col]) <= 0: + raise AssertionError("'genome_build' input is required.") + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. 
_text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + + Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + """ + required_columns = {"sample","cram","crai"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + + for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["cram","crai"]: + checker.validate_unique_values(col) + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
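Note on file naming (illustrative): the 'crai' basename must be the 'cram' basename
plus '.crai', e.g.

    SA622744.aln.cram       ->  body "SA622744.aln"
    SA622744.aln.cram.crai  ->  body "SA622744.aln"   (bodies match, row accepted)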
+ +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + ''', + + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py new file mode 100755 index 0000000..f9d5704 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + lane_col = 'lane', + fastq_1_col = 'fastq_1', + fastq_2_col = 'fastq_2', + single_end_col = 'single_end', + read_group_count_col = 'read_group_count', + experiment_col = 'experiment', + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). 
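        Example:
            Illustrative expectations per row: a paired-end row carries
            single_end="False" with both fastq_1 and fastq_2 set, while a
            single-end row sets single_end="True", in which case fastq_2 is
            not validated.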
+ + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._lane_col = lane_col + self._fastq_1_col = fastq_1_col + self._fastq_2_col = fastq_2_col + self._single_end_col = single_end_col + self._read_group_count_col = read_group_count_col + self._experiment_col = experiment_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_lane(row) + self._validate_single_end(row) + self._validate_fastq_1(row) + self._validate_fastq_2(row) + self._validate_read_group_count(row) if row.get(self._read_group_count_col) else "" + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_experiment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "lane" : row[self._lane_col], + "fastq_1" : row[self._fastq_1_col], + "fastq_2" : row[self._fastq_2_col], + "single_end" : row[self._single_end_col].lower(), + "read_group_count" : row[self._read_group_count_col] if row.get(self._read_group_count_col) else None, + "experiment" : row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + self._seen.append(row) + self.modified.append(tmp_dict) + + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_experiment": + raise AssertionError("analysis_type for \"Pre Alignment QC\" should be \"sequencing_experiment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one 
of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + + def _validate_lane(self, row): + """Assert that expected lane is correct.""" + if len(row[self._lane_col]) <= 0: + raise AssertionError("'lane' input is required.") + + + def _validate_fastq_1(self, row): + """Assert that expected fastq_1 is correct.""" + if len(row[self._fastq_1_col]) <= 0: + raise AssertionError("'fastq_1' input is required.") + if not ( + row[self._fastq_1_col].endswith(".fq.gz") or + row[self._fastq_1_col].endswith(".fastq.gz") or + row[self._fastq_1_col].endswith(".bam") + ): + raise AssertionError("'fastq_1' incorrect format detected.") + + + def _validate_fastq_2(self, row): + """Assert that expected fastq_2 is correct.""" + if row[self._single_end_col].lower()=="true": + return + + if len(row[self._fastq_2_col]) <= 0: + raise AssertionError("'fastq_2' input is required.") + if row[self._fastq_2_col].endswith(".fastq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fastq.gz","").replace("r2.fastq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fastq.gz","").replace("r1.fastq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".fq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fq.gz","").replace("r2.fq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fq.gz","").replace("r1.fq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".bam"): + if row[self._fastq_2_col]!=row[self._fastq_1_col]: + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + + + def _validate_single_end(self, row): + """Assert that expected single_end is correct.""" + if len(row[self._single_end_col]) <= 0: + raise AssertionError("'single_end' input is required.") + if row[self._single_end_col].lower()!="true" and row[self._single_end_col].lower()!="false": + raise AssertionError("'single_end' should be specifed as \"True\" or \"False\".") + + + def _validate_read_group_count(self, row): + """Assert that expected read_group_count is correct.""" + if len(row[self._read_group_count_col]) <= 0: + raise AssertionError("'read_group_count' input is required.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: \"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def validate_unique_fastq(self): 
+ """ + Assert that the combination of FASTQ filename is unique. + """ + tmp=[z['fastq_1'] for z in self.modified]+[z['fastq_2'] for z in self.modified] + + for iter in range(0,len(tmp)): + current_val=tmp.pop(0) + if current_val.endswith(".fastq.gz"): + continue + if current_val.endswith(".fq.gz"): + continue + if current_val=='NO_FILE': + continue + if current_val in tmp: + raise AssertionError("Errors multiple instances of file '%s' detected" % (current_val)) + sys.exit(1) + else: + raise AssertionError("Unexpected file format detected for '%s'" % (current_val)) + + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
+ + Example: + This function checks that the samplesheet follows the following structure, + see also the `viral recon samplesheet`_:: + + analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,C0HVY.2,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R1.fq.gz,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R2.fq.gz,'@RG\tID:C0HVY.2\tSM:SA624380\tLB:Pond-147580\tPU:74_8a\tPI:298\tCN:EXT\tPL:ILLUMINA\tPM:HiSeq 2000\tDT:2014-12-12\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RE2.1,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R2.fq.gz,'@RG\tID:D0RE2.1\tSM:SA624380\tLB:Pond-147580\tPU:74_8b\tPI:298\tCN:EXT\tPL:ILLUMINA\tPM:HiSeq 2000\tDT:2014-12-12\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RH0.2,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R2.fq.gz,'@RG\tID:D0RH0.2\tSM:SA624380\tLB:Pond-147580\tPU:74_8c\tPI:298\tCN:EXT\tPL:ILLUMINA\tPM:HiSeq 2000\tDT:2014-12-12\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + + """ + required_columns = {"sample","lane","fastq_1","fastq_2","single_end"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_fastq() + for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["lane"]: + checker.validate_unique_values(col) + + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. 
Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,C0HVY.2,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R1.fq.gz,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R2.fq.gz,'@RG\\tID:C0HVY.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8a\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RE2.1,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R2.fq.gz,'@RG\\tID:D0RE2.1\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8b\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RH0.2,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R2.fq.gz,'@RG\\tID:D0RH0.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8c\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json +''', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 0000000..9b3272b --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/cutadapt/environment.yml 
b/modules/nf-core/cutadapt/environment.yml new file mode 100644 index 0000000..d32a8f9 --- /dev/null +++ b/modules/nf-core/cutadapt/environment.yml @@ -0,0 +1,7 @@ +name: cutadapt +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cutadapt=3.4 diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 0000000..1787b38 --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 0000000..7625b75 --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 From 055b57a9053b04ea10de841dd819ded0e8333855 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Tue, 9 Apr 2024 12:38:32 -0400 Subject: [PATCH 4/6] update stage_input fixes --- modules.json | 28 ++++-- .../checkinput/resources/usr/bin/dnaalnqc.py | 53 ++++++----- .../resources/usr/bin/germlinevar.py | 51 ++++++----- .../checkinput/resources/usr/bin/prealnqc.py | 5 +- .../prep/sample/resources/usr/bin/main.py | 16 ++-- .../samtools/index/environment.yml | 8 ++ .../samtools/index/main.nf | 48 ++++++++++ .../samtools/index/meta.yml | 57 ++++++++++++ .../tabix/tabix/environment.yml | 9 ++ .../icgc-argo-workflows/tabix/tabix/main.nf | 42 +++++++++ .../icgc-argo-workflows/tabix/tabix/meta.yml | 45 ++++++++++ .../icgc-argo-workflows/stage_input/main.nf | 90 ++++++++++++++----- 12 files changed, 356 insertions(+), 96 deletions(-) create mode 100644 modules/icgc-argo-workflows/samtools/index/environment.yml create mode 100644 modules/icgc-argo-workflows/samtools/index/main.nf create mode 100644 modules/icgc-argo-workflows/samtools/index/meta.yml create mode 100644 modules/icgc-argo-workflows/tabix/tabix/environment.yml create mode 100644 modules/icgc-argo-workflows/tabix/tabix/main.nf create mode 100644 modules/icgc-argo-workflows/tabix/tabix/meta.yml diff --git a/modules.json b/modules.json index dd1f6b2..2d60334 100644 --- a/modules.json +++ b/modules.json @@ -6,8 +6,8 @@ "modules": { "icgc-argo-workflows": { "checkinput": { - "branch": "stage_input_fixB", - "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", + "branch": "main", + "git_sha": "e1f2b946b457eac191c0fa97ae1d159a15874c6b", "installed_by": ["stage_input"] }, "cleanup": { @@ -26,12 +26,17 @@ "installed_by": ["modules"] }, "prep/sample": { - "branch": "stage_input_fixB", - "git_sha": "f253d1e6d4dc5f6ac0e6440041ee7e55b8203e35", + "branch": "main", + "git_sha": "dbd4c7b18c86be15f2ca341085d90c0488545d53", "installed_by": ["stage_input"] }, + "samtools/index": { + "branch": "main", + "git_sha": "3f8cbdb457ed1b642b4f9b079850f2a92da9fcc0", + "installed_by": ["modules", "stage_input"] + }, "score/download": { - "branch": "stage_input_fixB", + "branch": "main", "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, @@ -41,7 +46,7 @@ "installed_by": ["song_score_upload"] }, "song/get": { - "branch": "stage_input_fixB", + "branch": "main", "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, @@ -59,13 +64,18 @@ "branch": "main", "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] + }, + "tabix/tabix": { + "branch": 
"main", + "git_sha": "e1f2b946b457eac191c0fa97ae1d159a15874c6b", + "installed_by": ["stage_input"] } } }, "subworkflows": { "icgc-argo-workflows": { "song_score_download": { - "branch": "stage_input_fixB", + "branch": "main", "git_sha": "92aa620385099e94401c22b8633cc55ed34ca10e", "installed_by": ["stage_input"] }, @@ -75,8 +85,8 @@ "installed_by": ["subworkflows"] }, "stage_input": { - "branch": "stage_input_fixB", - "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", + "branch": "main", + "git_sha": "e9dfe346ae3334973f406be3051a1091cad1dca6", "installed_by": ["subworkflows"] } } diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py index 4e111cd..afea641 100755 --- a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py @@ -10,7 +10,7 @@ import sys from collections import Counter from pathlib import Path - +import os logger = logging.getLogger() @@ -39,8 +39,8 @@ def __init__( sex_col = 'sex', status_col = 'status', sample_col = 'sample', - cram_col = 'cram', - crai_col = 'crai', + bam_cram_col = 'bam_cram', + bai_crai_col = 'bai_crai', experiment_col = 'experiment', genome_build_col = "genome_build", analysis_json_col = 'analysis_json', @@ -68,8 +68,8 @@ def __init__( self._sex_col = sex_col self._status_col = status_col self._sample_col = sample_col - self._cram_col = cram_col - self._crai_col = crai_col + self._bam_cram_col = bam_cram_col + self._bai_crai_col = bai_crai_col self._experiment_col = experiment_col self._genome_build_col = genome_build_col self._analysis_json_col = analysis_json_col @@ -92,13 +92,13 @@ def validate_and_transform(self, row): self._validate_patient(row) if row.get(self._patient_col) else "" self._validate_status(row) if row.get(self._status_col) else "" self._validate_sample(row) - self._validate_cram(row) - self._validate_crai(row) + self._validate_bam_cram(row) + self._validate_bai_crai(row) if row.get(self._bai_crai_col) else "" self._validate_experiment(row) if row.get(self._experiment_col) else "" self._validate_genome_build(row) if row.get(self._genome_build_col) else "" self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" - + print(row) tmp_dict={ "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_alignment", "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", @@ -106,8 +106,8 @@ def validate_and_transform(self, row): "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", "status" : row[self._status_col] if row.get(self._status_col) else "0", "sample" : row[self._sample_col], - "cram" : row[self._cram_col], - "crai" : row[self._crai_col], + "bam_cram" : row[self._bam_cram_col], + "bai_crai" : row[self._bai_crai_col] if row.get(self._bai_crai_col) else None, "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None @@ -153,21 +153,19 @@ def _validate_sample(self, row): if len(row[self._sample_col]) <= 0: raise AssertionError("'sample' input is required.") - def _validate_cram(self, row): + def _validate_bam_cram(self, row): """Assert that expected cram is correct.""" - if len(row[self._cram_col]) <= 0: - raise AssertionError("'cram' input is 
required.") - if not row[self._cram_col].endswith(".cram"): - raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + if len(row[self._bam_cram_col]) <= 0: + raise AssertionError("'bam_cram' input is required.") + if not row[self._bam_cram_col].endswith(".cram") and not row[self._bam_cram_col].endswith(".bam"): + raise AssertionError("'bam_cram' input format is incorrect, ensure file ends with '.bam' or '.cram'") - def _validate_crai(self, row): + def _validate_bai_crai(self, row): """Assert that expected crai is correct.""" - if len(row[self._crai_col]) <= 0: - raise AssertionError("'crai' input is required.") - if not row[self._crai_col].endswith(".crai"): - raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") - if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): - raise AssertionError("'cram' and 'crai' file name bodies do not match.") + if not row[self._bai_crai_col].endswith(".crai") and not row[self._bai_crai_col].endswith(".bai"): + raise AssertionError("'bai_crai' input format is incorrect, ensure file ends with '.crai' or '.bai'") + if row[self._bai_crai_col].split("/")[-1].replace(".cram.crai","").replace(".bam.bai","")!=row[self._bam_cram_col].split("/")[-1].replace(".cram","").replace(".bam",""): + raise AssertionError("'bam_cram' and 'bai_crai' file name bodies do not match.") def _validate_experiment(self, row): """Assert that expected Experiment is correct.""" @@ -195,7 +193,7 @@ def validate_unique_values(self,col): """ Assert a single unique value exists in array """ - if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + if len(set([z[col] for z in self.modified if z[col] is not None]))!=len([z[col] for z in self.modified if z[col] is not None]): raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) sys.exit(1) @@ -260,7 +258,7 @@ def check_samplesheet(file_in, file_out): analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json """ - required_columns = {"sample","cram","crai"} + required_columns = {"sample","bam_cram"} conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
@@ -280,9 +278,10 @@ def check_samplesheet(file_in, file_out): logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) - for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: - checker.validate_common_values(col) - for col in ["cram","crai"]: + #Check unnncessary for dnaalnqc + #for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + # checker.validate_common_values(col) + for col in ["bam_cram","bai_crai"]: checker.validate_unique_values(col) header = checker.modified[0].keys() diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py index caccb36..3ff44ac 100755 --- a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py @@ -10,7 +10,7 @@ import sys from collections import Counter from pathlib import Path - +import os logger = logging.getLogger() @@ -37,8 +37,8 @@ def __init__( sex_col = 'sex', status_col = 'status', sample_col = 'sample', - cram_col = 'cram', - crai_col = 'crai', + bam_cram_col = 'bam_cram', + bai_crai_col = 'bai_crai', experiment_col = 'experiment', genome_build_col = "genome_build", analysis_json_col = 'analysis_json', @@ -66,8 +66,8 @@ def __init__( self._sex_col = sex_col self._status_col = status_col self._sample_col = sample_col - self._cram_col = cram_col - self._crai_col = crai_col + self._bam_cram_col = bam_cram_col + self._bai_crai_col = bai_crai_col self._experiment_col = experiment_col self._genome_build_col = genome_build_col self._analysis_json_col = analysis_json_col @@ -90,8 +90,8 @@ def validate_and_transform(self, row): self._validate_patient(row) if row.get(self._patient_col) else "" self._validate_status(row) if row.get(self._status_col) else "" self._validate_sample(row) - self._validate_cram(row) - self._validate_crai(row) + self._validate_bam_cram(row) + self._validate_bai_crai(row) if row.get(self._bai_crai_col) else "" self._validate_experiment(row) if row.get(self._experiment_col) else "" self._validate_genome_build(row) if row.get(self._genome_build_col) else "" self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" @@ -103,8 +103,8 @@ def validate_and_transform(self, row): "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", "status" : row[self._status_col] if row.get(self._status_col) else "0", "sample" : row[self._sample_col], - "cram" : row[self._cram_col], - "crai" : row[self._crai_col], + "bam_cram" : row[self._bam_cram_col], + "bai_crai" : row[self._bai_crai_col] if row.get(self._bai_crai_col) else None, "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None @@ -150,21 +150,19 @@ def _validate_sample(self, row): if len(row[self._sample_col]) <= 0: raise AssertionError("'sample' input is required.") - def _validate_cram(self, row): + def _validate_bam_cram(self, row): """Assert that expected cram is correct.""" - if len(row[self._cram_col]) <= 0: - raise AssertionError("'cram' input is required.") - if not row[self._cram_col].endswith(".cram"): - raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + if len(row[self._bam_cram_col]) <= 0: + raise AssertionError("'bam_cram' input is required.") 
+ if not row[self._bam_cram_col].endswith(".cram") and not row[self._bam_cram_col].endswith(".bam"): + raise AssertionError("'bam_cram' input format is incorrect, ensure file ends with '.bam' or '.cram'") - def _validate_crai(self, row): + def _validate_bai_crai(self, row): """Assert that expected crai is correct.""" - if len(row[self._crai_col]) <= 0: - raise AssertionError("'crai' input is required.") - if not row[self._crai_col].endswith(".crai"): - raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") - if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): - raise AssertionError("'cram' and 'crai' file name bodies do not match.") + if not row[self._bai_crai_col].endswith(".crai") and not row[self._bai_crai_col].endswith(".bai"): + raise AssertionError("'bai_crai' input format is incorrect, ensure file ends with '.crai' or '.bai'") + if row[self._bai_crai_col].split("/")[-1].replace(".cram.crai","").replace(".bam.bai","")!=row[self._bam_cram_col].split("/")[-1].replace(".cram","").replace(".bam",""): + raise AssertionError("'bam_cram' and 'bai_crai' file name bodies do not match.") def _validate_experiment(self, row): """Assert that expected Experiment is correct.""" @@ -191,7 +189,7 @@ def validate_unique_values(self,col): """ Assert a single unique value exists in array """ - if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + if len(set([z[col] for z in self.modified if z[col] is not None]))!=len([z[col] for z in self.modified if z[col] is not None]): raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) sys.exit(1) @@ -255,7 +253,7 @@ def check_samplesheet(file_in, file_out): analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json """ - required_columns = {"sample","cram","crai"} + required_columns = {"sample","bam_cram"} conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
@@ -275,9 +273,10 @@ def check_samplesheet(file_in, file_out): logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) - for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: - checker.validate_common_values(col) - for col in ["cram","crai"]: + # Check unnecessary for gerrmlinevar + #for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + # checker.validate_common_values(col) + for col in ["bam_cram","bai_crai"]: checker.validate_unique_values(col) header = checker.modified[0].keys() diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py index f9d5704..241aa18 100755 --- a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py @@ -337,8 +337,9 @@ def check_samplesheet(file_in, file_out): logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) checker.validate_unique_fastq() - for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: - checker.validate_common_values(col) + #Check unnncessary for prealnqc + #for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + # checker.validate_common_values(col) for col in ["lane"]: checker.validate_unique_values(col) diff --git a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py index c13d3dc..56b9a61 100755 --- a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py +++ b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py @@ -334,18 +334,18 @@ def main(): elif analysis_type == 'sequencing_alignment': for fp in args.input_files: - if fp.endswith('cram'): - cram = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) - os.symlink(os.path.abspath(fp), cram) - elif fp.endswith('crai'): - crai = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) - os.symlink(os.path.abspath(fp), crai) + if fp.endswith('cram') or fp.endswith('bam'): + bam_cram = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) + os.symlink(os.path.abspath(fp), bam_cram) + elif fp.endswith('crai') or fp.endswith('bai'): + bai_crai = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) + os.symlink(os.path.abspath(fp), bai_crai) else: sys.exit("Error: not supported input file format") with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','cram','crai',"genome_build",'experiment', 'analysis_json']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, cram, crai, genome_build,experiment, metadata_json]) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','bam_cram','bai_crai',"genome_build",'experiment', 'analysis_json']) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, bam_cram, bai_crai, genome_build,experiment, metadata_json]) elif analysis_type == 'variant_calling': for fp in song_analysis['files']: diff --git a/modules/icgc-argo-workflows/samtools/index/environment.yml b/modules/icgc-argo-workflows/samtools/index/environment.yml new file mode 100644 index 0000000..a5e5064 --- /dev/null +++ b/modules/icgc-argo-workflows/samtools/index/environment.yml @@ -0,0 +1,8 @@ +name: samtools_index +channels: + - conda-forge 
+ - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/icgc-argo-workflows/samtools/index/main.nf b/modules/icgc-argo-workflows/samtools/index/main.nf new file mode 100644 index 0000000..dc14f98 --- /dev/null +++ b/modules/icgc-argo-workflows/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/icgc-argo-workflows/samtools/index/meta.yml b/modules/icgc-argo-workflows/samtools/index/meta.yml new file mode 100644 index 0000000..01a4ee0 --- /dev/null +++ b/modules/icgc-argo-workflows/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/icgc-argo-workflows/tabix/tabix/environment.yml b/modules/icgc-argo-workflows/tabix/tabix/environment.yml new file mode 100644 index 0000000..8233baa --- /dev/null +++ b/modules/icgc-argo-workflows/tabix/tabix/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "tabix_tabix" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::tabix=1.11" diff --git a/modules/icgc-argo-workflows/tabix/tabix/main.nf b/modules/icgc-argo-workflows/tabix/tabix/main.nf new file mode 100644 index 0000000..0076f98 --- /dev/null +++ b/modules/icgc-argo-workflows/tabix/tabix/main.nf @@ -0,0 +1,42 @@ +process TABIX_TABIX { + tag "$meta.id" + label 'process_single' + + conda "bioconda::tabix=1.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0': + 'biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(tab) + + output: + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + tuple val(meta), path("*.csi"), optional:true, emit: csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + tabix $args $tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${tab}.tbi + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/tabix/tabix/meta.yml b/modules/icgc-argo-workflows/tabix/tabix/meta.yml new file mode 100644 index 0000000..3c4e2e0 --- /dev/null +++ b/modules/icgc-argo-workflows/tabix/tabix/meta.yml @@ -0,0 +1,45 @@ +name: tabix_tabix +description: create tabix index from a sorted bgzip tab-delimited genome file +keywords: + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file compressed with bgzip + pattern: "*.{bed.gz,gff.gz,sam.gz,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - tbi: + type: file + description: tabix index file + pattern: "*.{tbi}" + - csi: + type: file + description: coordinate sorted index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" \ No newline at end of file diff --git a/subworkflows/icgc-argo-workflows/stage_input/main.nf b/subworkflows/icgc-argo-workflows/stage_input/main.nf index b34f760..cc66f72 100644 --- a/subworkflows/icgc-argo-workflows/stage_input/main.nf +++ b/subworkflows/icgc-argo-workflows/stage_input/main.nf @@ -2,6 +2,9 @@ include { SONG_SCORE_DOWNLOAD } from '../../icgc-argo-workflows/song_score_download/main' include { PREP_SAMPLE } from '../../../modules/icgc-argo-workflows/prep/sample/main' include { CHECKINPUT } from '../../../modules/icgc-argo-workflows/checkinput/main' +include { SAMTOOLS_INDEX as BAM_INDEX } from '../../../modules/icgc-argo-workflows/samtools/index/main' +include { SAMTOOLS_INDEX as CRAM_INDEX } from '../../../modules/icgc-argo-workflows/samtools/index/main' +include { TABIX_TABIX } from '../../../modules/icgc-argo-workflows/tabix/tabix/main' workflow STAGE_INPUT { @@ -55,8 +58,7 @@ workflow STAGE_INPUT { exit 1, "When no API_TOKEN is provided, a local samplesheet must be provided." } } - //Collect meta,data files and analysis_json - //Two channels for meta,files and meta,analysis_json will be refined afterwards + //Collect meta,data files and analysis_json from new samplesheet.csv and handle approrpiately analysis_input .collectFile(keepHeader: true, name: 'sample_sheet.csv') .splitCsv(header:true) @@ -76,7 +78,7 @@ workflow STAGE_INPUT { experiment:row.experiment, single_end : row.single_end.toBoolean() ], - [file(row.fastq_1), file(row.fastq_2)], + [file(row.fastq_1,checkIfExists: true), file(row.fastq_2,checkIfExists: true)], row.analysis_json ) } else if (row.analysis_type == "sequencing_experiment" && row.single_end.toLowerCase() == 'true') { @@ -94,7 +96,7 @@ workflow STAGE_INPUT { experiment:row.experiment, single_end : row.single_end.toBoolean() ], - [file(row.fastq_1)], + [file(row.fastq_1,checkIfExists: true)], row.analysis_json ) } else if (row.analysis_type == "sequencing_alignment") { @@ -108,8 +110,8 @@ workflow STAGE_INPUT { status:row.status.toInteger(), genome_build:row.genome_build, experiment:row.experiment, - data_type:'cram'], - [file(row.cram), file(row.crai)], + data_type: "${row.bam_cram}".replaceAll(/^.*\./,"").toLowerCase()], + [file(row.bam_cram,checkIfExists: true), row.bai_crai], row.analysis_json ) } @@ -126,7 +128,7 @@ workflow STAGE_INPUT { genome_build:row.genome_build, experiment:row.experiment, data_type:'vcf'], - [file(row.vcf), file(row.tbi)], + [file(row.vcf,checkIfExists: true), row.tbi], row.analysis_json ) } @@ -143,28 +145,17 @@ workflow STAGE_INPUT { genome_build:row.genome_build, experiment:row.experiment, data_type:'tgz'], - [file(row.qc_file)], + [file(row.qc_file,checkIfExists: true)], row.analysis_json ) } } - .set { ch_input_sample } + .set {ch_input_sample} - //We want to still have meta when analysis_json doesn't exist - ch_input_sample.map{ meta,files,analysis -> - if (analysis){ - tuple([meta,file(analysis)]) - } else { - tuple([meta,null]) - } - } - .unique{it[1]} - .set{ ch_meta_analysis } - - //Reorganize files as "sequencing_experiment expected input is tuple while other types are flat" + //Reorganize files as flat tuple except "sequencing_experiment 
ch_input_sample.map{ meta,files,analysis -> if (meta.analysis_type == "sequencing_experiment"){ - tuple([meta,files]) + tuple([meta,files]) //tuple([meta,[read1,read2]]) } else if (meta.analysis_type == "sequencing_alignment") { tuple([meta,files[0],files[1]]) } else if (meta.analysis_type == "variant_calling") { @@ -172,12 +163,63 @@ workflow STAGE_INPUT { } else if (meta.analysis_type == "qc_metrics") { tuple([meta,files[0]]) } - }.set{ch_meta_files} + }.branch{ //identify files that require indexing + bam_to_index : it[0].analysis_type=='sequencing_alignment' && it[2].isEmpty() && it[0].data_type=='bam' + return tuple([it[0],it[1]]) + cram_to_index : it[0].analysis_type=='sequencing_alignment' && it[2].isEmpty() && it[0].data_type=='cram' + return tuple([it[0],it[1]]) + vcf_to_index : it[0].analysis_type=='variant_calling' && it[2].isEmpty() + return tuple([it[0],it[1]]) + indexed : (it[0].analysis_type=='sequencing_alignment' && ! it[2].isEmpty()) | (it[0].analysis_type=='variant_calling' && ! it[2].isEmpty()) + return tuple([it[0],it[1],it[2]]) + others: (it[0].analysis_type=='sequencing_experiment') | (it[0].analysis_type=='qc_metrics') + return tuple([it[0],it[1]]) + }.set{ch_index_split} + + + //Perform indexiing + BAM_INDEX(ch_index_split.bam_to_index) + CRAM_INDEX(ch_index_split.cram_to_index) + TABIX_TABIX(ch_index_split.vcf_to_index) + + + //Combine BAM and BAI into single channel + ch_index_split.bam_to_index.join(BAM_INDEX.out.bai) //[meta,bam,bai] + .set{indexed_bam} + + //Combine CRAM and CRAI into single channel + ch_index_split.cram_to_index.join(CRAM_INDEX.out.crai) //[meta,cram,crai] + .set{indexed_cram} + + //Combine VCF and TBI into single channel + ch_index_split.vcf_to_index.join(TABIX_TABIX.out.tbi) //[meta,vcf,tbi] + .set{indexed_vcf} + + //Combine newly indexed files, previously indexed and others into single channel + Channel.empty() + .mix(indexed_bam) + .mix(indexed_cram) + .mix(indexed_vcf) + .mix(ch_index_split.indexed) + .mix(ch_index_split.others) + .set{ch_meta_files} + + + //We want to still have meta when analysis_json doesn't exist + ch_input_sample.map{ meta,files,analysis -> + if (analysis){ + tuple([meta,file(analysis,checkIfExists: true)]) + } else { + tuple([meta,null]) + } + } + .unique{it[1]} + .set{ ch_meta_analysis } emit: meta_analysis = ch_meta_analysis // channel: [ val(meta), analysis_json] meta_files = ch_meta_files // channel: [ val(meta), [ files ] ] - upRdpc = upRdpc_flag + upRdpc = upRdpc_flag // [boolean] versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file From f7348dc95b74aab9c2a9dc6c7a61a00b1f08a066 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Mon, 15 Apr 2024 16:49:25 -0400 Subject: [PATCH 5/6] fix the prefix of the path to the asset files --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index bc75b89..a0be0a6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,5 +21,5 @@ params { // input data local_mode = true - input = 'assets/tests/csv/sample_sheet.csv' + input = "${projectDir}/assets/tests/csv/sample_sheet.csv" } From aeed6c4e8795ca36575efeb8436d3821d1292c15 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Tue, 23 Apr 2024 16:31:07 -0400 Subject: [PATCH 6/6] add one test case for running workflow in rdpc --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 14704d2..6031aed 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,12 @@ The pipeline is built using 
[Nextflow](https://www.nextflow.io), a workflow tool ``` 5. Start running your own analysis! + + If you are getting the input data from & sending output data to the ICGC-ARGO data center, and you have a valid `api_token`, you can run the workflow with: + ```bash + nextflow run icgc-argo-workflows/prealnqc -profile <docker/singularity>,standard --api_token <api_token> --study_id <study_id> --analysis_ids <analysis_ids> + ``` + + Otherwise, you can provide the path to the input data in `samplesheet.csv` and run the workflow with: ```bash nextflow run icgc-argo-workflows/prealnqc -profile standard --input samplesheet.csv --outdir <outdir> ```
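
For the local `--input` mode, the bundled test configuration gives a quick way to exercise the pipeline end to end. The command below is only a sketch, not part of these patches: it assumes `conf/test.config` is exposed as a `test` profile in `nextflow.config`, and the output directory name is illustrative.

```bash
# Sketch of a local-mode smoke test (assumes conf/test.config is wired up as the `test` profile).
# That config sets local_mode = true and points --input at
# ${projectDir}/assets/tests/csv/sample_sheet.csv, as updated in PATCH 5/6.
nextflow run icgc-argo-workflows/prealnqc \
    -profile test,standard \
    --outdir results
```

Because the test profile already supplies `--input`, no sample sheet path needs to be passed on the command line for this check; a real analysis would instead follow one of the two invocations shown above.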