From 21883f11be21aa27fcdd55795b4731ff815fc52a Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Tue, 30 Jan 2024 16:54:50 -0500 Subject: [PATCH 1/6] update stage_input fixes --- assets/tests/csv/sample_sheet.csv | 8 +- conf/modules.config | 6 + conf/test_rdpc_qa.config | 2 +- modules.json | 41 ++-- modules/icgc-argo-workflows/cleanup/main.nf | 7 +- .../prep/sample/resources/usr/bin/main.py | 28 ++- .../score/download/main.nf | 6 +- .../icgc-argo-workflows/score/upload/main.nf | 5 +- modules/icgc-argo-workflows/song/get/main.nf | 4 +- .../icgc-argo-workflows/song/manifest/main.nf | 5 +- .../icgc-argo-workflows/song/publish/main.nf | 5 +- .../icgc-argo-workflows/song/submit/main.nf | 4 +- .../custom/dumpsoftwareversions/main.nf | 6 +- .../custom/dumpsoftwareversions/meta.yml | 7 +- .../templates/dumpsoftwareversions.py | 3 +- modules/nf-core/cutadapt/main.nf | 19 +- modules/nf-core/cutadapt/meta.yml | 5 +- modules/nf-core/fastqc/main.nf | 18 +- modules/nf-core/fastqc/meta.yml | 5 + modules/nf-core/multiqc/main.nf | 10 +- modules/nf-core/multiqc/meta.yml | 11 +- nextflow.config | 5 +- .../icgc-argo-workflows/stage_input/main.nf | 217 +++++++++++------- .../icgc-argo-workflows/stage_input/meta.yml | 5 +- workflows/prealnqc.nf | 122 +++++----- 25 files changed, 323 insertions(+), 231 deletions(-) diff --git a/assets/tests/csv/sample_sheet.csv b/assets/tests/csv/sample_sheet.csv index 2401860..c90c857 100644 --- a/assets/tests/csv/sample_sheet.csv +++ b/assets/tests/csv/sample_sheet.csv @@ -1,4 +1,4 @@ -sample,lane,fastq_1,fastq_2 -TEST,C0HVY.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r2.fq.gz -TEST,D0RE2.1,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r2.fq.gz -TEST,D0RH0.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r2.fq.gz \ No newline at end of file +sample,lane,fastq_1,fastq_2,read_group_count,single_end +TEST,C0HVY.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/C0HVY.2_r2.fq.gz,3,False +TEST,D0RE2.1,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RE2.1_r2.fq.gz,3,False +TEST,D0RH0.2,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r1.fq.gz,https://raw.githubusercontent.com/icgc-argo-workflows/dna-seq-processing-wfs/main/tests/data/D0RH0.2_r2.fq.gz,3,False \ No newline at end of file diff --git a/conf/modules.config b/conf/modules.config index c48f648..c595d00 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,5 +32,11 @@ process { ] } + withName: 'SONG.*|SCORE.*' { + ext.prefix = "" + ext.api_download_token = params.api_download_token ?: params.api_token + ext.api_upload_token = params.api_upload_token ?: params.api_token + } + } diff --git a/conf/test_rdpc_qa.config b/conf/test_rdpc_qa.config index d175d31..1d9d487 100644 
--- a/conf/test_rdpc_qa.config +++ b/conf/test_rdpc_qa.config @@ -22,6 +22,6 @@ params { // Input data for rdpc mode local_mode = false study_id = "TEST-QA" - analysis_ids = "875ef550-e536-4456-9ef5-50e5362456df" + analysis_ids = "875ef550-e536-4456-9ef5-50e5362456df,9bb63c49-86c8-44e2-b63c-4986c804e274" } diff --git a/modules.json b/modules.json index 5487e51..dd1f6b2 100644 --- a/modules.json +++ b/modules.json @@ -5,9 +5,14 @@ "https://github.com/icgc-argo-workflows/argo-modules.git": { "modules": { "icgc-argo-workflows": { + "checkinput": { + "branch": "stage_input_fixB", + "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", + "installed_by": ["stage_input"] + }, "cleanup": { "branch": "main", - "git_sha": "517b0a82ed697564891e18ff5dba35a70f9da225", + "git_sha": "8d014598ef81d65bece3684bd67aef7afae2cda9", "installed_by": ["modules"] }, "payload/qcmetrics": { @@ -21,38 +26,38 @@ "installed_by": ["modules"] }, "prep/sample": { - "branch": "main", - "git_sha": "4ae27e792724f69f7211db10bcd9e3373abc1837", + "branch": "stage_input_fixB", + "git_sha": "f253d1e6d4dc5f6ac0e6440041ee7e55b8203e35", "installed_by": ["stage_input"] }, "score/download": { - "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "branch": "stage_input_fixB", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, "score/upload": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] }, "song/get": { - "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "branch": "stage_input_fixB", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, "song/manifest": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] }, "song/publish": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] }, "song/submit": { "branch": "main", - "git_sha": "f5e2d027a4f886a8702f5c4be825801513b578d0", + "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] } } @@ -60,7 +65,7 @@ "subworkflows": { "icgc-argo-workflows": { "song_score_download": { - "branch": "main", + "branch": "stage_input_fixB", "git_sha": "92aa620385099e94401c22b8633cc55ed34ca10e", "installed_by": ["stage_input"] }, @@ -70,8 +75,8 @@ "installed_by": ["subworkflows"] }, "stage_input": { - "branch": "main", - "git_sha": "4ae27e792724f69f7211db10bcd9e3373abc1837", + "branch": "stage_input_fixB", + "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", "installed_by": ["subworkflows"] } } @@ -82,22 +87,22 @@ "nf-core": { "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", "installed_by": ["modules"] }, "cutadapt": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "07881e42938b4f0070e864b45d424b01745bc3a4", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "c9488585ce7bd35ccd2a30faa2371454c8112fb9", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + 
"git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", "installed_by": ["modules"] } } diff --git a/modules/icgc-argo-workflows/cleanup/main.nf b/modules/icgc-argo-workflows/cleanup/main.nf index 2f505c9..815f1a7 100644 --- a/modules/icgc-argo-workflows/cleanup/main.nf +++ b/modules/icgc-argo-workflows/cleanup/main.nf @@ -3,9 +3,10 @@ process CLEANUP { label 'process_low' conda "conda-forge::coreutils=9.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + container "${ workflow.containerEngine == 'singularity' ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'docker.io/ubuntu:20.04'}" + //container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + // 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + // 'ubuntu:20.04' }" input: path files_to_delete // more accurately, other non-hidden files in the same folder will be deleted as well diff --git a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py index 7dc42e7..c13d3dc 100755 --- a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py +++ b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py @@ -262,9 +262,16 @@ def main(): specimen_type = song_analysis['samples'][0]['specimen']['specimenType'] tumour_normal_designation = song_analysis['samples'][0]['specimen']['tumourNormalDesignation'] status = '0' if tumour_normal_designation == 'Normal' else '1' + + if song_analysis.get('workflow'): + genome_build = song_analysis['workflow']["genome_build"] + else: + genome_build = None + analysis_type = song_analysis['analysisType']['name'] output_sample_sheet = f'{args.outdir}/{sample_id}_{analysis_type}_sample_sheet.csv' - + experiment=song_analysis['experiment']['experimental_strategy'] + sample_sheet = dict() if analysis_type == 'sequencing_experiment': @@ -317,13 +324,13 @@ def main(): if rgs_missed_lane: # throw error here if that happens sys.exit("Error: no lane BAM has been generated for some read groups: '%s'. " "Please make sure supplied sequencing files and metadata are correct." 
% "', '".join(rgs_missed_lane)) - + with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','lane','fastq_1','fastq_2','read_group','single_end','read_group_count','analysis_json']) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','lane','fastq_1','fastq_2','read_group','single_end','read_group_count',"experiment", 'analysis_json']) for k,v in sample_sheet.items(): single_end = True if v['file_r2'] == 'No_File' else False - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, k, v['file_r1'], v['file_r2'], v['read_group'], single_end, read_group_count, metadata_json]) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, k, v['file_r1'], v['file_r2'], v['read_group'], single_end, read_group_count,experiment, metadata_json]) elif analysis_type == 'sequencing_alignment': for fp in args.input_files: @@ -337,8 +344,8 @@ def main(): sys.exit("Error: not supported input file format") with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','cram','crai', 'analysis_json']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, cram, crai, metadata_json]) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','cram','crai',"genome_build",'experiment', 'analysis_json']) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, cram, crai, genome_build,experiment, metadata_json]) elif analysis_type == 'variant_calling': for fp in song_analysis['files']: @@ -355,13 +362,13 @@ def main(): sys.exit("Error: not supported input file format") with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','sample','variantcaller','vcf','tbi', 'analysis_json']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, sample_id, variantcaller, vcf, tbi, metadata_json]) + csvwriter.writerow(['analysis_type','study_id','patient','sex','sample','variantcaller','vcf','tbi',"genome_build",'experiment', 'analysis_json']) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, sample_id, variantcaller, vcf, tbi ,genome_build,experiment, metadata_json]) elif analysis_type == 'qc_metrics': with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','qc_tools','qc_file', 'analysis_json']) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','qc_tools','qc_file',"genome_build", 'experiment','analysis_json']) for fp in args.input_files: for fq in song_analysis['files']: @@ -370,8 +377,7 @@ def main(): os.symlink(os.path.abspath(fp), qc_file) qc_tools = ','.join(fq['info']['analysis_tools']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, qc_tools, qc_file, metadata_json]) - + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, qc_tools, qc_file, genome_build, experiment, metadata_json]) if __name__ == "__main__": main() diff --git a/modules/icgc-argo-workflows/score/download/main.nf b/modules/icgc-argo-workflows/score/download/main.nf index 87d0c50..792bc31 100644 --- 
a/modules/icgc-argo-workflows/score/download/main.nf +++ b/modules/icgc-argo-workflows/score/download/main.nf @@ -1,5 +1,3 @@ - - process SCORE_DOWNLOAD { tag "${analysis_id}" label 'process_medium' @@ -32,7 +30,7 @@ process SCORE_DOWNLOAD { def score_url = params.score_url_download ?: params.score_url def transport_parallel = params.transport_parallel ?: task.cpus def transport_mem = params.transport_mem ?: "2" - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_download_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.score_container_version ?: '5.8.1' """ export METADATA_URL=${song_url} @@ -48,4 +46,4 @@ process SCORE_DOWNLOAD { score-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/score/upload/main.nf b/modules/icgc-argo-workflows/score/upload/main.nf index b4aa4a4..db22f3e 100644 --- a/modules/icgc-argo-workflows/score/upload/main.nf +++ b/modules/icgc-argo-workflows/score/upload/main.nf @@ -1,5 +1,4 @@ - process SCORE_UPLOAD { tag "${analysis_id}" label 'process_medium' @@ -29,7 +28,7 @@ process SCORE_UPLOAD { def score_url = params.score_url_upload ?: params.score_url def transport_parallel = params.transport_parallel ?: task.cpus def transport_mem = params.transport_mem ?: "2" - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.score_container_version ?: '5.8.1' """ export METADATA_URL=${song_url} @@ -45,4 +44,4 @@ process SCORE_UPLOAD { score-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/get/main.nf b/modules/icgc-argo-workflows/song/get/main.nf index 22e0cb3..b476dd0 100644 --- a/modules/icgc-argo-workflows/song/get/main.nf +++ b/modules/icgc-argo-workflows/song/get/main.nf @@ -26,7 +26,7 @@ process SONG_GET { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${analysis_id}" def song_url = params.song_url_download ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_download_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.song_container_version ?: '5.0.2' """ export CLIENT_SERVER_URL=${song_url} @@ -40,4 +40,4 @@ process SONG_GET { song-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/manifest/main.nf b/modules/icgc-argo-workflows/song/manifest/main.nf index 3691416..49a209f 100644 --- a/modules/icgc-argo-workflows/song/manifest/main.nf +++ b/modules/icgc-argo-workflows/song/manifest/main.nf @@ -1,4 +1,3 @@ - process SONG_MANIFEST { tag "${analysis_id}" label 'process_single' @@ -26,7 +25,7 @@ process SONG_MANIFEST { script: def args = task.ext.args ?: '' def song_url = params.song_url_upload ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.song_container_version ?: '5.0.2' def study_id = "${meta.study_id}" """ @@ -41,4 +40,4 @@ process SONG_MANIFEST { song-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/publish/main.nf b/modules/icgc-argo-workflows/song/publish/main.nf index 903564a..ef92c7d 100644 --- a/modules/icgc-argo-workflows/song/publish/main.nf +++ 
b/modules/icgc-argo-workflows/song/publish/main.nf @@ -1,4 +1,3 @@ - process SONG_PUBLISH { tag "${analysis_id}" label 'process_single' @@ -25,7 +24,7 @@ process SONG_PUBLISH { script: def args = task.ext.args ?: '' def song_url = params.song_url_upload ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def study_id = "${meta.study_id}" def VERSION = params.song_container_version ?: '5.0.2' """ @@ -40,4 +39,4 @@ process SONG_PUBLISH { song-client: ${VERSION} END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/song/submit/main.nf b/modules/icgc-argo-workflows/song/submit/main.nf index c948338..4d63d92 100644 --- a/modules/icgc-argo-workflows/song/submit/main.nf +++ b/modules/icgc-argo-workflows/song/submit/main.nf @@ -27,7 +27,7 @@ process SONG_SUBMIT { script: def args = task.ext.args ?: '' def song_url = params.song_url_upload ?: params.song_url - def accessToken = params.api_token ?: "`cat /tmp/rdpc_secret/secret`" + def accessToken = task.ext.api_upload_token ?: "`cat /tmp/rdpc_secret/secret`" def VERSION = params.song_container_version ?: '5.0.2' def study_id = "${meta.study_id}" """ @@ -44,4 +44,4 @@ process SONG_SUBMIT { END_VERSIONS """ -} +} \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index 3df2176..f218761 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.13" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index 60b546a..5f15a5f 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,7 +1,9 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: @@ -14,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -28,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index e55b8d4..da03340 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,11 +4,10 @@ """Provide functions to merge multiple versions.yml files.""" +import yaml import platform from textwrap import dedent -import yaml - def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" diff --git a/modules/nf-core/cutadapt/main.nf b/modules/nf-core/cutadapt/main.nf index dd030a6..e232a70 100644 --- a/modules/nf-core/cutadapt/main.nf +++ b/modules/nf-core/cutadapt/main.nf @@ -1,11 +1,11 @@ process CUTADAPT { tag "$meta.id" - label 'process_low' + label 'process_medium' - conda "bioconda::cutadapt=3.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/cutadapt:3.4--py39h38f01e4_1' : - 'quay.io/biocontainers/cutadapt:3.4--py39h38f01e4_1' }" + 'biocontainers/cutadapt:3.4--py39h38f01e4_1' }" input: tuple val(meta), path(reads) @@ -34,4 +34,17 @@ process CUTADAPT { cutadapt: \$(cutadapt --version) END_VERSIONS """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def trimmed = meta.single_end ? "${prefix}.trim.fastq.gz" : "${prefix}_1.trim.fastq.gz ${prefix}_2.trim.fastq.gz" + """ + touch ${prefix}.cutadapt.log + touch ${trimmed} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ } diff --git a/modules/nf-core/cutadapt/meta.yml b/modules/nf-core/cutadapt/meta.yml index bcfe291..5ecfe27 100644 --- a/modules/nf-core/cutadapt/meta.yml +++ b/modules/nf-core/cutadapt/meta.yml @@ -10,7 +10,7 @@ tools: description: | Cutadapt finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. 
documentation: https://cutadapt.readthedocs.io/en/stable/index.html - doi: DOI:10.14806/ej.17.1.200 + doi: 10.14806/ej.17.1.200 licence: ["MIT"] input: - meta: @@ -44,3 +44,6 @@ output: authors: - "@drpatelh" - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 0699836..9e19a74 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -1,11 +1,11 @@ process FASTQC { tag "$meta.id" - label 'process_low' + label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'quay.io/biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) @@ -29,11 +29,15 @@ process FASTQC { printf "%s %s\\n" $rename_to | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done - fastqc $args --threads $task.cpus $renamed_files + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ @@ -45,7 +49,7 @@ process FASTQC { cat <<-END_VERSIONS > versions.yml "${task.process}": - fastqc: \$( fastqc --version | sed -e "s/FastQC v//g" ) + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) END_VERSIONS """ } diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5..ee5507e 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 68f66be..1b9f7c4 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.13" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" @@ -25,12 +25,14 @@ process MULTIQC { def args = task.ext.args ?: '' def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' """ multiqc \\ --force \\ $args \\ $config \\ $extra_config \\ + $logo \\ . 
cat <<-END_VERSIONS > versions.yml @@ -41,7 +43,7 @@ process MULTIQC { stub: """ - touch multiqc_data + mkdir multiqc_data touch multiqc_plots touch multiqc_report.html diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ebc29b2..45a9bc3 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,4 +1,4 @@ -name: MultiQC +name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC @@ -12,7 +12,6 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] - input: - multiqc_files: type: file @@ -30,14 +29,13 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" - output: - report: type: file description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: @@ -53,3 +51,8 @@ authors: - "@bunop" - "@drpatelh" - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index cf6cf9b..0c74f62 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { local_mode = false study_id = null analysis_ids = null + cleanup = true // rdpc data staging tempdir = null @@ -22,14 +23,16 @@ params { score_url = null song_url_download = null score_url_download = null + api_download_token = null song_url_upload = null score_url_upload = null + api_upload_token = null transport_parallel = null transport_mem = null song_container = "ghcr.io/overture-stack/song-client" song_container_version = "5.0.2" score_container = "ghcr.io/overture-stack/score" - score_container_version = "5.9.0" + score_container_version = "5.10.0" // Boilerplate options outdir = null diff --git a/subworkflows/icgc-argo-workflows/stage_input/main.nf b/subworkflows/icgc-argo-workflows/stage_input/main.nf index d351874..b34f760 100644 --- a/subworkflows/icgc-argo-workflows/stage_input/main.nf +++ b/subworkflows/icgc-argo-workflows/stage_input/main.nf @@ -1,130 +1,183 @@ include { SONG_SCORE_DOWNLOAD } from '../../icgc-argo-workflows/song_score_download/main' -include { PREP_SAMPLE } from '../../../modules/icgc-argo-workflows/prep/sample/main.nf' +include { PREP_SAMPLE } from '../../../modules/icgc-argo-workflows/prep/sample/main' +include { CHECKINPUT } from '../../../modules/icgc-argo-workflows/checkinput/main' workflow STAGE_INPUT { take: - study_analysis // channel: study_id, analysis_id - + study_id // channel: study_id + analysis_ids // channel: analysis_ids + samplesheet // channel: samplesheet + main: ch_versions = Channel.empty() - SONG_SCORE_DOWNLOAD( study_analysis ) - ch_versions = ch_versions.mix(SONG_SCORE_DOWNLOAD.out.versions) + //If local_mode is specified do not upload To RDPC + if (params.local_mode){ + upRdpc_flag=false + } else { + //Otherwise only upload to RDPC is API_Token is present + if (params.api_token || params.api_upload_token){ + upRdpc_flag=true + } else { + upRdpc_flag=false + } + } + + //Apply appropriate action if API_TOKEN is supplied + if (params.api_token || params.api_download_token){ + //If IDs are present proceed with download otherwise exit + if (study_id && analysis_ids){ + + Channel.from(analysis_ids.split(",")) + .map{analysis_id -> tuple([study_id,analysis_id])} + .set{ch_study_analysis} + + SONG_SCORE_DOWNLOAD( ch_study_analysis ) + ch_versions = ch_versions.mix(SONG_SCORE_DOWNLOAD.out.versions) 
- PREP_SAMPLE ( SONG_SCORE_DOWNLOAD.out.analysis_files ) - ch_versions = ch_versions.mix(PREP_SAMPLE.out.versions) + PREP_SAMPLE ( SONG_SCORE_DOWNLOAD.out.analysis_files ) + ch_versions = ch_versions.mix(PREP_SAMPLE.out.versions) - PREP_SAMPLE.out.sample_sheet_csv + analysis_input = PREP_SAMPLE.out.sample_sheet_csv + } else { + exit 1, "Using using API_Token, both a study_id and analysis_ids must be specified." + } + } else { + //If no API_Token, check for local samplesheet + if (samplesheet){ + CHECKINPUT(file(samplesheet,checkIfExists: true),workflow.Manifest.name) + ch_versions = ch_versions.mix(CHECKINPUT.out.versions) + + analysis_input = CHECKINPUT.out.csv + } else { + exit 1, "When no API_TOKEN is provided, a local samplesheet must be provided." + } + } + //Collect meta,data files and analysis_json + //Two channels for meta,files and meta,analysis_json will be refined afterwards + analysis_input .collectFile(keepHeader: true, name: 'sample_sheet.csv') .splitCsv(header:true) .map{ row -> - if (row.analysis_type == "sequencing_experiment") { - tuple([ - id:"${row.sample}-${row.lane}".toString(), - study_id:row.study_id, - patient:row.patient, - sex:row.sex, - status:row.status.toInteger(), - sample:row.sample, - read_group:row.read_group.toString(), - data_type:'fastq', - size:1, - numLanes:row.read_group_count], - [file(row.fastq_1), file(row.fastq_2)]) - } - else if (row.analysis_type == "sequencing_alignment") { + if (row.analysis_type == "sequencing_experiment" && row.single_end.toLowerCase() == 'false') { + tuple([ + analysis_type : row.analysis_type, + id:"${row.sample}-${row.lane}".toString(), + study_id:row.study_id, + patient:row.patient, + sex:row.sex, + status:row.status.toInteger(), + sample:row.sample, + read_group:row.read_group.toString(), + data_type:'fastq', + numLanes:row.read_group_count, + experiment:row.experiment, + single_end : row.single_end.toBoolean() + ], + [file(row.fastq_1), file(row.fastq_2)], + row.analysis_json + ) + } else if (row.analysis_type == "sequencing_experiment" && row.single_end.toLowerCase() == 'true') { + tuple([ + analysis_type : row.analysis_type, + id:"${row.sample}-${row.lane}".toString(), + study_id:row.study_id, + patient:row.patient, + sex:row.sex, + status:row.status.toInteger(), + sample:row.sample, + read_group:row.read_group.toString(), + data_type:'fastq', + numLanes:row.read_group_count, + experiment:row.experiment, + single_end : row.single_end.toBoolean() + ], + [file(row.fastq_1)], + row.analysis_json + ) + } else if (row.analysis_type == "sequencing_alignment") { tuple([ + analysis_type : row.analysis_type, id:"${row.sample}".toString(), study_id:row.study_id, patient:row.patient, sample:row.sample, sex:row.sex, - status:row.status.toInteger(), + status:row.status.toInteger(), + genome_build:row.genome_build, + experiment:row.experiment, data_type:'cram'], - file(row.cram), file(row.crai)) + [file(row.cram), file(row.crai)], + row.analysis_json + ) } else if (row.analysis_type == "variant_calling") { tuple([ + analysis_type : row.analysis_type, id:"${row.sample}".toString(), study_id:row.study_id, patient:row.patient, - sample:row.sample, + sample:row.sample, + sex:row.sex, + status:row.status.toInteger(), variantcaller:row.variantcaller, - data_type:'vcf'], file(row.vcf), file(row.tbi)) + genome_build:row.genome_build, + experiment:row.experiment, + data_type:'vcf'], + [file(row.vcf), file(row.tbi)], + row.analysis_json + ) } else if (row.analysis_type == "qc_metrics") { tuple([ + analysis_type : row.analysis_type, 
id:"${row.sample}".toString(), study_id:row.study_id, patient:row.patient, sample:row.sample, sex:row.sex, status:row.status.toInteger(), - qc_tools:row.qc_tools, - data_type:'tgz'], file(row.qc_file)) + qc_tools:row.qc_tools, + genome_build:row.genome_build, + experiment:row.experiment, + data_type:'tgz'], + [file(row.qc_file)], + row.analysis_json + ) } } .set { ch_input_sample } - PREP_SAMPLE.out.sample_sheet_csv - .collectFile(keepHeader: true) - .splitCsv(header:true) - .map{ row -> - if (row.analysis_type == "sequencing_experiment") { - tuple([ - id:"${row.sample}-${row.lane}".toString(), - study_id:row.study_id, - patient:row.patient, - sex:row.sex, - status:row.status.toInteger(), - sample:row.sample, - read_group:row.read_group.toString(), - data_type:'json', - size:1, - numLanes:row.read_group_count], - file(row.analysis_json)) - } - else if (row.analysis_type == "sequencing_alignment") { - tuple([ - id:"${row.sample}".toString(), - study_id:row.study_id, - patient:row.patient, - sample:row.sample, - sex:row.sex, - status:row.status.toInteger(), - data_type:'json'], - file(row.analysis_json)) - } - else if (row.analysis_type == "variant_calling") { - tuple([ - id:"${row.sample}".toString(), - study_id:row.study_id, - patient:row.patient, - sample:row.sample, - variantcaller:row.variantcaller, - data_type:'json'], file(row.analysis_json)) - } - else if (row.analysis_type == "qc_metrics") { - tuple([ - id:"${row.sample}".toString(), - study_id:row.study_id, - patient:row.patient, - sample:row.sample, - sex:row.sex, - status:row.status.toInteger(), - qc_tools:row.qc_tools, - data_type:'json'], file(row.analysis_json)) + //We want to still have meta when analysis_json doesn't exist + ch_input_sample.map{ meta,files,analysis -> + if (analysis){ + tuple([meta,file(analysis)]) + } else { + tuple([meta,null]) } } - .set { ch_meta_analysis } + .unique{it[1]} + .set{ ch_meta_analysis } + + //Reorganize files as "sequencing_experiment expected input is tuple while other types are flat" + ch_input_sample.map{ meta,files,analysis -> + if (meta.analysis_type == "sequencing_experiment"){ + tuple([meta,files]) + } else if (meta.analysis_type == "sequencing_alignment") { + tuple([meta,files[0],files[1]]) + } else if (meta.analysis_type == "variant_calling") { + tuple([meta,files[0],files[1]]) + } else if (meta.analysis_type == "qc_metrics") { + tuple([meta,files[0]]) + } + }.set{ch_meta_files} emit: - analysis_json = SONG_SCORE_DOWNLOAD.out.analysis_json // channel: [ analysis_json ] - meta_analysis = ch_meta_analysis // channel: [ val(meta), analysis_json] - sample_files = ch_input_sample // channel: [ val(meta), [ files ] ] - input_files = SONG_SCORE_DOWNLOAD.out.files // channel: [files] + meta_analysis = ch_meta_analysis // channel: [ val(meta), analysis_json] + meta_files = ch_meta_files // channel: [ val(meta), [ files ] ] + upRdpc = upRdpc_flag versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file diff --git a/subworkflows/icgc-argo-workflows/stage_input/meta.yml b/subworkflows/icgc-argo-workflows/stage_input/meta.yml index 9c565cc..792a5ea 100644 --- a/subworkflows/icgc-argo-workflows/stage_input/meta.yml +++ b/subworkflows/icgc-argo-workflows/stage_input/meta.yml @@ -9,7 +9,8 @@ keywords: modules: - song/get - score/download - - prep_sample + - prep/sample + - checkinput input: - study_id: type: string @@ -38,4 +39,4 @@ output: pattern: "versions.yml" authors: - "@lindaxiang" - \ No newline at end of file + diff --git a/workflows/prealnqc.nf 
b/workflows/prealnqc.nf index 75ae34b..d5f1cca 100644 --- a/workflows/prealnqc.nf +++ b/workflows/prealnqc.nf @@ -28,7 +28,7 @@ ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.mu IMPORT LOCAL MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { INPUT_CHECK } from '../subworkflows/local/input_check' +// include { INPUT_CHECK } from '../subworkflows/local/input_check' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,32 +61,16 @@ workflow PREALNQC { ch_versions = Channel.empty() - // Read in samplesheet, validate and stage input files - if (params.local_mode) { - if (params.input) { - ch_input = Channel.fromPath(params.input) - ch_input_sample = INPUT_CHECK (ch_input).reads - } - else { exit 1, 'Input samplesheet must be specified for local mode!' } - } else if (params.study_id && params.analysis_ids) { - ch_study = Channel.of(params.study_id) - ch_analysis_ids = Channel.fromList(params.analysis_ids.split(',') as List) - ch_input = ch_study.combine(ch_analysis_ids) - - STAGE_INPUT(ch_input) - ch_input_sample = STAGE_INPUT.out.sample_files - ch_metadata = STAGE_INPUT.out.meta_analysis - ch_versions = ch_versions.mix(STAGE_INPUT.out.versions) - - } else { exit 1, 'study_id & analysis_ids must be specified for rdpc mode!' } - + // Stage input files + STAGE_INPUT(params.study_id, params.analysis_ids, params.input) + ch_versions = ch_versions.mix(STAGE_INPUT.out.versions) // MODULE: Run FastQC - FASTQC( ch_input_sample ) + FASTQC( STAGE_INPUT.out.meta_files ) ch_versions = ch_versions.mix(FASTQC.out.versions) - // MODULE: Perform cutadpat - CUTADAPT( ch_input_sample ) + // // MODULE: Perform cutadpat + CUTADAPT( STAGE_INPUT.out.meta_files ) ch_versions = ch_versions.mix(CUTADAPT.out.versions) // Gather QC files @@ -109,7 +93,7 @@ workflow PREALNQC { // Group the QC files by sampleId ch_qc_files .transpose() - .map { meta, files -> [[id: meta.sample], files] } + .map { meta, files -> [[id: meta.sample, study_id: meta.study_id], files] } .groupTuple() .set{ ch_meta_qcfiles } @@ -118,49 +102,55 @@ workflow PREALNQC { // Collect Software Versions CUSTOM_DUMPSOFTWAREVERSIONS (ch_versions.unique{ it.text }.collectFile(name: 'collated_versions.yml')) - - // upload QC files and metadata to song/score - if (!params.local_mode) { - // make metadata and files match - ch_metadata.map { meta, metadata -> [[id: meta.sample], metadata]} - .unique().set{ ch_meta_metadata } - - ch_meta_metadata.join(ch_meta_qcfiles).join(PREP_METRICS.out.metrics_json) - .set { ch_metadata_upload } - - // // generate payload - PAYLOAD_QCMETRICS( - ch_metadata_upload, CUSTOM_DUMPSOFTWAREVERSIONS.out.yml.collect()) - - // SONG_SCORE_UPLOAD(PAYLOAD_QCMETRICS.out.payload_files) - - // // cleanup - // // Gather files to remove - // ch_files = Channel.empty() - // ch_files = ch_files.mix(STAGE_INPUT.out.sample_files) - // ch_files = ch_files.mix(STAGE_INPUT.out.analysis_meta) - // ch_files = ch_files.mix(FASTQC.out.zip) - // ch_files = ch_files.mix(FASTQC.out.html) - // ch_files = ch_files.mix(CUTADAPT.out.log) - // ch_files = ch_files.mix(CUTADAPT.out.reads) - // ch_files.map{ meta, files -> files} - // .unique() - // .set { ch_files_to_remove1 } - - // PAYLOAD_QCMETRICS.out.payload_files - // .map {meta, payload, files -> files} - // .unique() - // .set { ch_files_to_remove2 } - - // ch_files_to_remove = Channel.empty() - // ch_files_to_remove = ch_files_to_remove.mix(STAGE_INPUT.out.input_files) - 
// ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.report) - // ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.data) - // ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove1) - // ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove2) - // CLEANUP(ch_files_to_remove.unique().collect(), SONG_SCORE_UPLOAD.out.analysis_id) + + // Combine channels to determine upload status and payload creation + // make metadata and files match + STAGE_INPUT.out.meta_analysis.map { meta, metadata -> [[id: meta.sample, study_id: meta.study_id], metadata]} + .unique().set{ ch_meta_metadata } + + ch_meta_metadata.join(ch_meta_qcfiles).join(PREP_METRICS.out.metrics_json) + .set { ch_metadata_files } + + STAGE_INPUT.out.upRdpc.combine(ch_metadata_files) + .map{upRdpc, meta, metadata, files, metrics -> + [[id: meta.id, study_id: meta.study_id, upRdpc: upRdpc], + metadata, files, metrics]} + .branch{ + upload: it[0].upRdpc + }.set{ch_metadata_files_status} + + // generate payload + PAYLOAD_QCMETRICS( + ch_metadata_files_status.upload, CUSTOM_DUMPSOFTWAREVERSIONS.out.yml.collect()) + + SONG_SCORE_UPLOAD(PAYLOAD_QCMETRICS.out.payload_files) + + if (params.cleanup) { + // cleanup + // Gather files to remove + ch_files = Channel.empty() + ch_files = ch_files.mix(STAGE_INPUT.out.meta_files) + ch_files = ch_files.mix(STAGE_INPUT.out.meta_analysis) + ch_files = ch_files.mix(FASTQC.out.zip) + ch_files = ch_files.mix(FASTQC.out.html) + ch_files = ch_files.mix(CUTADAPT.out.log) + ch_files = ch_files.mix(CUTADAPT.out.reads) + ch_files.map{ meta, files -> files} + .unique() + .set { ch_files_to_remove1 } + + PAYLOAD_QCMETRICS.out.payload_files + .map {meta, payload, files -> files} + .unique() + .set { ch_files_to_remove2 } + + ch_files_to_remove = Channel.empty() + ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.report) + ch_files_to_remove = ch_files_to_remove.mix(MULTIQC.out.data) + ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove1) + ch_files_to_remove = ch_files_to_remove.mix(ch_files_to_remove2) + CLEANUP(ch_files_to_remove.unique().collect(), SONG_SCORE_UPLOAD.out.analysis_id) } - } /* From a4bfeb5dcab2ce811f2524bad042ddca1ce7d28f Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Sat, 24 Feb 2024 11:05:20 -0500 Subject: [PATCH 2/6] update README.md --- README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9ca5e4b..14704d2 100644 --- a/README.md +++ b/README.md @@ -11,15 +11,20 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 2. Install [`Docker`](https://docs.docker.com/engine/installation/). -3. Download the pipeline and test it on a minimal dataset with a single command: +3. Test the workflow running in `Local` mode on a minimal dataset with a single command: ```bash nextflow run icgc-argo-workflows/prealnqc -profile test,standard ``` -4. Start running your own analysis! +4. Test the workflow running in `RDPC` mode with a single command if you have access to `RDPC-QA` env and have your valid api_token available: ```bash - nextflow run icgc-argo-workflows/prealnqc --input samplesheet.csv --outdir -profile standard + nextflow run icgc-argo-workflows/prealnqc -profile rdpc_qa,test_rdpc_qa,standard --api_token + ``` + +5. Start running your own analysis! 
+ ```bash + nextflow run icgc-argo-workflows/prealnqc -profile standard --input samplesheet.csv --outdir ``` ## Pipeline summary From e20d5c42f63ed75d0730449154c024964f70b554 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Thu, 21 Mar 2024 08:28:30 -0400 Subject: [PATCH 3/6] add updated modules --- .../icgc-argo-workflows/checkinput/main.nf | 56 +++ .../icgc-argo-workflows/checkinput/meta.yml | 43 ++ .../checkinput/resources/usr/bin/dnaaln.py | 476 ++++++++++++++++++ .../checkinput/resources/usr/bin/dnaalnqc.py | 356 +++++++++++++ .../resources/usr/bin/germlinevar.py | 352 +++++++++++++ .../checkinput/resources/usr/bin/prealnqc.py | 416 +++++++++++++++ .../dumpsoftwareversions/environment.yml | 7 + modules/nf-core/cutadapt/environment.yml | 7 + modules/nf-core/fastqc/environment.yml | 7 + modules/nf-core/multiqc/environment.yml | 7 + 10 files changed, 1727 insertions(+) create mode 100644 modules/icgc-argo-workflows/checkinput/main.nf create mode 100644 modules/icgc-argo-workflows/checkinput/meta.yml create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py create mode 100755 modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py create mode 100644 modules/nf-core/custom/dumpsoftwareversions/environment.yml create mode 100644 modules/nf-core/cutadapt/environment.yml create mode 100644 modules/nf-core/fastqc/environment.yml create mode 100644 modules/nf-core/multiqc/environment.yml diff --git a/modules/icgc-argo-workflows/checkinput/main.nf b/modules/icgc-argo-workflows/checkinput/main.nf new file mode 100644 index 0000000..45eff91 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/main.nf @@ -0,0 +1,56 @@ +process CHECKINPUT { + tag "$samplesheet" + label 'process_single' + + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'quay.io/biocontainers/python:3.8.3' }" + + input: + path samplesheet + val workflow_name + + output: + path 'samplesheet.valid.csv', emit: csv + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + case '$workflow_name' in + 'Pre Alignment QC') + echo $workflow_name detected; + prealnqc.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + 'DNA Alignment QC') + dnaalnqc.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + 'DNA Alignment') + dnaaln.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + 'Germline Variant Call') + germlinevar.py \\ + $samplesheet \\ + samplesheet.valid.csv + ;; + *) + echo -n "Unknown workflow" + exit 1 + ;; + esac + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/meta.yml b/modules/icgc-argo-workflows/checkinput/meta.yml new file mode 100644 index 0000000..f19a5f7 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/meta.yml @@ -0,0 +1,43 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "checkinput" +## TODO nf-core: Add a description of the module and list keywords +description: write your description here +keywords: + - sort + - example + - genomics +tools: + - "checkinput": + ## TODO nf-core: Add a description and other details for the software below + description: "" + homepage: "" + documentation: "" + tool_dev_url: "" + doi: "" + licence: "" + +## TODO nf-core: Add a description of all of the variables used as input +input: + # + ## TODO nf-core: Delete / customise this example input + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + +## TODO nf-core: Add a description of all of the variables used as output +output: + # + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + ## TODO nf-core: Delete / customise this example output + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + +authors: + - "@edsu7" diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py new file mode 100755 index 0000000..e18de9b --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaaln.py @@ -0,0 +1,476 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. 
+ + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + lane_col = 'lane', + fastq_1_col = 'fastq_1', + fastq_2_col = 'fastq_2', + library_name_col = 'library_name', + platform_unit_col = 'platform_unit', + platform_col = 'platform', + sequencing_center_col = 'sequencing_center', + sequencing_date_col = 'sequencing_date', + platform_model_col = 'platform_model', + single_end_col = 'single_end', + read_group_count_col = 'read_group_count', + experiment_col = 'experiment', + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). + + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._lane_col = lane_col + self._fastq_1_col = fastq_1_col + self._fastq_2_col = fastq_2_col + self._library_name_col = library_name_col + self._platform_unit_col = platform_unit_col + self._platform_col = platform_col + self._sequencing_center_col = sequencing_center_col + self._sequencing_date_col = sequencing_date_col + self._platform_model_col = platform_model_col + self._single_end_col = single_end_col + self._read_group_count_col = read_group_count_col + self._experiment_col = experiment_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). 
+ + """ + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_lane(row) + self._validate_single_end(row) + self._validate_fastq_1(row) + self._validate_fastq_2(row) + self._validate_library_name(row) + self._validate_platform_unit(row) + self._validate_platform_col(row) if row.get(self._platform_col) else "" + self._validate_sequencing_center_col(row) if row.get(self._sequencing_center_col) else "" + self._validate_sequencing_date_col(row) if row.get(self._sequencing_date_col) else "" + self._validate_platform_model_col(row) if row.get(self._platform_model_col) else "" + self._validate_read_group_count(row) + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_experiment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "lane" : row[self._lane_col], + "fastq_1" : row[self._fastq_1_col], + "fastq_2" : row[self._fastq_2_col] if row.get(self._fastq_2_col) else "NO_FILE", + "single_end" : row[self._single_end_col].lower(), + "read_group_count" : row[self._read_group_count_col], + "experiment" : row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + read_group_info=[] + description=[] + + for col in [ + 'experiment', + 'study_id', + 'experiment', + 'patient', + 'sample', + 'status' + ]: + if tmp_dict.get(col): + if col=='status': + if tmp_dict['status']==1: + description.append("Tumour") + else: + description.append("Normal") + continue + description.append(tmp_dict[col]) + + for col,id in zip( + [ + self._lane_col, + self._sample_col, + self._library_name_col, + self._platform_unit_col, + self._sequencing_center_col, + self._platform_col, + self._platform_model_col, + self._sequencing_date_col + ], + ["ID","SM","LB","PU","CN","PL","PM","DT"]): + if row.get(col): + read_group_info.append("%s:%s" % (id,row[col])) + + tmp_dict['read_group']="'@RG\\t%s\\tDS:%s'" % ("\\t".join(read_group_info),"|".join(description)) + + self._seen.append(row) + self.modified.append(tmp_dict) + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_experiment": + raise AssertionError("analysis_type for \"DNA Alignment\" should be \"sequencing_experiment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient 
is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + + def _validate_lane(self, row): + """Assert that expected lane is correct.""" + if len(row[self._lane_col]) <= 0: + raise AssertionError("'lane' input is required.") + + + def _validate_fastq_1(self, row): + """Assert that expected fastq_1 is correct.""" + if len(row[self._fastq_1_col]) <= 0: + raise AssertionError("'fastq_1' input is required.") + if not ( + row[self._fastq_1_col].endswith(".fq.gz") or + row[self._fastq_1_col].endswith(".fastq.gz") or + row[self._fastq_1_col].endswith(".bam") + ): + raise AssertionError("'fastq_1' incorrect format detected.") + + + def _validate_fastq_2(self, row): + """Assert that expected fastq_2 is correct.""" + if row[self._single_end_col].lower()=="true": + return + + if len(row[self._fastq_2_col]) <= 0: + raise AssertionError("'fastq_2' input is required.") + if row[self._fastq_2_col].endswith(".fastq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fastq.gz","").replace("r2.fastq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fastq.gz","").replace("r1.fastq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".fq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fq.gz","").replace("r2.fq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fq.gz","").replace("r1.fq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".bam"): + if row[self._fastq_2_col]!=row[self._fastq_1_col]: + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + + def _validate_single_end(self, row): + """Assert that expected single_end is correct.""" + if len(row[self._single_end_col]) <= 0: + raise AssertionError("'single_end' input is required.") + if row[self._single_end_col].lower()!="true" and row[self._single_end_col].lower()!="false": + raise AssertionError("'single_end' should be specifed as \"True\" or \"False\".") + + + def _validate_read_group_count(self, row): + """Assert that expected read_group_count is correct.""" + if len(row[self._read_group_count_col]) <= 0: + raise AssertionError("'read_group_count' input is required.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: 
\"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def _validate_library_name(self, row): + """Assert that expected library_name is correct.""" + if len(row[self._library_name_col]) <= 0: + raise AssertionError("'library_name' input is required.") + + def _validate_platform_unit(self, row): + """Assert that expected platform_unit is correct.""" + if len(row[self._platform_unit_col]) <= 0: + raise AssertionError("'platform_unit' input is required.") + + def _validate_platform_col(self, row): + """Assert that expected platform is correct.""" + if len(row[self._platform_col]) <= 0: + raise AssertionError("'platform' input is required.") + + def _validate_sequencing_center_col(self, row): + """Assert that expected sequencing_center is correct.""" + if len(row[self._sequencing_center_col]) <= 0: + raise AssertionError("'sequencing_center' input is required.") + + def _validate_sequencing_date_col(self, row): + """Assert that expected sequencing_date is correct.""" + if len(row[self._sequencing_date_col]) <= 0: + raise AssertionError("'sequencing_date' input is required.") + + def _validate_platform_model_col(self, row): + """Assert that expected platform_model is correct.""" + if len(row[self._platform_model_col]) <= 0: + raise AssertionError("'platform_model' input is required.") + + def validate_unique_fastq(self): + """ + Assert that the combination of FASTQ filename is unique. + """ + tmp=[z['fastq_1'] for z in self.modified]+[z['fastq_2'] for z in self.modified] + + for iter in range(0,len(tmp)): + current_val=tmp.pop(0) + if current_val.endswith(".fastq.gz"): + continue + if current_val.endswith(".fq.gz"): + continue + if current_val=='NO_FILE': + continue + if current_val in tmp: + raise AssertionError("Errors multiple instances of file '%s' detected" % (current_val)) + sys.exit(1) + else: + raise AssertionError("Unexpected file format detected for '%s'" % (current_val)) + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. 
_text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + required_columns = {"sample","lane","fastq_1","fastq_2","single_end","read_group_count","library_name","platform_unit"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json","platform","sequencing_center","sequencing_date","platform_model"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_fastq() + for col in["sample","study_id","sex","patient","experiment","read_group_count","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["lane"]: + checker.validate_unique_values(col) + + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
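Usage sketch (illustrative file names; in the pipeline the wrapping checkinput
module normally supplies both paths):

    python <this_script>.py sample_sheet.csv sample_sheet.checked.csv -l INFO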
+ +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,C0HVY.2,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R1.fq.gz,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R2.fq.gz,'@RG\\tID:C0HVY.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8a\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RE2.1,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R2.fq.gz,'@RG\\tID:D0RE2.1\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8b\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RH0.2,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R2.fq.gz,'@RG\\tID:D0RH0.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8c\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json +''', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py new file mode 100755 index 0000000..4e111cd --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py @@ -0,0 +1,356 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. 
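    Example:
        A minimal, illustrative use on one parsed row (``row`` is a dict from
        ``csv.DictReader`` holding the expected columns); defaults such as the
        genome build are filled in during the transform::

            checker = RowChecker()
            checker.validate_and_transform(row)
            checker.modified[-1]["genome_build"]   # "GRCh38" unless provided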
+ + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + #sample_col="sample", + #first_col="bam_cram", + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + cram_col = 'cram', + crai_col = 'crai', + experiment_col = 'experiment', + genome_build_col = "genome_build", + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). + + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._cram_col = cram_col + self._crai_col = crai_col + self._experiment_col = experiment_col + self._genome_build_col = genome_build_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). 
+ + """ + #{"analysis_type","study_id","patient","sex","status","sample","cram","crai","analysis_json"} + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_cram(row) + self._validate_crai(row) + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_genome_build(row) if row.get(self._genome_build_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_alignment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "cram" : row[self._cram_col], + "crai" : row[self._crai_col], + "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + self._seen.append(row) + self.modified.append(tmp_dict) + + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_alignment": + raise AssertionError("analysis_type for \"DNA Alignment QC\" should be \"sequencing_alignment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + def _validate_cram(self, row): + """Assert that expected cram is correct.""" + if len(row[self._cram_col]) <= 0: + raise AssertionError("'cram' input is required.") + if not row[self._cram_col].endswith(".cram"): + raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + + def 
_validate_crai(self, row): + """Assert that expected crai is correct.""" + if len(row[self._crai_col]) <= 0: + raise AssertionError("'crai' input is required.") + if not row[self._crai_col].endswith(".crai"): + raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") + if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): + raise AssertionError("'cram' and 'crai' file name bodies do not match.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: \"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def _validate_genome_build(self, row): + """Assert that expected genome_build is correct.""" + if len(row[self._genome_build_col]) <= 0: + raise AssertionError("'genome_build' input is required.") + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
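    Note:
        Missing conditional columns are filled with defaults before the output is
        written: study_id -> "LOCAL", patient -> the sample name, sex -> "NA",
        status -> "0", experiment -> "WGS", genome_build -> "GRCh38",
        analysis_json -> None.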
+ + Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + """ + required_columns = {"sample","cram","crai"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + + for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["cram","crai"]: + checker.validate_unique_values(col) + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
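A programmatic sketch equivalent to running this script (illustrative paths):

    from pathlib import Path
    check_samplesheet(Path("samplesheet.csv"), Path("samplesheet.checked.csv"))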
+ +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + ''', + + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py new file mode 100755 index 0000000..caccb36 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + cram_col = 'cram', + crai_col = 'crai', + experiment_col = 'experiment', + genome_build_col = "genome_build", + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). 
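        Example:
            Constructing the checker with explicit column names (illustrative; these
            are already the defaults)::

                checker = RowChecker(cram_col="cram", crai_col="crai")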
+ + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._cram_col = cram_col + self._crai_col = crai_col + self._experiment_col = experiment_col + self._genome_build_col = genome_build_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + #{"analysis_type","study_id","patient","sex","status","sample","cram","crai","analysis_json"} + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_cram(row) + self._validate_crai(row) + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_genome_build(row) if row.get(self._genome_build_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_alignment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "cram" : row[self._cram_col], + "crai" : row[self._crai_col], + "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + self._seen.append(row) + self.modified.append(tmp_dict) + + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_alignment": + raise AssertionError("analysis_type for \"Germline Variant Call\" should be \"sequencing_alignment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + 
raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + def _validate_cram(self, row): + """Assert that expected cram is correct.""" + if len(row[self._cram_col]) <= 0: + raise AssertionError("'cram' input is required.") + if not row[self._cram_col].endswith(".cram"): + raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + + def _validate_crai(self, row): + """Assert that expected crai is correct.""" + if len(row[self._crai_col]) <= 0: + raise AssertionError("'crai' input is required.") + if not row[self._crai_col].endswith(".crai"): + raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") + if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): + raise AssertionError("'cram' and 'crai' file name bodies do not match.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: \"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def _validate_genome_build(self, row): + """Assert that expected genome_build is correct.""" + if len(row[self._genome_build_col]) <= 0: + raise AssertionError("'genome_build' input is required.") + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. 
_text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + + Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + """ + required_columns = {"sample","cram","crai"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + + for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["cram","crai"]: + checker.validate_unique_values(col) + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
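Note on file naming (illustrative): the 'crai' basename must be the 'cram' basename
plus '.crai', e.g.

    SA622744.aln.cram       ->  body "SA622744.aln"
    SA622744.aln.cram.crai  ->  body "SA622744.aln"   (bodies match, row accepted)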
+ +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,cram,crai,analysis_json + sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json + ''', + + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py new file mode 100755 index 0000000..f9d5704 --- /dev/null +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py @@ -0,0 +1,416 @@ +#!/usr/bin/env python + + +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + +import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path + +logger = logging.getLogger() + + +class RowChecker: + """ + Define a service that can validate and transform each given row. + + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. + + """ + + VALID_FORMATS = ( + ".bam", + ".cram", + ) + + def __init__( + self, + analysis_type_col = 'analysis_type', + study_id_col = 'study_id', + patient_col = 'patient', + sex_col = 'sex', + status_col = 'status', + sample_col = 'sample', + lane_col = 'lane', + fastq_1_col = 'fastq_1', + fastq_2_col = 'fastq_2', + single_end_col = 'single_end', + read_group_count_col = 'read_group_count', + experiment_col = 'experiment', + analysis_json_col = 'analysis_json', + **kwargs, + ): + """ + Initialize the row checker with the expected column names. +analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). 
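        Example:
            Illustrative expectations per row: a paired-end row carries
            single_end="False" with both fastq_1 and fastq_2 set, while a
            single-end row sets single_end="True", in which case fastq_2 is
            not validated.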
+ + """ + super().__init__(**kwargs) + self._analysis_type_col = analysis_type_col + self._study_id_col = study_id_col + self._patient_col = patient_col + self._sex_col = sex_col + self._status_col = status_col + self._sample_col = sample_col + self._lane_col = lane_col + self._fastq_1_col = fastq_1_col + self._fastq_2_col = fastq_2_col + self._single_end_col = single_end_col + self._read_group_count_col = read_group_count_col + self._experiment_col = experiment_col + self._analysis_json_col = analysis_json_col + self._seen = [] + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + self._validate_analysis_type(row) if row.get(self._analysis_type_col) else "" + self._validate_study_id(row) if row.get(self._study_id_col) else "" + self._validate_patient(row) if row.get(self._patient_col) else "" + self._validate_sex(row) if row.get(self._sex_col) else "" + self._validate_status(row) if row.get(self._status_col) else "" + self._validate_sample(row) + self._validate_lane(row) + self._validate_single_end(row) + self._validate_fastq_1(row) + self._validate_fastq_2(row) + self._validate_read_group_count(row) if row.get(self._read_group_count_col) else "" + self._validate_experiment(row) if row.get(self._experiment_col) else "" + self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" + + tmp_dict={ + "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_experiment", + "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", + "patient" : row[self._patient_col] if row.get(self._patient_col) else row[self._sample_col], + "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", + "status" : row[self._status_col] if row.get(self._status_col) else "0", + "sample" : row[self._sample_col], + "lane" : row[self._lane_col], + "fastq_1" : row[self._fastq_1_col], + "fastq_2" : row[self._fastq_2_col], + "single_end" : row[self._single_end_col].lower(), + "read_group_count" : row[self._read_group_count_col] if row.get(self._read_group_count_col) else None, + "experiment" : row[self._experiment_col] if row.get(self._experiment_col) else "WGS", + "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None + } + + self._seen.append(row) + self.modified.append(tmp_dict) + + + def _validate_analysis_type(self, row): + """Assert that expected analysis is correct.""" + if len(row[self._analysis_type_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._analysis_type_col]!="sequencing_experiment": + raise AssertionError("analysis_type for \"Pre Alignment QC\" should be \"sequencing_experiment\"") + + def _validate_study_id(self, row): + """Assert that expected study_id is correct.""" + if len(row[self._study_id_col]) <= 0: + raise AssertionError("'study_id' input is required.") + + def _validate_patient(self, row): + """Assert that expected patient is correct.""" + if len(row[self._patient_col]) <= 0: + raise AssertionError("'patient' input is required.") + + def _validate_sex(self, row): + """Assert that expected sex is correct.""" + if len(row[self._sex_col]) <= 0: + raise AssertionError("'analysis_type' input is required.") + if row[self._sex_col]!="XX" and row[self._sex_col]!="XY" and row[self._sex_col]!="NA": + raise AssertionError("sex should be one 
of the following values : XX,XY,NA") + + def _validate_status(self, row): + """Assert that expected tumour status is correct.""" + if len(row[self._status_col]) <= 0: + raise AssertionError("'status' input is required.") + if row[self._status_col]!="1" and row[self._status_col]!="0": + raise AssertionError("Tumour status should be \"0\" is normal else \"1\"") + + def _validate_sample(self, row): + """Assert that expected sample is correct.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("'sample' input is required.") + + + def _validate_lane(self, row): + """Assert that expected lane is correct.""" + if len(row[self._lane_col]) <= 0: + raise AssertionError("'lane' input is required.") + + + def _validate_fastq_1(self, row): + """Assert that expected fastq_1 is correct.""" + if len(row[self._fastq_1_col]) <= 0: + raise AssertionError("'fastq_1' input is required.") + if not ( + row[self._fastq_1_col].endswith(".fq.gz") or + row[self._fastq_1_col].endswith(".fastq.gz") or + row[self._fastq_1_col].endswith(".bam") + ): + raise AssertionError("'fastq_1' incorrect format detected.") + + + def _validate_fastq_2(self, row): + """Assert that expected fastq_2 is correct.""" + if row[self._single_end_col].lower()=="true": + return + + if len(row[self._fastq_2_col]) <= 0: + raise AssertionError("'fastq_2' input is required.") + if row[self._fastq_2_col].endswith(".fastq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fastq.gz","").replace("r2.fastq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fastq.gz","").replace("r1.fastq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".fq.gz"): + if row[self._fastq_2_col].split("/")[-1].replace("R2.fq.gz","").replace("r2.fq.gz","")!=row[self._fastq_1_col].split("/")[-1].replace("R1.fq.gz","").replace("r1.fq.gz",""): + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + if row[self._fastq_2_col].endswith(".bam"): + if row[self._fastq_2_col]!=row[self._fastq_1_col]: + raise AssertionError("'fastq_1' and 'fastq_2' prefix differ.") + + + def _validate_single_end(self, row): + """Assert that expected single_end is correct.""" + if len(row[self._single_end_col]) <= 0: + raise AssertionError("'single_end' input is required.") + if row[self._single_end_col].lower()!="true" and row[self._single_end_col].lower()!="false": + raise AssertionError("'single_end' should be specifed as \"True\" or \"False\".") + + + def _validate_read_group_count(self, row): + """Assert that expected read_group_count is correct.""" + if len(row[self._read_group_count_col]) <= 0: + raise AssertionError("'read_group_count' input is required.") + + def _validate_experiment(self, row): + """Assert that expected Experiment is correct.""" + if len(row[self._experiment_col]) <= 0: + raise AssertionError("'experiment' input is required.") + for val in ["WGS","WXS","RNA-Seq","Bisulfite-Seq","ChIP-Seq","Targeted-Seq"]: + if val==row[self._experiment_col]: + return + raise AssertionError("'experiment' type does not match the following: \"WGS\",\"WXS\",\"RNA-Seq\",\"Bisulfite-Seq\",\"ChIP-Seq\",\"Targeted-Seq\".") + + + def _validate_analysis_json(self, row): + """Assert that expected analysis_json is correct.""" + if len(row[self._analysis_json_col]) <= 0: + raise AssertionError("'analysis_json' input is required.") + if not row[self._analysis_json_col].endswith(".json"): + raise AssertionError("'analysis_json' input should have the suffix \".json\".") + + def validate_unique_fastq(self): 
+ """ + Assert that the combination of FASTQ filename is unique. + """ + tmp=[z['fastq_1'] for z in self.modified]+[z['fastq_2'] for z in self.modified] + + for iter in range(0,len(tmp)): + current_val=tmp.pop(0) + if current_val.endswith(".fastq.gz"): + continue + if current_val.endswith(".fq.gz"): + continue + if current_val=='NO_FILE': + continue + if current_val in tmp: + raise AssertionError("Errors multiple instances of file '%s' detected" % (current_val)) + sys.exit(1) + else: + raise AssertionError("Unexpected file format detected for '%s'" % (current_val)) + + + def validate_unique_values(self,col): + """ + Assert a single unique value exists in array + """ + if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) + sys.exit(1) + + def validate_common_values(self,col): + """ + Assert each value in array is unique + """ + if len(set([z[col] for z in self.modified]))!=1: + raise AssertionError("Errors multiple values detected for '%s'. Only a single value should be used" % (col)) + sys.exit(1) + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file + + """ + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect + + +def check_samplesheet(file_in, file_out): + """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. 
+ + Example: + This function checks that the samplesheet follows the following structure, + see also the `viral recon samplesheet`_:: + + analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,C0HVY.2,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R1.fq.gz,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R2.fq.gz,'@RG\tID:C0HVY.2\tSM:SA624380\tLB:Pond-147580\tPU:74_8a\tPI:298\tCN:EXT\tPL:ILLUMINA\tPM:HiSeq 2000\tDT:2014-12-12\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RE2.1,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R2.fq.gz,'@RG\tID:D0RE2.1\tSM:SA624380\tLB:Pond-147580\tPU:74_8b\tPI:298\tCN:EXT\tPL:ILLUMINA\tPM:HiSeq 2000\tDT:2014-12-12\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RH0.2,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R2.fq.gz,'@RG\tID:D0RH0.2\tSM:SA624380\tLB:Pond-147580\tPU:74_8c\tPI:298\tCN:EXT\tPL:ILLUMINA\tPM:HiSeq 2000\tDT:2014-12-12\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + + """ + required_columns = {"sample","lane","fastq_1","fastq_2","single_end"} + conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} + + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames) and not conditional_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") + sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_fastq() + for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + checker.validate_common_values(col) + for col in ["lane"]: + checker.validate_unique_values(col) + + + header = checker.modified[0].keys() + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog=\ + ''' +Check that the tabular samplesheet has the structure expected by nf-core pipelines. + +Validate the general shape of the table, expected columns, and each row. 
Also add +an additional column which records whether one or two FASTQ reads were found. + +Args: +file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. +file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + +Example: + This function checks that the samplesheet follows the following structure, + + analysis_type,study_id,patient,sex,status,sample,lane,fastq_1,fastq_2,read_group,single_end,read_group_count,analysis_json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,C0HVY.2,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R1.fq.gz,TEST-QA.DO263089.SA624380.C0HVY.2.8775eee1cacedc27428856591023d837_R2.fq.gz,'@RG\\tID:C0HVY.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8a\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RE2.1,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RE2.1.b8ac1a3b5b52ced6068b28c4e9b4e5e9_R2.fq.gz,'@RG\\tID:D0RE2.1\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8b\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json + sequencing_experiment,TEST-QA,DO263089,XX,1,SA624380,D0RH0.2,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R1.fq.gz,TEST-QA.DO263089.SA624380.D0RH0.2.231146e66d802729c719428e33e555a8_R2.fq.gz,'@RG\\tID:D0RH0.2\\tSM:SA624380\\tLB:Pond-147580\\tPU:74_8c\\tPI:298\\tCN:EXT\\tPL:ILLUMINA\\tPM:HiSeq 2000\\tDT:2014-12-12\\tDS:WGS|TEST-QA|SP224367|DO263089|Cell line - derived from tumour|Tumour',False,3,WXS,875ef550-e536-4456-9ef5-50e5362456df.analysis.json +''', + formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + + +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 0000000..9b3272b --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/cutadapt/environment.yml 
b/modules/nf-core/cutadapt/environment.yml new file mode 100644 index 0000000..d32a8f9 --- /dev/null +++ b/modules/nf-core/cutadapt/environment.yml @@ -0,0 +1,7 @@ +name: cutadapt +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cutadapt=3.4 diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 0000000..1787b38 --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 0000000..7625b75 --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 From 055b57a9053b04ea10de841dd819ded0e8333855 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Tue, 9 Apr 2024 12:38:32 -0400 Subject: [PATCH 4/6] update stage_input fixes --- modules.json | 28 ++++-- .../checkinput/resources/usr/bin/dnaalnqc.py | 53 ++++++----- .../resources/usr/bin/germlinevar.py | 51 ++++++----- .../checkinput/resources/usr/bin/prealnqc.py | 5 +- .../prep/sample/resources/usr/bin/main.py | 16 ++-- .../samtools/index/environment.yml | 8 ++ .../samtools/index/main.nf | 48 ++++++++++ .../samtools/index/meta.yml | 57 ++++++++++++ .../tabix/tabix/environment.yml | 9 ++ .../icgc-argo-workflows/tabix/tabix/main.nf | 42 +++++++++ .../icgc-argo-workflows/tabix/tabix/meta.yml | 45 ++++++++++ .../icgc-argo-workflows/stage_input/main.nf | 90 ++++++++++++++----- 12 files changed, 356 insertions(+), 96 deletions(-) create mode 100644 modules/icgc-argo-workflows/samtools/index/environment.yml create mode 100644 modules/icgc-argo-workflows/samtools/index/main.nf create mode 100644 modules/icgc-argo-workflows/samtools/index/meta.yml create mode 100644 modules/icgc-argo-workflows/tabix/tabix/environment.yml create mode 100644 modules/icgc-argo-workflows/tabix/tabix/main.nf create mode 100644 modules/icgc-argo-workflows/tabix/tabix/meta.yml diff --git a/modules.json b/modules.json index dd1f6b2..2d60334 100644 --- a/modules.json +++ b/modules.json @@ -6,8 +6,8 @@ "modules": { "icgc-argo-workflows": { "checkinput": { - "branch": "stage_input_fixB", - "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", + "branch": "main", + "git_sha": "e1f2b946b457eac191c0fa97ae1d159a15874c6b", "installed_by": ["stage_input"] }, "cleanup": { @@ -26,12 +26,17 @@ "installed_by": ["modules"] }, "prep/sample": { - "branch": "stage_input_fixB", - "git_sha": "f253d1e6d4dc5f6ac0e6440041ee7e55b8203e35", + "branch": "main", + "git_sha": "dbd4c7b18c86be15f2ca341085d90c0488545d53", "installed_by": ["stage_input"] }, + "samtools/index": { + "branch": "main", + "git_sha": "3f8cbdb457ed1b642b4f9b079850f2a92da9fcc0", + "installed_by": ["modules", "stage_input"] + }, "score/download": { - "branch": "stage_input_fixB", + "branch": "main", "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, @@ -41,7 +46,7 @@ "installed_by": ["song_score_upload"] }, "song/get": { - "branch": "stage_input_fixB", + "branch": "main", "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_download"] }, @@ -59,13 +64,18 @@ "branch": "main", "git_sha": "19ee48fdf1672ef9723e3093531be7ddea3e27ec", "installed_by": ["song_score_upload"] + }, + "tabix/tabix": { + "branch": 
"main", + "git_sha": "e1f2b946b457eac191c0fa97ae1d159a15874c6b", + "installed_by": ["stage_input"] } } }, "subworkflows": { "icgc-argo-workflows": { "song_score_download": { - "branch": "stage_input_fixB", + "branch": "main", "git_sha": "92aa620385099e94401c22b8633cc55ed34ca10e", "installed_by": ["stage_input"] }, @@ -75,8 +85,8 @@ "installed_by": ["subworkflows"] }, "stage_input": { - "branch": "stage_input_fixB", - "git_sha": "af24d4d6b59921ee048c304926897567ac956b00", + "branch": "main", + "git_sha": "e9dfe346ae3334973f406be3051a1091cad1dca6", "installed_by": ["subworkflows"] } } diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py index 4e111cd..afea641 100755 --- a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/dnaalnqc.py @@ -10,7 +10,7 @@ import sys from collections import Counter from pathlib import Path - +import os logger = logging.getLogger() @@ -39,8 +39,8 @@ def __init__( sex_col = 'sex', status_col = 'status', sample_col = 'sample', - cram_col = 'cram', - crai_col = 'crai', + bam_cram_col = 'bam_cram', + bai_crai_col = 'bai_crai', experiment_col = 'experiment', genome_build_col = "genome_build", analysis_json_col = 'analysis_json', @@ -68,8 +68,8 @@ def __init__( self._sex_col = sex_col self._status_col = status_col self._sample_col = sample_col - self._cram_col = cram_col - self._crai_col = crai_col + self._bam_cram_col = bam_cram_col + self._bai_crai_col = bai_crai_col self._experiment_col = experiment_col self._genome_build_col = genome_build_col self._analysis_json_col = analysis_json_col @@ -92,13 +92,13 @@ def validate_and_transform(self, row): self._validate_patient(row) if row.get(self._patient_col) else "" self._validate_status(row) if row.get(self._status_col) else "" self._validate_sample(row) - self._validate_cram(row) - self._validate_crai(row) + self._validate_bam_cram(row) + self._validate_bai_crai(row) if row.get(self._bai_crai_col) else "" self._validate_experiment(row) if row.get(self._experiment_col) else "" self._validate_genome_build(row) if row.get(self._genome_build_col) else "" self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" - + print(row) tmp_dict={ "analysis_type" : row[self._analysis_type_col] if row.get(self._analysis_type_col) else "sequencing_alignment", "study_id" : row[self._study_id_col] if row.get(self._study_id_col) else "LOCAL", @@ -106,8 +106,8 @@ def validate_and_transform(self, row): "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", "status" : row[self._status_col] if row.get(self._status_col) else "0", "sample" : row[self._sample_col], - "cram" : row[self._cram_col], - "crai" : row[self._crai_col], + "bam_cram" : row[self._bam_cram_col], + "bai_crai" : row[self._bai_crai_col] if row.get(self._bai_crai_col) else None, "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None @@ -153,21 +153,19 @@ def _validate_sample(self, row): if len(row[self._sample_col]) <= 0: raise AssertionError("'sample' input is required.") - def _validate_cram(self, row): + def _validate_bam_cram(self, row): """Assert that expected cram is correct.""" - if len(row[self._cram_col]) <= 0: - raise AssertionError("'cram' input is 
required.") - if not row[self._cram_col].endswith(".cram"): - raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + if len(row[self._bam_cram_col]) <= 0: + raise AssertionError("'bam_cram' input is required.") + if not row[self._bam_cram_col].endswith(".cram") and not row[self._bam_cram_col].endswith(".bam"): + raise AssertionError("'bam_cram' input format is incorrect, ensure file ends with '.bam' or '.cram'") - def _validate_crai(self, row): + def _validate_bai_crai(self, row): """Assert that expected crai is correct.""" - if len(row[self._crai_col]) <= 0: - raise AssertionError("'crai' input is required.") - if not row[self._crai_col].endswith(".crai"): - raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") - if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): - raise AssertionError("'cram' and 'crai' file name bodies do not match.") + if not row[self._bai_crai_col].endswith(".crai") and not row[self._bai_crai_col].endswith(".bai"): + raise AssertionError("'bai_crai' input format is incorrect, ensure file ends with '.crai' or '.bai'") + if row[self._bai_crai_col].split("/")[-1].replace(".cram.crai","").replace(".bam.bai","")!=row[self._bam_cram_col].split("/")[-1].replace(".cram","").replace(".bam",""): + raise AssertionError("'bam_cram' and 'bai_crai' file name bodies do not match.") def _validate_experiment(self, row): """Assert that expected Experiment is correct.""" @@ -195,7 +193,7 @@ def validate_unique_values(self,col): """ Assert a single unique value exists in array """ - if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + if len(set([z[col] for z in self.modified if z[col] is not None]))!=len([z[col] for z in self.modified if z[col] is not None]): raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) sys.exit(1) @@ -260,7 +258,7 @@ def check_samplesheet(file_in, file_out): analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json """ - required_columns = {"sample","cram","crai"} + required_columns = {"sample","bam_cram"} conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
@@ -280,9 +278,10 @@ def check_samplesheet(file_in, file_out): logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) - for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: - checker.validate_common_values(col) - for col in ["cram","crai"]: + #Check unnncessary for dnaalnqc + #for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + # checker.validate_common_values(col) + for col in ["bam_cram","bai_crai"]: checker.validate_unique_values(col) header = checker.modified[0].keys() diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py index caccb36..3ff44ac 100755 --- a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/germlinevar.py @@ -10,7 +10,7 @@ import sys from collections import Counter from pathlib import Path - +import os logger = logging.getLogger() @@ -37,8 +37,8 @@ def __init__( sex_col = 'sex', status_col = 'status', sample_col = 'sample', - cram_col = 'cram', - crai_col = 'crai', + bam_cram_col = 'bam_cram', + bai_crai_col = 'bai_crai', experiment_col = 'experiment', genome_build_col = "genome_build", analysis_json_col = 'analysis_json', @@ -66,8 +66,8 @@ def __init__( self._sex_col = sex_col self._status_col = status_col self._sample_col = sample_col - self._cram_col = cram_col - self._crai_col = crai_col + self._bam_cram_col = bam_cram_col + self._bai_crai_col = bai_crai_col self._experiment_col = experiment_col self._genome_build_col = genome_build_col self._analysis_json_col = analysis_json_col @@ -90,8 +90,8 @@ def validate_and_transform(self, row): self._validate_patient(row) if row.get(self._patient_col) else "" self._validate_status(row) if row.get(self._status_col) else "" self._validate_sample(row) - self._validate_cram(row) - self._validate_crai(row) + self._validate_bam_cram(row) + self._validate_bai_crai(row) if row.get(self._bai_crai_col) else "" self._validate_experiment(row) if row.get(self._experiment_col) else "" self._validate_genome_build(row) if row.get(self._genome_build_col) else "" self._validate_analysis_json(row) if row.get(self._analysis_json_col) else "" @@ -103,8 +103,8 @@ def validate_and_transform(self, row): "sex" : row[self._sex_col] if row.get(self._sex_col) else "NA", "status" : row[self._status_col] if row.get(self._status_col) else "0", "sample" : row[self._sample_col], - "cram" : row[self._cram_col], - "crai" : row[self._crai_col], + "bam_cram" : row[self._bam_cram_col], + "bai_crai" : row[self._bai_crai_col] if row.get(self._bai_crai_col) else None, "experiment": row[self._experiment_col] if row.get(self._experiment_col) else "WGS", "genome_build": row[self._genome_build_col] if row.get(self._genome_build_col) else "GRCh38", "analysis_json": row[self._analysis_json_col] if row.get(self._analysis_json_col) else None @@ -150,21 +150,19 @@ def _validate_sample(self, row): if len(row[self._sample_col]) <= 0: raise AssertionError("'sample' input is required.") - def _validate_cram(self, row): + def _validate_bam_cram(self, row): """Assert that expected cram is correct.""" - if len(row[self._cram_col]) <= 0: - raise AssertionError("'cram' input is required.") - if not row[self._cram_col].endswith(".cram"): - raise AssertionError("'cram' input format is incorrect, ensure file ends with '.cram'") + if len(row[self._bam_cram_col]) <= 0: + raise AssertionError("'bam_cram' input is required.") 
+ if not row[self._bam_cram_col].endswith(".cram") and not row[self._bam_cram_col].endswith(".bam"): + raise AssertionError("'bam_cram' input format is incorrect, ensure file ends with '.bam' or '.cram'") - def _validate_crai(self, row): + def _validate_bai_crai(self, row): """Assert that expected crai is correct.""" - if len(row[self._crai_col]) <= 0: - raise AssertionError("'crai' input is required.") - if not row[self._crai_col].endswith(".crai"): - raise AssertionError("'crai' input format is incorrect, ensure file ends with '.crai'") - if row[self._crai_col].split("/")[-1].replace(".cram.crai","")!=row[self._cram_col].split("/")[-1].replace(".cram",""): - raise AssertionError("'cram' and 'crai' file name bodies do not match.") + if not row[self._bai_crai_col].endswith(".crai") and not row[self._bai_crai_col].endswith(".bai"): + raise AssertionError("'bai_crai' input format is incorrect, ensure file ends with '.crai' or '.bai'") + if row[self._bai_crai_col].split("/")[-1].replace(".cram.crai","").replace(".bam.bai","")!=row[self._bam_cram_col].split("/")[-1].replace(".cram","").replace(".bam",""): + raise AssertionError("'bam_cram' and 'bai_crai' file name bodies do not match.") def _validate_experiment(self, row): """Assert that expected Experiment is correct.""" @@ -191,7 +189,7 @@ def validate_unique_values(self,col): """ Assert a single unique value exists in array """ - if len(set([z[col] for z in self.modified]))!=len([z[col] for z in self.modified]): + if len(set([z[col] for z in self.modified if z[col] is not None]))!=len([z[col] for z in self.modified if z[col] is not None]): raise AssertionError("Errors duplicates values detected for '%s'. Each row should have an unique value" % (col)) sys.exit(1) @@ -255,7 +253,7 @@ def check_samplesheet(file_in, file_out): analysis_type,study_id,patient,sex,status,sample,cram,crai,genome_build,analysis_json sequencing_alignment,TEST-QA,DO262466,XY,1,SA622744,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram,TEST-QA.DO262466.SA622744.wxs.20210712.aln.cram.crai,WXS,hg38,4f6d6ddf-3759-4a30-ad6d-df37591a3033.analysis.json """ - required_columns = {"sample","cram","crai"} + required_columns = {"sample","bam_cram"} conditional_columns = {"study_id","sex","patient","status","experiment","analysis_json"} # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
@@ -275,9 +273,10 @@ def check_samplesheet(file_in, file_out): logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) - for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: - checker.validate_common_values(col) - for col in ["cram","crai"]: + # Check unnecessary for gerrmlinevar + #for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + # checker.validate_common_values(col) + for col in ["bam_cram","bai_crai"]: checker.validate_unique_values(col) header = checker.modified[0].keys() diff --git a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py index f9d5704..241aa18 100755 --- a/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py +++ b/modules/icgc-argo-workflows/checkinput/resources/usr/bin/prealnqc.py @@ -337,8 +337,9 @@ def check_samplesheet(file_in, file_out): logger.critical(f"{str(error)} On line {i + 2}.") sys.exit(1) checker.validate_unique_fastq() - for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: - checker.validate_common_values(col) + #Check unnncessary for prealnqc + #for col in["sample","study_id","sex","patient","experiment","status","analysis_json"]: + # checker.validate_common_values(col) for col in ["lane"]: checker.validate_unique_values(col) diff --git a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py index c13d3dc..56b9a61 100755 --- a/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py +++ b/modules/icgc-argo-workflows/prep/sample/resources/usr/bin/main.py @@ -334,18 +334,18 @@ def main(): elif analysis_type == 'sequencing_alignment': for fp in args.input_files: - if fp.endswith('cram'): - cram = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) - os.symlink(os.path.abspath(fp), cram) - elif fp.endswith('crai'): - crai = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) - os.symlink(os.path.abspath(fp), crai) + if fp.endswith('cram') or fp.endswith('bam'): + bam_cram = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) + os.symlink(os.path.abspath(fp), bam_cram) + elif fp.endswith('crai') or fp.endswith('bai'): + bai_crai = os.path.join(os.getcwd(), args.outdir, os.path.basename(fp)) + os.symlink(os.path.abspath(fp), bai_crai) else: sys.exit("Error: not supported input file format") with open(output_sample_sheet, 'w', newline='') as f: csvwriter = csv.writer(f, delimiter=',') - csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','cram','crai',"genome_build",'experiment', 'analysis_json']) - csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, cram, crai, genome_build,experiment, metadata_json]) + csvwriter.writerow(['analysis_type','study_id','patient','sex','status','sample','bam_cram','bai_crai',"genome_build",'experiment', 'analysis_json']) + csvwriter.writerow([analysis_type, study_id, donor_id, sex, status, sample_id, bam_cram, bai_crai, genome_build,experiment, metadata_json]) elif analysis_type == 'variant_calling': for fp in song_analysis['files']: diff --git a/modules/icgc-argo-workflows/samtools/index/environment.yml b/modules/icgc-argo-workflows/samtools/index/environment.yml new file mode 100644 index 0000000..a5e5064 --- /dev/null +++ b/modules/icgc-argo-workflows/samtools/index/environment.yml @@ -0,0 +1,8 @@ +name: samtools_index +channels: + - conda-forge 
+ - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/icgc-argo-workflows/samtools/index/main.nf b/modules/icgc-argo-workflows/samtools/index/main.nf new file mode 100644 index 0000000..dc14f98 --- /dev/null +++ b/modules/icgc-argo-workflows/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/icgc-argo-workflows/samtools/index/meta.yml b/modules/icgc-argo-workflows/samtools/index/meta.yml new file mode 100644 index 0000000..01a4ee0 --- /dev/null +++ b/modules/icgc-argo-workflows/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/icgc-argo-workflows/tabix/tabix/environment.yml b/modules/icgc-argo-workflows/tabix/tabix/environment.yml new file mode 100644 index 0000000..8233baa --- /dev/null +++ b/modules/icgc-argo-workflows/tabix/tabix/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "tabix_tabix" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::tabix=1.11" diff --git a/modules/icgc-argo-workflows/tabix/tabix/main.nf b/modules/icgc-argo-workflows/tabix/tabix/main.nf new file mode 100644 index 0000000..0076f98 --- /dev/null +++ b/modules/icgc-argo-workflows/tabix/tabix/main.nf @@ -0,0 +1,42 @@ +process TABIX_TABIX { + tag "$meta.id" + label 'process_single' + + conda "bioconda::tabix=1.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0': + 'biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(tab) + + output: + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + tuple val(meta), path("*.csi"), optional:true, emit: csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + tabix $args $tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${tab}.tbi + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/icgc-argo-workflows/tabix/tabix/meta.yml b/modules/icgc-argo-workflows/tabix/tabix/meta.yml new file mode 100644 index 0000000..3c4e2e0 --- /dev/null +++ b/modules/icgc-argo-workflows/tabix/tabix/meta.yml @@ -0,0 +1,45 @@ +name: tabix_tabix +description: create tabix index from a sorted bgzip tab-delimited genome file +keywords: + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file compressed with bgzip + pattern: "*.{bed.gz,gff.gz,sam.gz,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - tbi: + type: file + description: tabix index file + pattern: "*.{tbi}" + - csi: + type: file + description: coordinate sorted index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" \ No newline at end of file diff --git a/subworkflows/icgc-argo-workflows/stage_input/main.nf b/subworkflows/icgc-argo-workflows/stage_input/main.nf index b34f760..cc66f72 100644 --- a/subworkflows/icgc-argo-workflows/stage_input/main.nf +++ b/subworkflows/icgc-argo-workflows/stage_input/main.nf @@ -2,6 +2,9 @@ include { SONG_SCORE_DOWNLOAD } from '../../icgc-argo-workflows/song_score_download/main' include { PREP_SAMPLE } from '../../../modules/icgc-argo-workflows/prep/sample/main' include { CHECKINPUT } from '../../../modules/icgc-argo-workflows/checkinput/main' +include { SAMTOOLS_INDEX as BAM_INDEX } from '../../../modules/icgc-argo-workflows/samtools/index/main' +include { SAMTOOLS_INDEX as CRAM_INDEX } from '../../../modules/icgc-argo-workflows/samtools/index/main' +include { TABIX_TABIX } from '../../../modules/icgc-argo-workflows/tabix/tabix/main' workflow STAGE_INPUT { @@ -55,8 +58,7 @@ workflow STAGE_INPUT { exit 1, "When no API_TOKEN is provided, a local samplesheet must be provided." } } - //Collect meta,data files and analysis_json - //Two channels for meta,files and meta,analysis_json will be refined afterwards + //Collect meta,data files and analysis_json from new samplesheet.csv and handle approrpiately analysis_input .collectFile(keepHeader: true, name: 'sample_sheet.csv') .splitCsv(header:true) @@ -76,7 +78,7 @@ workflow STAGE_INPUT { experiment:row.experiment, single_end : row.single_end.toBoolean() ], - [file(row.fastq_1), file(row.fastq_2)], + [file(row.fastq_1,checkIfExists: true), file(row.fastq_2,checkIfExists: true)], row.analysis_json ) } else if (row.analysis_type == "sequencing_experiment" && row.single_end.toLowerCase() == 'true') { @@ -94,7 +96,7 @@ workflow STAGE_INPUT { experiment:row.experiment, single_end : row.single_end.toBoolean() ], - [file(row.fastq_1)], + [file(row.fastq_1,checkIfExists: true)], row.analysis_json ) } else if (row.analysis_type == "sequencing_alignment") { @@ -108,8 +110,8 @@ workflow STAGE_INPUT { status:row.status.toInteger(), genome_build:row.genome_build, experiment:row.experiment, - data_type:'cram'], - [file(row.cram), file(row.crai)], + data_type: "${row.bam_cram}".replaceAll(/^.*\./,"").toLowerCase()], + [file(row.bam_cram,checkIfExists: true), row.bai_crai], row.analysis_json ) } @@ -126,7 +128,7 @@ workflow STAGE_INPUT { genome_build:row.genome_build, experiment:row.experiment, data_type:'vcf'], - [file(row.vcf), file(row.tbi)], + [file(row.vcf,checkIfExists: true), row.tbi], row.analysis_json ) } @@ -143,28 +145,17 @@ workflow STAGE_INPUT { genome_build:row.genome_build, experiment:row.experiment, data_type:'tgz'], - [file(row.qc_file)], + [file(row.qc_file,checkIfExists: true)], row.analysis_json ) } } - .set { ch_input_sample } + .set {ch_input_sample} - //We want to still have meta when analysis_json doesn't exist - ch_input_sample.map{ meta,files,analysis -> - if (analysis){ - tuple([meta,file(analysis)]) - } else { - tuple([meta,null]) - } - } - .unique{it[1]} - .set{ ch_meta_analysis } - - //Reorganize files as "sequencing_experiment expected input is tuple while other types are flat" + //Reorganize files as flat tuple except "sequencing_experiment 
ch_input_sample.map{ meta,files,analysis -> if (meta.analysis_type == "sequencing_experiment"){ - tuple([meta,files]) + tuple([meta,files]) //tuple([meta,[read1,read2]]) } else if (meta.analysis_type == "sequencing_alignment") { tuple([meta,files[0],files[1]]) } else if (meta.analysis_type == "variant_calling") { @@ -172,12 +163,63 @@ workflow STAGE_INPUT { } else if (meta.analysis_type == "qc_metrics") { tuple([meta,files[0]]) } - }.set{ch_meta_files} + }.branch{ //identify files that require indexing + bam_to_index : it[0].analysis_type=='sequencing_alignment' && it[2].isEmpty() && it[0].data_type=='bam' + return tuple([it[0],it[1]]) + cram_to_index : it[0].analysis_type=='sequencing_alignment' && it[2].isEmpty() && it[0].data_type=='cram' + return tuple([it[0],it[1]]) + vcf_to_index : it[0].analysis_type=='variant_calling' && it[2].isEmpty() + return tuple([it[0],it[1]]) + indexed : (it[0].analysis_type=='sequencing_alignment' && ! it[2].isEmpty()) | (it[0].analysis_type=='variant_calling' && ! it[2].isEmpty()) + return tuple([it[0],it[1],it[2]]) + others: (it[0].analysis_type=='sequencing_experiment') | (it[0].analysis_type=='qc_metrics') + return tuple([it[0],it[1]]) + }.set{ch_index_split} + + + //Perform indexiing + BAM_INDEX(ch_index_split.bam_to_index) + CRAM_INDEX(ch_index_split.cram_to_index) + TABIX_TABIX(ch_index_split.vcf_to_index) + + + //Combine BAM and BAI into single channel + ch_index_split.bam_to_index.join(BAM_INDEX.out.bai) //[meta,bam,bai] + .set{indexed_bam} + + //Combine CRAM and CRAI into single channel + ch_index_split.cram_to_index.join(CRAM_INDEX.out.crai) //[meta,cram,crai] + .set{indexed_cram} + + //Combine VCF and TBI into single channel + ch_index_split.vcf_to_index.join(TABIX_TABIX.out.tbi) //[meta,vcf,tbi] + .set{indexed_vcf} + + //Combine newly indexed files, previously indexed and others into single channel + Channel.empty() + .mix(indexed_bam) + .mix(indexed_cram) + .mix(indexed_vcf) + .mix(ch_index_split.indexed) + .mix(ch_index_split.others) + .set{ch_meta_files} + + + //We want to still have meta when analysis_json doesn't exist + ch_input_sample.map{ meta,files,analysis -> + if (analysis){ + tuple([meta,file(analysis,checkIfExists: true)]) + } else { + tuple([meta,null]) + } + } + .unique{it[1]} + .set{ ch_meta_analysis } emit: meta_analysis = ch_meta_analysis // channel: [ val(meta), analysis_json] meta_files = ch_meta_files // channel: [ val(meta), [ files ] ] - upRdpc = upRdpc_flag + upRdpc = upRdpc_flag // [boolean] versions = ch_versions // channel: [ versions.yml ] } \ No newline at end of file From f7348dc95b74aab9c2a9dc6c7a61a00b1f08a066 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Mon, 15 Apr 2024 16:49:25 -0400 Subject: [PATCH 5/6] fix the prefix of the path to the asset files --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index bc75b89..a0be0a6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,5 +21,5 @@ params { // input data local_mode = true - input = 'assets/tests/csv/sample_sheet.csv' + input = "${projectDir}/assets/tests/csv/sample_sheet.csv" } From aeed6c4e8795ca36575efeb8436d3821d1292c15 Mon Sep 17 00:00:00 2001 From: Linda Xiang Date: Tue, 23 Apr 2024 16:31:07 -0400 Subject: [PATCH 6/6] add one test case for running workflow in rdpc --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 14704d2..6031aed 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,12 @@ The pipeline is built using 
[Nextflow](https://www.nextflow.io), a workflow tool ``` 5. Start running your own analysis! + + If you are getting the input data from & sending output data to the ICGC-ARGO data center, and you have a valid `api_token`, you can run the workflow with: + ```bash + nextflow run icgc-argo-workflows/prealnqc -profile <docker/singularity>,standard --api_token <api_token> --study_id <study_id> --analysis_ids <analysis_ids> + ``` + + Otherwise, you can provide the path to the input data in `samplesheet.csv` and run the workflow with: ```bash nextflow run icgc-argo-workflows/prealnqc -profile standard --input samplesheet.csv --outdir <outdir> ```
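
For the local `--input` mode, the bundled test configuration gives a quick way to exercise the pipeline end to end. The command below is only a sketch, not part of these patches: it assumes `conf/test.config` is exposed as a `test` profile in `nextflow.config`, and the output directory name is illustrative.

```bash
# Sketch of a local-mode smoke test (assumes conf/test.config is wired up as the `test` profile).
# That config sets local_mode = true and points --input at
# ${projectDir}/assets/tests/csv/sample_sheet.csv, as updated in PATCH 5/6.
nextflow run icgc-argo-workflows/prealnqc \
    -profile test,standard \
    --outdir results
```

Because the test profile already supplies `--input`, no sample sheet path needs to be passed on the command line for this check; a real analysis would instead follow one of the two invocations shown above.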