From 3153c9fad0e66ab47d8895c79febcef8c11440d2 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 2 Dec 2022 11:29:01 +0000 Subject: [PATCH 01/98] first commit --- main.nf | 147 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 79 insertions(+), 68 deletions(-) diff --git a/main.nf b/main.nf index 82979f2..a03b4de 100644 --- a/main.nf +++ b/main.nf @@ -46,19 +46,26 @@ log.info "" // Check input parameters // ---------------------------------------------------*/ -if(params.input) { +if(params.families_file) { Channel - .fromPath( "${params.input}" ) - .ifEmpty { exit 1, "VCF file: ${params.input} not found"} - .into { ch_vcf ; ch_vcf_inspect; ch_vcf_for_geneyx } + .fromPath( "${params.families_file}") + .ifEmpty { exit 1, "Family file: ${params.families_file} not found"} + .set {ch_vcf} } else { - exit 1, "please specify VCF file with --input parameter" + exit 1, "please specify Family file with --family_file parameter" } +Channel + .fromPath(params.families_file) + .ifEmpty { exit 1, "Cannot find input file : ${params.input}" } + .splitCsv(skip:1, sep:'\t') + .map { run_id, proband_id, hpo, vcf_path, vcf_index_path, proband_sex, mother_id, father_id -> [ run_id, proband_id, hpo, file(vcf_path), file(vcf_index_path), proband_sex, mother_id, father_id ] } + .set {ch_input} + // Conditional creation of channels, custom if provided else default from bin/ projectDir = workflow.projectDir ch_application_properties = params.application_properties ? Channel.value(file(params.application_properties)) : Channel.fromPath("${projectDir}/bin/application.properties") -ch_auto_config_yml = params.auto_config_yml ? Channel.value(file(params.auto_config_yml)) : Channel.fromPath("${projectDir}/bin/auto_config.yml") +ch_auto_config_yml = params.auto_config_yml ? Channel.value(file(params.auto_config_yml)) : Channel.fromPath("${projectDir}/bin/auto_config.yml") // Stage scripts from bin ch_add_exomiser_fields_script = Channel.value(file("${projectDir}/bin/add_exomiser_fields_to_genotiers.js")) @@ -78,20 +85,33 @@ println(selected_prioritisers) selected_analysis_mode = params.analysis_mode.split(',').collect{it.trim()} if (!checkParameterList(selected_analysis_mode, analysisModesList)) exit 1, "Unknown analysis mode, the available options are:\n$analysisModesList" - -// Prevent an error in AWSBatch (when running by awsbatch executor) -// by which this file is taken as /home/ubuntu/hpo_terms_file.txt instead of its correct path. 
-hpo_terms_filename = "${projectDir}/${params.hpo_terms_file}" +ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") + +/*-------------------------------------------------- + Create PED and HPO file from design +---------------------------------------------------*/ -Channel.fromPath("${params.hpo_terms_file}") - .splitCsv(sep: ',', skip: 1) - .unique() - .map {it -> it.toString().replaceAll("\\[", "").replaceAll("\\]", "")} - .map {it -> "'"+it.trim()+"'"} - .reduce { a, b -> "$a,$b" } - .into { ch_hpo_terms_file ; ch_hpo_terms_file_inspect; ch_hpo_terms } -ch_hpo_terms_file_inspect.dump(tag:'ch_hpo_terms (retrieve_hpo_terms: false)') +//remove +//ch_vcf_inspect.dump(tag:'ch_vcf') +if (params.ped_file) ped_ch = Channel.value(file(params.ped_file)) +if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) + + +if(!params.ped_file & !params.hpo_file){ + process ped_hpo_creation { + publishDir "${params.outdir}/familyfile/", mode: 'copy' + input: + file family_file from ch_vcf + output: + file "*-HPO.txt" into hpo_ch + file "*.ped" into ped_ch + script: + """ + python3 $baseDir/${params.py_file} --input_family $family_file + """ + } +} /*-------------------------------------------------- Run containarised Exomiser @@ -99,14 +119,15 @@ ch_hpo_terms_file_inspect.dump(tag:'ch_hpo_terms (retrieve_hpo_terms: false)') ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") -ch_vcf_inspect.dump(tag:'ch_vcf') process exomiser { tag "${vcf}-${prioritiser}" publishDir "${params.outdir}/${sample_name}", mode: 'copy' input: - file(vcf) from ch_vcf + set run_id, proband_id, hpo, file(vcf_path), file(vcf_index_path), proband_sex, mother_id, father_id from ch_input + file "${proband_id}-HPO.txt" from hpo_ch + file "${proband_id}.ped" from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -114,7 +135,6 @@ process exomiser { file(application_properties) from ch_application_properties file(auto_config_yml) from ch_auto_config_yml file(exomiser_data) from ch_exomiser_data - val(hpo_terms) from ch_hpo_terms each prioritiser from selected_prioritisers output: @@ -129,58 +149,49 @@ process exomiser { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ + echo "$vcf_path" # link the staged/downloaded data to predefined path - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + #ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + #ln -s "\$PWD/${vcf_path}" in.vcf - ls -l # Workaround for symlinked files not found - HPO_TERMS="${hpo_terms}" - - # error if no HPO term found - if [[ "\${HPO_TERMS}" == "null" ]]; then - echo "WARNING: No HPO terms found. So this step of exomiser is skipped, No report will be generated." 
- echo "Please check HPO terms for the patient in the clinical-portal for whom this sample belongs - ${sample_name}" - # solutions for AWS batch - touch no_hpo_term.html - touch no_hpo_term.vcf - touch no_hpo_term.json - touch no_hpo_term.yml - mkdir -p MultiQC - touch MultiQC/no_hpo_term.html - - else - # Modify auto_config.to pass the params - cp ${auto_config_yml} new_auto_config.yml - - # Swap placeholders with user provided values - sed -i "s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml - sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml - sed -i "s/vcf_placeholder/${vcf}/" new_auto_config.yml - sed -i "s/output_prefix_placeholder/sample-${vcf.simpleName}/" new_auto_config.yml - sed -i "s/prioritiser_placeholder/${prioritiser}/" new_auto_config.yml - sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/" new_auto_config.yml - sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/" new_auto_config.yml - sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/" new_auto_config.yml - - # Printing (ls, see files; cat, injected values validation) - ${params.debug_script} - cat new_auto_config.yml - - # Run Exomiser - ${exomiser} \ - --analysis new_auto_config.yml \ - --spring.config.location=$application_properties \ - --exomiser.data-directory='.' - - # Create the slot for CloudOS html report preview - mkdir MultiQC - cp *.html MultiQC/multiqc_report.html - sed -i "s/Anonymous/${sample_name}/" MultiQC/multiqc_report.html - fi + HPO_TERMS="${proband_id}-HPO.txt" + VCF_PATH="in.vcf" + + + # Modify auto_config.to pass the params + cp ${auto_config_yml} new_auto_config.yml + + # Swap placeholders with user provided values + sed -i "s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml + sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml + sed -i "s/vcf_placeholder/\$VCF_PATH" new_auto_config.yml + sed -i "s/output_prefix_placeholder/sample-${vcf_path.simpleName}/" new_auto_config.yml + sed -i "s/prioritiser_placeholder/${prioritiser}/" new_auto_config.yml + sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/" new_auto_config.yml + sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/" new_auto_config.yml + sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/" new_auto_config.yml + sed -i "s/ped:/ped: ${proband_id}.ped/" new_auto_config.yml + + # Printing (ls, see files; cat, injected values validation) + ${params.debug_script} + cat new_auto_config.yml + + # Run Exomiser + ${exomiser} \ + --analysis new_auto_config.yml \ + --spring.config.location=$application_properties \ + --exomiser.data-directory='.' 
+ + # Create the slot for CloudOS html report preview + mkdir MultiQC + cp *.html MultiQC/multiqc_report.html + sed -i "s/Anonymous/${proband_id}/" MultiQC/multiqc_report.html + """ }else{ """ - wget -O ${sample_name}.tsv ${params.mock_exomiser_output_https_url} + wget -O ${proband_id}.tsv ${params.mock_exomiser_output_https_url} """ } } @@ -217,7 +228,7 @@ def checkParameterList(list, realList) { } /*-------------------------------------------------- - Definitions of accepted values for params + Definitions of accepted values for params ---------------------------------------------------*/ From b47872f00a15c257d64957e7f14729fd748db8e4 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 2 Dec 2022 13:30:26 +0000 Subject: [PATCH 02/98] dummy ex process --- main.nf | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index a03b4de..b9b04ce 100644 --- a/main.nf +++ b/main.nf @@ -121,7 +121,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { - tag "${vcf}-${prioritiser}" + tag "${vcf_path}" publishDir "${params.outdir}/${sample_name}", mode: 'copy' input: @@ -142,6 +142,7 @@ process exomiser { file("*AR.variants.tsv") optional true file("*yml") optional true file("MultiQC/*.html") optional true + file("in.vcf") script: final_step = "finished" @@ -151,14 +152,15 @@ process exomiser { """ echo "$vcf_path" # link the staged/downloaded data to predefined path - #ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - #ln -s "\$PWD/${vcf_path}" in.vcf + ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + ln -s "\$PWD/${vcf_path}" in.vcf # Workaround for symlinked files not found HPO_TERMS="${proband_id}-HPO.txt" VCF_PATH="in.vcf" + # Modify auto_config.to pass the params cp ${auto_config_yml} new_auto_config.yml @@ -175,18 +177,18 @@ process exomiser { # Printing (ls, see files; cat, injected values validation) ${params.debug_script} - cat new_auto_config.yml + #cat new_auto_config.yml # Run Exomiser - ${exomiser} \ - --analysis new_auto_config.yml \ - --spring.config.location=$application_properties \ - --exomiser.data-directory='.' + #${exomiser} \ + #--analysis new_auto_config.yml \ + #--spring.config.location=$application_properties \ + #--exomiser.data-directory='.' 
# Create the slot for CloudOS html report preview - mkdir MultiQC - cp *.html MultiQC/multiqc_report.html - sed -i "s/Anonymous/${proband_id}/" MultiQC/multiqc_report.html + #mkdir MultiQC + #cp *.html MultiQC/multiqc_report.html + #sed -i "s/Anonymous/${proband_id}/" MultiQC/multiqc_report.html """ }else{ From 949e701fc8565dfe5efc0aa18fc9480b33935656 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 2 Dec 2022 13:49:18 +0000 Subject: [PATCH 03/98] row split --- main.nf | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/main.nf b/main.nf index b9b04ce..afdb989 100644 --- a/main.nf +++ b/main.nf @@ -55,11 +55,18 @@ if(params.families_file) { exit 1, "please specify Family file with --family_file parameter" } +// Channel +// .fromPath(params.families_file) +// .ifEmpty { exit 1, "Cannot find input file : ${params.input}" } +// .splitCsv(skip:1, sep:'\t') +// .map { run_id, proband_id, hpo, vcf_path, vcf_index_path, proband_sex, mother_id, father_id -> [ run_id, proband_id, hpo, file(vcf_path), file(vcf_index_path), proband_sex, mother_id, father_id ] } +// .set {ch_input} + Channel .fromPath(params.families_file) .ifEmpty { exit 1, "Cannot find input file : ${params.input}" } - .splitCsv(skip:1, sep:'\t') - .map { run_id, proband_id, hpo, vcf_path, vcf_index_path, proband_sex, mother_id, father_id -> [ run_id, proband_id, hpo, file(vcf_path), file(vcf_index_path), proband_sex, mother_id, father_id ] } + .splitCsv(skip:1, sep:'\t', strip: true) + .map {row -> [ row.run_id, row.proband_id, row.hpo, file(row.vcf_path), file(row.vcf_index_path), row.proband_sex, row.mother_id, row.father_id ] } .set {ch_input} // Conditional creation of channels, custom if provided else default from bin/ @@ -121,13 +128,13 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { - tag "${vcf_path}" + tag "${vcf_path1}" publishDir "${params.outdir}/${sample_name}", mode: 'copy' input: - set run_id, proband_id, hpo, file(vcf_path), file(vcf_index_path), proband_sex, mother_id, father_id from ch_input - file "${proband_id}-HPO.txt" from hpo_ch - file "${proband_id}.ped" from ped_ch + set run_id, proband_id, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input + file "${proband_id1}-HPO.txt" from hpo_ch + file "${proband_id1}.ped" from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -142,8 +149,6 @@ process exomiser { file("*AR.variants.tsv") optional true file("*yml") optional true file("MultiQC/*.html") optional true - file("in.vcf") - script: final_step = "finished" if (!params.mock_exomiser) { @@ -153,14 +158,13 @@ process exomiser { echo "$vcf_path" # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - ln -s "\$PWD/${vcf_path}" in.vcf + ln -s "\$PWD/${vcf_path1}" in.vcf # Workaround for symlinked files not found - HPO_TERMS="${proband_id}-HPO.txt" + HPO_TERMS="${proband_id1}-HPO.txt" VCF_PATH="in.vcf" - # Modify auto_config.to pass the params cp ${auto_config_yml} new_auto_config.yml @@ -168,32 +172,32 @@ process exomiser { sed -i "s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml sed -i "s/vcf_placeholder/\$VCF_PATH" new_auto_config.yml - 
sed -i "s/output_prefix_placeholder/sample-${vcf_path.simpleName}/" new_auto_config.yml + sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/" new_auto_config.yml sed -i "s/prioritiser_placeholder/${prioritiser}/" new_auto_config.yml sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/" new_auto_config.yml - sed -i "s/ped:/ped: ${proband_id}.ped/" new_auto_config.yml + sed -i "s/ped:/ped: ${proband_id1}.ped/" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) ${params.debug_script} - #cat new_auto_config.yml + cat new_auto_config.yml # Run Exomiser - #${exomiser} \ - #--analysis new_auto_config.yml \ - #--spring.config.location=$application_properties \ - #--exomiser.data-directory='.' + ${exomiser} \ + --analysis new_auto_config.yml \ + --spring.config.location=$application_properties \ + --exomiser.data-directory='.' # Create the slot for CloudOS html report preview - #mkdir MultiQC - #cp *.html MultiQC/multiqc_report.html - #sed -i "s/Anonymous/${proband_id}/" MultiQC/multiqc_report.html + mkdir MultiQC + cp *.html MultiQC/multiqc_report.html + sed -i "s/Anonymous/${proband_id1}/" MultiQC/multiqc_report.html """ }else{ """ - wget -O ${proband_id}.tsv ${params.mock_exomiser_output_https_url} + wget -O ${proband_id1}.tsv ${params.mock_exomiser_output_https_url} """ } } From ec2fe69b074c3bf5317813dd1540b1b98137d64b Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 2 Dec 2022 14:00:50 +0000 Subject: [PATCH 04/98] row split --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index afdb989..73e062f 100644 --- a/main.nf +++ b/main.nf @@ -65,7 +65,7 @@ if(params.families_file) { Channel .fromPath(params.families_file) .ifEmpty { exit 1, "Cannot find input file : ${params.input}" } - .splitCsv(skip:1, sep:'\t', strip: true) + .splitCsv(header:true, sep:'\t', strip: true) .map {row -> [ row.run_id, row.proband_id, row.hpo, file(row.vcf_path), file(row.vcf_index_path), row.proband_sex, row.mother_id, row.father_id ] } .set {ch_input} @@ -132,7 +132,7 @@ process exomiser { publishDir "${params.outdir}/${sample_name}", mode: 'copy' input: - set run_id, proband_id, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input + set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file "${proband_id1}-HPO.txt" from hpo_ch file "${proband_id1}.ped" from ped_ch //The following is expected when CADD is omitted, @@ -155,7 +155,7 @@ process exomiser { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ - echo "$vcf_path" + echo "$vcf_path1" # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle ln -s "\$PWD/${vcf_path1}" in.vcf From 68c0840310dd16535fc1c9f1704e6f33808c5283 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 2 Dec 2022 15:38:49 +0000 Subject: [PATCH 05/98] sed g fix --- main.nf | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/main.nf b/main.nf index 73e062f..6e9c204 100644 --- a/main.nf +++ b/main.nf @@ -171,13 +171,13 @@ process exomiser { # Swap placeholders with user provided values sed -i 
"s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml - sed -i "s/vcf_placeholder/\$VCF_PATH" new_auto_config.yml - sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/" new_auto_config.yml - sed -i "s/prioritiser_placeholder/${prioritiser}/" new_auto_config.yml - sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/" new_auto_config.yml - sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/" new_auto_config.yml - sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/" new_auto_config.yml - sed -i "s/ped:/ped: ${proband_id1}.ped/" new_auto_config.yml + sed -i "s/vcf_placeholder/\$VCF_PATH/g" new_auto_config.yml + sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/g" new_auto_config.yml + sed -i "s/prioritiser_placeholder/${prioritiser}/g" new_auto_config.yml + sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml + sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml + sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml + sed -i "s/ped:/ped: ${proband_id1}.ped/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) ${params.debug_script} From 4a7bf21f43b9908cf5cee2a8578826e5289a9fe9 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 5 Dec 2022 14:53:15 +0000 Subject: [PATCH 06/98] hpo list --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 6e9c204..ce8b2a6 100644 --- a/main.nf +++ b/main.nf @@ -161,7 +161,7 @@ process exomiser { ln -s "\$PWD/${vcf_path1}" in.vcf # Workaround for symlinked files not found - HPO_TERMS="${proband_id1}-HPO.txt" + HPO_TERMS=`cat ${proband_id1}-HPO.txt` VCF_PATH="in.vcf" From 88a2af614e2f660b019a1144cf378f367136b192 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 5 Dec 2022 17:04:05 +0000 Subject: [PATCH 07/98] vcf path rm ln --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index ce8b2a6..a388d17 100644 --- a/main.nf +++ b/main.nf @@ -158,11 +158,11 @@ process exomiser { echo "$vcf_path1" # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - ln -s "\$PWD/${vcf_path1}" in.vcf + # Workaround for symlinked files not found HPO_TERMS=`cat ${proband_id1}-HPO.txt` - VCF_PATH="in.vcf" + # Modify auto_config.to pass the params @@ -171,7 +171,7 @@ process exomiser { # Swap placeholders with user provided values sed -i "s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml - sed -i "s/vcf_placeholder/\$VCF_PATH/g" new_auto_config.yml + sed -i "s/vcf_placeholder/${vcf_path1}/g" new_auto_config.yml sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/g" new_auto_config.yml sed -i "s/prioritiser_placeholder/${prioritiser}/g" new_auto_config.yml sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml From d41c9ac3070c17ed7d8c41446c6526354590d76e Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 10:18:31 +0000 Subject: [PATCH 08/98] proband placeholder --- bin/auto_config.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/auto_config.yml b/bin/auto_config.yml index 4de86dc..e5acaf2 100644 --- 
a/bin/auto_config.yml +++ b/bin/auto_config.yml @@ -7,7 +7,7 @@ analysis: genomeAssembly: hg38 vcf: vcf_placeholder ped: - proband: + proband: proband_placeholder hpoIds: [hpo_ids_placeholder] # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an # allele to be considered as a causative candidate under that mode of inheritance. @@ -112,12 +112,12 @@ analysis: ] outputOptions: outputContributingVariantsOnly: false - #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results + #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results numGenes: 0 #outputPrefix options: specify the path/filename without an extension and this will be added - # according to the outputFormats option. If unspecified this will default to the following: + # according to the outputFormats option. If unspecified this will default to the following: # {exomiserDir}/results/input-vcf-name-exomiser-results.html - # alternatively, specify a fully qualifed path only. e.g. /users/jules/exomes/analysis + # alternatively, specify a fully qualifed path only. e.g. /users/jules/exomes/analysis outputPrefix: output_prefix_placeholder #out-format options: HTML, JSON, TSV_GENE, TSV_VARIANT, VCF (default: HTML) - outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF] \ No newline at end of file + outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF] From 1de6293cfe72c248d751cdc50072f9423087bf89 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 10:35:43 +0000 Subject: [PATCH 09/98] proband placeholder --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index a388d17..75faa5d 100644 --- a/main.nf +++ b/main.nf @@ -178,6 +178,7 @@ process exomiser { sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml sed -i "s/ped:/ped: ${proband_id1}.ped/g" new_auto_config.yml + sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) ${params.debug_script} From 6190d7cfa1161917dbbde37eb33bf3e0fd3947f1 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 14:55:47 +0000 Subject: [PATCH 10/98] update exomiser|PED|parser_py --- bin/ped_module_man.py | 69 +++++++++++++++++++++++++++++++++++++++++++ main.nf | 28 +++++++----------- nextflow.config | 3 +- 3 files changed, 82 insertions(+), 18 deletions(-) create mode 100644 bin/ped_module_man.py diff --git a/bin/ped_module_man.py b/bin/ped_module_man.py new file mode 100644 index 0000000..e2a5d7c --- /dev/null +++ b/bin/ped_module_man.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# RPC 291122 +# Aim take family file and convert to passed +import pandas as pd +import os +import argparse +from pathlib import Path + +# local test +# os.chdir("/Users/ryancardenas/Documents/exomiser-pipeline-nf/bin") +# input_dat = pd.read_csv("familytest.tsv", sep="\t") + +#build arg parser here +parser = argparse.ArgumentParser(description='Create PED file from family file - Exomiser') +parser.add_argument('--input_family', nargs=1, required=True, help='Enter the path for the familt TSV file') +args = parser.parse_args() + + +#bamfile set +input = str(args.input_family[0]) +input_dat = pd.read_csv(input, sep="\t", skipinitialspace = True) + + +def PED_function(run_ID, proband_ID, vcf_path, vcf_index, proband_sex, mother_ID, father_ID): + # 
Extract + output_name = (f"{proband_ID}.ped") + print(f"creating {proband_ID}.ped") + + text_file = open(f"{output_name}", "w") + + # extract filename without extention or path + file_base = Path(f"{vcf_path}").stem + file_base = Path(f"{file_base}").stem + + if proband_sex == 'M' or proband_sex == 'Male': + proband_sex2 = "1" + elif proband_sex == 'F' or proband_sex == 'Female': + proband_sex2 = "2" + else: + proband_sex2 = proband_sex + + template = f"""#FamilyID\tIndividualID\tPaternalID\tMaternalID\tSex\tPhenotype +{run_ID}\t{proband_ID}\t{father_ID}\t{mother_ID}\t{proband_sex2}\t2 +{run_ID}\t{mother_ID}\t0\t0\t2\t1 +{run_ID}\t{father_ID}\t0\t0\t1\t1 + """ + print(template) + #save PED using bash + n = text_file.write(template) + text_file.close() + print(f"finished {proband_ID}.ped") + +for index, row in input_dat.iterrows(): + + # define variables + run_id1 = row["run_id"] + proband_id1 = row["proband_id"] + hpo1 = row["hpo"] + mother_id1 = row["mother_id"] + father_id1 = row["father_id"] + vcf_path1 = row["vcf_path"] + vcf_index_path1 = row["vcf_index_path"] + proband_sex1 = row["proband_sex"] + + PED_function(run_id1,proband_id1, vcf_path1, vcf_index_path1, proband_sex1, mother_id1, father_id1) + + # create HPO file here. + os.system(f"rm -fr {proband_id1}-HPO.txt" ) + os.system(f"echo '{hpo1}' > {proband_id1}-HPO.txt") diff --git a/main.nf b/main.nf index 75faa5d..9326abd 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,5 @@ #!/usr/bin/env nextflow - +nextflow.enable.dsl=1 import groovy.json.* /* @@ -16,13 +16,12 @@ c_white = "\033[0;37m"; c_yellow = "\033[0;33m"; c_purple = "\033[0;35m"; -sample_name = params.sample_name // Header log info log.info "-${c_purple}\nPARAMETERS SUMMARY${c_reset}-" log.info "-${c_teal}config:${c_reset}- ${params.config}" -log.info "-${c_teal}input:${c_reset}- ${params.input}" -log.info "-${c_teal}sample_name:${c_reset}- ${sample_name}" -log.info "-${c_teal}filename_hpo:${c_reset}- ${params.filename_hpo}" +log.info "-${c_teal}filename_design_file:${c_reset}- ${params.families_file}" +if(params.hpo_file) log.info "-${c_teal}filename_hpo:${c_reset}- ${params.filename_hpo}" +if(params.ped_file) log.info "-${c_teal}filename_ped:${c_reset}- ${params.ped_file}" log.info "-${c_teal}analysis_mode:${c_reset}- ${params.analysis_mode}" log.info "-${c_teal}exomiser_data:${c_reset}- ${params.exomiser_data}" log.info "-${c_teal}exomiser_phenotype_data:${c_reset}- ${params.exomiser_phenotype_data}" @@ -34,7 +33,6 @@ log.info "-${c_teal}min_priority_score:${c_reset}- ${params.min_priority_score}" log.info "-${c_teal}application_properties:${c_reset}- ${params.application_properties}" log.info "-${c_teal}auto_config_yml:${c_reset}- ${params.auto_config_yml}" log.info "-${c_teal}exomiser_data_directory:${c_reset}- ${params.exomiser_data_directory}" -log.info "-${c_teal}hpo terms from a file:${c_reset}- ${params.hpo_terms_file}" log.info "-${c_teal}exomiser_container_tag:${c_reset}- ${params.exomiser_container_tag}" log.info "-${c_teal}debug_script:${c_reset}- ${params.debug_script}" log.info "-${c_teal}echo:${c_reset}- ${params.echo}" @@ -55,16 +53,11 @@ if(params.families_file) { exit 1, "please specify Family file with --family_file parameter" } -// Channel -// .fromPath(params.families_file) -// .ifEmpty { exit 1, "Cannot find input file : ${params.input}" } -// .splitCsv(skip:1, sep:'\t') -// .map { run_id, proband_id, hpo, vcf_path, vcf_index_path, proband_sex, mother_id, father_id -> [ run_id, proband_id, hpo, file(vcf_path), file(vcf_index_path), proband_sex, 
mother_id, father_id ] } -// .set {ch_input} + Channel .fromPath(params.families_file) - .ifEmpty { exit 1, "Cannot find input file : ${params.input}" } + .ifEmpty { exit 1, "Cannot find input file : ${params.families_file}" } .splitCsv(header:true, sep:'\t', strip: true) .map {row -> [ row.run_id, row.proband_id, row.hpo, file(row.vcf_path), file(row.vcf_index_path), row.proband_sex, row.mother_id, row.father_id ] } .set {ch_input} @@ -94,7 +87,7 @@ selected_analysis_mode = params.analysis_mode.split(',').collect{it.trim()} if (!checkParameterList(selected_analysis_mode, analysisModesList)) exit 1, "Unknown analysis mode, the available options are:\n$analysisModesList" ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") - +ch_ped_parser_py = Channel.fromPath("${params.ped_parser_py}") /*-------------------------------------------------- Create PED and HPO file from design ---------------------------------------------------*/ @@ -104,18 +97,19 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") if (params.ped_file) ped_ch = Channel.value(file(params.ped_file)) if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) - if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { + container 'broadinstitute/gatk' publishDir "${params.outdir}/familyfile/", mode: 'copy' input: file family_file from ch_vcf + file(ped_parser_py) from ch_ped_parser_py output: file "*-HPO.txt" into hpo_ch file "*.ped" into ped_ch script: """ - python3 $baseDir/${params.py_file} --input_family $family_file + python3 $ped_parser_py --input_family $family_file """ } } @@ -129,7 +123,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" - publishDir "${params.outdir}/${sample_name}", mode: 'copy' + publishDir "${params.outdir}/${proband_id1}", mode: 'copy' input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input diff --git a/nextflow.config b/nextflow.config index 7f2e5cc..4ee7f7c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,7 @@ params { filename_hpo = '' sample_name = null config = 'conf/standard.config' + ped_parser_py = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/ped_module_man.py' exomiser_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/hg38' exomiser_phenotype_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/2102_phenotype' cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs' @@ -38,7 +39,7 @@ params { debug_script = "ls -l" echo = false errorStrategy = 'terminate' - + // container versions exomiser_container_tag = '12.1.0' cloudos_cli_container_tag = '0.0.2' From b1ec0c453473aaa55ed19f21efae4904985ac254 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 15:08:20 +0000 Subject: [PATCH 11/98] remove dsl1 param --- main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.nf b/main.nf index 9326abd..627da51 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,4 @@ #!/usr/bin/env nextflow -nextflow.enable.dsl=1 import groovy.json.* /* From a8c7625ef04e734b4ab1fd03c39288e12b4f58f0 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 17:09:26 +0000 Subject: [PATCH 12/98] readlink PED file --- main.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 627da51..0d279ee 100644 --- a/main.nf +++ b/main.nf @@ -127,7 +127,7 @@ process exomiser { input: set run_id, proband_id1, hpo, 
file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file "${proband_id1}-HPO.txt" from hpo_ch - file "${proband_id1}.ped" from ped_ch + file("${proband_id1}.ped") from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -148,13 +148,16 @@ process exomiser { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ - echo "$vcf_path1" + echo "Contents in PED" + cat ${proband_id1}.ped + # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle # Workaround for symlinked files not found HPO_TERMS=`cat ${proband_id1}-HPO.txt` + PED_FILE=`readlink ${proband_id1}.ped` @@ -170,7 +173,7 @@ process exomiser { sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s/ped:/ped: ${proband_id1}.ped/g" new_auto_config.yml + sed -i "s/ped:/ped: \$PED_FILE/g" new_auto_config.yml sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) From 2e16bd0d60cd4aa8a27093e477264ad4a736961b Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 19:16:41 +0000 Subject: [PATCH 13/98] sed colon --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 0d279ee..fd94b3a 100644 --- a/main.nf +++ b/main.nf @@ -174,7 +174,7 @@ process exomiser { sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml sed -i "s/ped:/ped: \$PED_FILE/g" new_auto_config.yml - sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml + sed -i "s:proband_placeholder:${proband_id1}:g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) ${params.debug_script} From 0767c0050f234c33f7bcbd5cd9b20717098c460d Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 6 Dec 2022 20:37:39 +0000 Subject: [PATCH 14/98] sed colon fix --- bin/auto_config.yml | 2 +- main.nf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/auto_config.yml b/bin/auto_config.yml index e5acaf2..60eef5f 100644 --- a/bin/auto_config.yml +++ b/bin/auto_config.yml @@ -6,7 +6,7 @@ analysis: # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt. 
genomeAssembly: hg38 vcf: vcf_placeholder - ped: + ped: ped_placeholder proband: proband_placeholder hpoIds: [hpo_ids_placeholder] # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an diff --git a/main.nf b/main.nf index fd94b3a..9f4c981 100644 --- a/main.nf +++ b/main.nf @@ -173,8 +173,8 @@ process exomiser { sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s/ped:/ped: \$PED_FILE/g" new_auto_config.yml - sed -i "s:proband_placeholder:${proband_id1}:g" new_auto_config.yml + sed -i "s:ped_placeholder:\$PED_FILE:g" new_auto_config.yml + sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) ${params.debug_script} From a13a3827b0afb324ff121cc10cfe0fa3255fbd66 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 08:46:00 +0000 Subject: [PATCH 15/98] ped input by cat --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 9f4c981..153d145 100644 --- a/main.nf +++ b/main.nf @@ -153,7 +153,7 @@ process exomiser { # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - + cat ${proband_id1}.ped > input.ped # Workaround for symlinked files not found HPO_TERMS=`cat ${proband_id1}-HPO.txt` @@ -173,7 +173,7 @@ process exomiser { sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s:ped_placeholder:\$PED_FILE:g" new_auto_config.yml + sed -i "s:ped_placeholder:input.ped:g" new_auto_config.yml sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) From dc895e0c665bf228ec2fba11906f1d557f5cd511 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 11:04:39 +0000 Subject: [PATCH 16/98] file ped no proband --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 153d145..f9c71e0 100644 --- a/main.nf +++ b/main.nf @@ -127,7 +127,7 @@ process exomiser { input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file "${proband_id1}-HPO.txt" from hpo_ch - file("${proband_id1}.ped") from ped_ch + file ped from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -149,7 +149,7 @@ process exomiser { def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ echo "Contents in PED" - cat ${proband_id1}.ped + cat $ped # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle @@ -157,7 +157,7 @@ process exomiser { # Workaround for symlinked files not found HPO_TERMS=`cat ${proband_id1}-HPO.txt` - PED_FILE=`readlink ${proband_id1}.ped` + @@ -173,7 +173,7 @@ process exomiser { sed -i 
"s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s:ped_placeholder:input.ped:g" new_auto_config.yml + sed -i "s:ped_placeholder:${ped}:g" new_auto_config.yml sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) From 9effa1548a7223712ee0e833496c0c8bd8cbaacd Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 14:11:18 +0000 Subject: [PATCH 17/98] Ped_parser container added --- main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index f9c71e0..270dca1 100644 --- a/main.nf +++ b/main.nf @@ -98,7 +98,7 @@ if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { - container 'broadinstitute/gatk' + container 'quay.io/lifebitaiorg/docker-containers_ped_parser:1.0.0' publishDir "${params.outdir}/familyfile/", mode: 'copy' input: file family_file from ch_vcf @@ -127,7 +127,7 @@ process exomiser { input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file "${proband_id1}-HPO.txt" from hpo_ch - file ped from ped_ch + file("${proband_id1}.ped") from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -149,7 +149,7 @@ process exomiser { def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ echo "Contents in PED" - cat $ped + cat ${proband_id1}.ped # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle @@ -157,7 +157,7 @@ process exomiser { # Workaround for symlinked files not found HPO_TERMS=`cat ${proband_id1}-HPO.txt` - + PED_FILE=`${proband_id1}.ped` @@ -173,7 +173,7 @@ process exomiser { sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s:ped_placeholder:${ped}:g" new_auto_config.yml + sed -i "s/ped_placeholder/\$PED_FILE/g" new_auto_config.yml sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) From 7886f44ffb7e49c40299eb7ba58a6e07d3b3e750 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 14:46:20 +0000 Subject: [PATCH 18/98] dockerhub test --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 270dca1..0e01dd1 100644 --- a/main.nf +++ b/main.nf @@ -98,7 +98,7 @@ if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { - container 'quay.io/lifebitaiorg/docker-containers_ped_parser:1.0.0' + container '151515151515/ped_parser_v2' publishDir "${params.outdir}/familyfile/", mode: 'copy' input: file family_file from ch_vcf From 08814b6a45c3439a37530412479c7a2b7610730c Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 15:38:51 +0000 Subject: [PATCH 19/98] 
fix ped cmd bash error --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 0e01dd1..6d7e270 100644 --- a/main.nf +++ b/main.nf @@ -98,7 +98,7 @@ if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { - container '151515151515/ped_parser_v2' + container '151515151515/ped_parser_v2:' publishDir "${params.outdir}/familyfile/", mode: 'copy' input: file family_file from ch_vcf @@ -157,7 +157,7 @@ process exomiser { # Workaround for symlinked files not found HPO_TERMS=`cat ${proband_id1}-HPO.txt` - PED_FILE=`${proband_id1}.ped` + @@ -173,7 +173,7 @@ process exomiser { sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s/ped_placeholder/\$PED_FILE/g" new_auto_config.yml + sed -i "s/ped_placeholder/${proband_id1}.ped/g" new_auto_config.yml sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) From ef381ccfdfcfcdf40b1f0372149be701aacf199f Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 16:05:36 +0000 Subject: [PATCH 20/98] fix ped cmd bash error --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 6d7e270..4738e84 100644 --- a/main.nf +++ b/main.nf @@ -98,7 +98,7 @@ if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { - container '151515151515/ped_parser_v2:' + container '151515151515/ped_parser_v2' publishDir "${params.outdir}/familyfile/", mode: 'copy' input: file family_file from ch_vcf From cbe4d20a5b9df5b4ad69bb990d735e438ed78231 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 7 Dec 2022 16:15:21 +0000 Subject: [PATCH 21/98] add quay image --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 4738e84..610d4be 100644 --- a/main.nf +++ b/main.nf @@ -98,7 +98,7 @@ if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { - container '151515151515/ped_parser_v2' + container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' input: file family_file from ch_vcf From 5e5ff75d9bf57c725e7dfe577f38ae00d28c69b6 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Thu, 8 Dec 2022 13:24:14 +0000 Subject: [PATCH 22/98] collect input --- main.nf | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 610d4be..17d670a 100644 --- a/main.nf +++ b/main.nf @@ -126,16 +126,12 @@ process exomiser { input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - file "${proband_id1}-HPO.txt" from hpo_ch - file("${proband_id1}.ped") from ped_ch + file "${proband_id1}-HPO.txt" from hpo_ch.collect() + file("${proband_id1}.ped") from ped_ch.collect() //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) // this is fine, as when there is no second dir, a fake input.1 is 
generated that will be unused - file(application_properties) from ch_application_properties - file(auto_config_yml) from ch_auto_config_yml - file(exomiser_data) from ch_exomiser_data - each prioritiser from selected_prioritisers output: set file("*.html"),file("*.vcf"), file("*.json") optional true From 0a13943c188cb454b0ab8e0aeb72d681b80106ea Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Thu, 8 Dec 2022 13:41:57 +0000 Subject: [PATCH 23/98] mistaken deletion fix --- main.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main.nf b/main.nf index 17d670a..93d650b 100644 --- a/main.nf +++ b/main.nf @@ -132,6 +132,10 @@ process exomiser { // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused + file(application_properties) from ch_application_properties + file(auto_config_yml) from ch_auto_config_yml + file(exomiser_data) from ch_exomiser_data + each prioritiser from selected_prioritisers output: set file("*.html"),file("*.vcf"), file("*.json") optional true From fc1b22f7b05b9d6ba1d0eeaffb4b01d812f6330e Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Thu, 8 Dec 2022 13:55:03 +0000 Subject: [PATCH 24/98] add maxforks --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 93d650b..83653d2 100644 --- a/main.nf +++ b/main.nf @@ -123,7 +123,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' - + maxForks 50 input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file "${proband_id1}-HPO.txt" from hpo_ch.collect() From 819d08d8d2368fbe3eb58f268c2fb3a49040bb5e Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Thu, 8 Dec 2022 17:22:43 +0000 Subject: [PATCH 25/98] ls it --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 83653d2..e0a9a02 100644 --- a/main.nf +++ b/main.nf @@ -148,6 +148,7 @@ process exomiser { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ + ls -la echo "Contents in PED" cat ${proband_id1}.ped From 2f07bdaf94c269ac4c527d84986339442099711e Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 10:12:33 +0000 Subject: [PATCH 26/98] wildcard input collect --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index e0a9a02..db290da 100644 --- a/main.nf +++ b/main.nf @@ -126,8 +126,8 @@ process exomiser { maxForks 50 input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - file "${proband_id1}-HPO.txt" from hpo_ch.collect() - file("${proband_id1}.ped") from ped_ch.collect() + file "*-HPO.txt" from hpo_ch.collect() + file("*.ped") from ped_ch.collect() //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -148,7 +148,7 @@ process exomiser { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar 
"+"${exomiser_executable}" """ - ls -la + ls -la echo "Contents in PED" cat ${proband_id1}.ped From 64ea1d97485a50c448f4825658bdccd090a01aad Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 11:19:31 +0000 Subject: [PATCH 27/98] rm collect --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index db290da..45e5779 100644 --- a/main.nf +++ b/main.nf @@ -126,8 +126,8 @@ process exomiser { maxForks 50 input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - file "*-HPO.txt" from hpo_ch.collect() - file("*.ped") from ped_ch.collect() + file "${proband_id1}-HPO.txt" from hpo_ch + file "${proband_id1}.ped" from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) From eeb3ab2bb0cfde9d55c4b3cce6e7e284223e3cee Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 14:57:47 +0000 Subject: [PATCH 28/98] split channels --- main.nf | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/main.nf b/main.nf index 45e5779..9dbd1bd 100644 --- a/main.nf +++ b/main.nf @@ -100,12 +100,15 @@ if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' + stageInMode 'copy' input: - file family_file from ch_vcf - file(ped_parser_py) from ch_ped_parser_py + set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input + file family_file from ch_vcf.collect() + file(ped_parser_py) from ch_ped_parser_py.collect() output: - file "*-HPO.txt" into hpo_ch - file "*.ped" into ped_ch + file "${proband_id1}-HPO.txt" into hpo_ch + file "${proband_id1}.ped" into ped_ch + file "${vcf_path1}" into vcf_file_ch script: """ python3 $ped_parser_py --input_family $family_file @@ -125,9 +128,10 @@ process exomiser { publishDir "${params.outdir}/${proband_id1}", mode: 'copy' maxForks 50 input: - set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - file "${proband_id1}-HPO.txt" from hpo_ch - file "${proband_id1}.ped" from ped_ch + //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input + file vcf_path1 from vcf_file_ch + file hpo_file from hpo_ch + file ped_file from ped_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -150,23 +154,18 @@ process exomiser { """ ls -la echo "Contents in PED" - cat ${proband_id1}.ped # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - cat ${proband_id1}.ped > input.ped - - # Workaround for symlinked files not found - HPO_TERMS=`cat ${proband_id1}-HPO.txt` - - + proband_id1=`basename ${ped_file}` + echo \$proband_id1 # Modify auto_config.to pass the params cp ${auto_config_yml} new_auto_config.yml # Swap placeholders with user provided values - sed -i "s/hpo_ids_placeholder/\$HPO_TERMS/g" new_auto_config.yml + sed -i "s/hpo_ids_placeholder/${hpo_file}/g" new_auto_config.yml sed -i 
"s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml sed -i "s/vcf_placeholder/${vcf_path1}/g" new_auto_config.yml sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/g" new_auto_config.yml @@ -174,8 +173,8 @@ process exomiser { sed -i "s/min_priority_score_placeholder/${params.min_priority_score}/g" new_auto_config.yml sed -i "s/keep_non_pathogenic_placeholder/${params.keep_non_pathogenic}/g" new_auto_config.yml sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml - sed -i "s/ped_placeholder/${proband_id1}.ped/g" new_auto_config.yml - sed -i "s/proband_placeholder/${proband_id1}/g" new_auto_config.yml + sed -i "s/ped_placeholder/${ped_file}/g" new_auto_config.yml + sed -i "s/proband_placeholder/\$proband_id1/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) ${params.debug_script} From 83e87cf718359202f54e36f44788ef31fedc7c6c Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 14:59:07 +0000 Subject: [PATCH 29/98] remove exomiser data --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 9dbd1bd..89328d2 100644 --- a/main.nf +++ b/main.nf @@ -138,7 +138,7 @@ process exomiser { // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused file(application_properties) from ch_application_properties file(auto_config_yml) from ch_auto_config_yml - file(exomiser_data) from ch_exomiser_data + //file(exomiser_data) from ch_exomiser_data each prioritiser from selected_prioritisers output: @@ -156,7 +156,7 @@ process exomiser { echo "Contents in PED" # link the staged/downloaded data to predefined path - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + proband_id1=`basename ${ped_file}` echo \$proband_id1 From 3982abad5e2ae4695536a50e1b6ab60ffbee5789 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 15:08:37 +0000 Subject: [PATCH 30/98] add exomiser dat --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 89328d2..9dbd1bd 100644 --- a/main.nf +++ b/main.nf @@ -138,7 +138,7 @@ process exomiser { // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused file(application_properties) from ch_application_properties file(auto_config_yml) from ch_auto_config_yml - //file(exomiser_data) from ch_exomiser_data + file(exomiser_data) from ch_exomiser_data each prioritiser from selected_prioritisers output: @@ -156,7 +156,7 @@ process exomiser { echo "Contents in PED" # link the staged/downloaded data to predefined path - + ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle proband_id1=`basename ${ped_file}` echo \$proband_id1 From 9349889533f94a47950f7ccbe4009d8789397241 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 15:20:22 +0000 Subject: [PATCH 31/98] add exomiser dat --- main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 9dbd1bd..aeec88f 100644 --- a/main.nf +++ b/main.nf @@ -138,7 +138,7 @@ process exomiser { // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused file(application_properties) from ch_application_properties file(auto_config_yml) from ch_auto_config_yml - file(exomiser_data) from ch_exomiser_data + //file(exomiser_data) from ch_exomiser_data each prioritiser from selected_prioritisers output: @@ -156,7 +156,7 @@ process exomiser { echo 
"Contents in PED" # link the staged/downloaded data to predefined path - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + proband_id1=`basename ${ped_file}` echo \$proband_id1 @@ -189,7 +189,7 @@ process exomiser { # Create the slot for CloudOS html report preview mkdir MultiQC cp *.html MultiQC/multiqc_report.html - sed -i "s/Anonymous/${proband_id1}/" MultiQC/multiqc_report.html + sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html """ }else{ From 090be19f8bf62198ad682b1f5e1c6d2df756ee29 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 15:27:39 +0000 Subject: [PATCH 32/98] final --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index aeec88f..5cc98e9 100644 --- a/main.nf +++ b/main.nf @@ -138,7 +138,7 @@ process exomiser { // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused file(application_properties) from ch_application_properties file(auto_config_yml) from ch_auto_config_yml - //file(exomiser_data) from ch_exomiser_data + file(exomiser_data) from ch_exomiser_data each prioritiser from selected_prioritisers output: @@ -156,7 +156,7 @@ process exomiser { echo "Contents in PED" # link the staged/downloaded data to predefined path - + ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle proband_id1=`basename ${ped_file}` echo \$proband_id1 From 3971fdc75268f163ef18ba56ab8c5682812a2f13 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 15:32:42 +0000 Subject: [PATCH 33/98] final2 --- main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 5cc98e9..1fbdb8a 100644 --- a/main.nf +++ b/main.nf @@ -108,9 +108,11 @@ if(!params.ped_file & !params.hpo_file){ output: file "${proband_id1}-HPO.txt" into hpo_ch file "${proband_id1}.ped" into ped_ch - file "${vcf_path1}" into vcf_file_ch + file "copy_${vcf_path1}" into vcf_file_ch script: """ + ls -la + cp ${vcf_path1} copy_${vcf_path1} python3 $ped_parser_py --input_family $family_file """ } From 4e8d2c9c1113ebeb53386441ae6da7d85fdf5c69 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 15:47:56 +0000 Subject: [PATCH 34/98] gunzip basename --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 1fbdb8a..3370ef3 100644 --- a/main.nf +++ b/main.nf @@ -108,11 +108,11 @@ if(!params.ped_file & !params.hpo_file){ output: file "${proband_id1}-HPO.txt" into hpo_ch file "${proband_id1}.ped" into ped_ch - file "copy_${vcf_path1}" into vcf_file_ch + file "${vcf_path1.baseName}" into vcf_file_ch script: """ ls -la - cp ${vcf_path1} copy_${vcf_path1} + gunzip ${vcf_path1} python3 $ped_parser_py --input_family $family_file """ } From 9b72d06632b48d449eeb63b5c0c8b5080100d423 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 Dec 2022 15:48:46 +0000 Subject: [PATCH 35/98] gunzip basename --- main.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/main.nf b/main.nf index 3370ef3..2d999ae 100644 --- a/main.nf +++ b/main.nf @@ -100,7 +100,6 @@ if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' - stageInMode 'copy' input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file family_file from ch_vcf.collect() From 93e6eb3b4f2ab1b26ed1b18db84baea5d5b7ad11 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Fri, 9 
Dec 2022 16:25:56 +0000 Subject: [PATCH 36/98] HPO fix again --- main.nf | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/main.nf b/main.nf index 2d999ae..76ae32e 100644 --- a/main.nf +++ b/main.nf @@ -100,6 +100,7 @@ if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' + stageInMode 'copy' input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file family_file from ch_vcf.collect() @@ -107,11 +108,9 @@ if(!params.ped_file & !params.hpo_file){ output: file "${proband_id1}-HPO.txt" into hpo_ch file "${proband_id1}.ped" into ped_ch - file "${vcf_path1.baseName}" into vcf_file_ch + file "${vcf_path1}" into vcf_file_ch script: """ - ls -la - gunzip ${vcf_path1} python3 $ped_parser_py --input_family $family_file """ } @@ -155,18 +154,15 @@ process exomiser { """ ls -la echo "Contents in PED" - # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - proband_id1=`basename ${ped_file}` + hpo_band1=`cat ${hpo_file}` echo \$proband_id1 - # Modify auto_config.to pass the params cp ${auto_config_yml} new_auto_config.yml - # Swap placeholders with user provided values - sed -i "s/hpo_ids_placeholder/${hpo_file}/g" new_auto_config.yml + sed -i "s/hpo_ids_placeholder/\$hpo_band1/g" new_auto_config.yml sed -i "s/analysis_mode_placeholder/${params.analysis_mode}/g" new_auto_config.yml sed -i "s/vcf_placeholder/${vcf_path1}/g" new_auto_config.yml sed -i "s/output_prefix_placeholder/sample-${vcf_path1.simpleName}/g" new_auto_config.yml @@ -176,22 +172,18 @@ process exomiser { sed -i "s/pathogenicity_sources_placeholder/${params.pathogenicity_sources}/g" new_auto_config.yml sed -i "s/ped_placeholder/${ped_file}/g" new_auto_config.yml sed -i "s/proband_placeholder/\$proband_id1/g" new_auto_config.yml - # Printing (ls, see files; cat, injected values validation) ${params.debug_script} cat new_auto_config.yml - # Run Exomiser ${exomiser} \ --analysis new_auto_config.yml \ --spring.config.location=$application_properties \ --exomiser.data-directory='.' 
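    # Illustrative only: with a families-file row like the ERR3239334 trio used later for testing, and assuming
    # the bundled auto_config template wires these placeholders to Exomiser's usual analysis keys, the sed
    # swaps above end up injecting roughly:
    #   hpo_ids_placeholder  -> HP:0001156          (read from the *-HPO.txt file)
    #   vcf_placeholder      -> the staged family VCF
    #   ped_placeholder      -> ERR3239334.ped
    #   proband_placeholder  -> ERR3239334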
- # Create the slot for CloudOS html report preview mkdir MultiQC cp *.html MultiQC/multiqc_report.html sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html - """ }else{ """ From 8a9ee12da4f1304ef8033d9bb0314e37473f9020 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Sun, 11 Dec 2022 17:27:18 +0000 Subject: [PATCH 37/98] proband id channel --- main.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 76ae32e..a55da22 100644 --- a/main.nf +++ b/main.nf @@ -108,6 +108,7 @@ if(!params.ped_file & !params.hpo_file){ output: file "${proband_id1}-HPO.txt" into hpo_ch file "${proband_id1}.ped" into ped_ch + file "${proband_id1}_ID.txt" into id_ch file "${vcf_path1}" into vcf_file_ch script: """ @@ -132,6 +133,7 @@ process exomiser { file vcf_path1 from vcf_file_ch file hpo_file from hpo_ch file ped_file from ped_ch + file id_file from id _ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -156,7 +158,7 @@ process exomiser { echo "Contents in PED" # link the staged/downloaded data to predefined path ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - proband_id1=`basename ${ped_file}` + proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` echo \$proband_id1 # Modify auto_config.to pass the params From 66d6e98a9c254e895280cd020745c23f975eb2ab Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Sun, 11 Dec 2022 22:03:17 +0000 Subject: [PATCH 38/98] fix if ch --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index a55da22..fdd4388 100644 --- a/main.nf +++ b/main.nf @@ -133,7 +133,7 @@ process exomiser { file vcf_path1 from vcf_file_ch file hpo_file from hpo_ch file ped_file from ped_ch - file id_file from id _ch + file id_file from id_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) From e285a8b49875c0e720bb0f3eefa1fa995fd14f13 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 12 Dec 2022 15:44:09 +0000 Subject: [PATCH 39/98] add index --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index fdd4388..3f12d1c 100644 --- a/main.nf +++ b/main.nf @@ -110,6 +110,7 @@ if(!params.ped_file & !params.hpo_file){ file "${proband_id1}.ped" into ped_ch file "${proband_id1}_ID.txt" into id_ch file "${vcf_path1}" into vcf_file_ch + file "${vcf_index_path1}" into vcf_index_ch script: """ python3 $ped_parser_py --input_family $family_file @@ -131,6 +132,7 @@ process exomiser { input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file vcf_path1 from vcf_file_ch + file vcf_index1 from vcf_index_ch file hpo_file from hpo_ch file ped_file from ped_ch file id_file from id_ch From 222f065e36f40535e5d1d6eddc93b514a49174ff Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 12 Dec 2022 16:09:29 +0000 Subject: [PATCH 40/98] add retry --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index 3f12d1c..8e62d98 100644 --- a/main.nf +++ b/main.nf @@ -101,6 +101,8 @@ if(!params.ped_file & !params.hpo_file){ container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' 
stageInMode 'copy' + errorStrategy 'retry' + maxErrors 5 input: set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input file family_file from ch_vcf.collect() From 7557f5fdd649f127892e2ad61060e1c7a7d43518 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 12 Dec 2022 17:29:10 +0000 Subject: [PATCH 41/98] copy file --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 8e62d98..608337d 100644 --- a/main.nf +++ b/main.nf @@ -115,6 +115,7 @@ if(!params.ped_file & !params.hpo_file){ file "${vcf_index_path1}" into vcf_index_ch script: """ + cp ${vcf_path1} input.vcf.gz python3 $ped_parser_py --input_family $family_file """ } From 8b9bf8e8dc83f41f3925602b84b9fcf00031e720 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 12 Dec 2022 17:30:49 +0000 Subject: [PATCH 42/98] copy file --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 608337d..a6b0133 100644 --- a/main.nf +++ b/main.nf @@ -111,7 +111,7 @@ if(!params.ped_file & !params.hpo_file){ file "${proband_id1}-HPO.txt" into hpo_ch file "${proband_id1}.ped" into ped_ch file "${proband_id1}_ID.txt" into id_ch - file "${vcf_path1}" into vcf_file_ch + file "input.vcf.gz" into vcf_file_ch file "${vcf_index_path1}" into vcf_index_ch script: """ From 05b2eab6bbabc7469c3180979f517f0a2a2dc13e Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 12 Dec 2022 17:33:18 +0000 Subject: [PATCH 43/98] gunzip file --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index a6b0133..4d010ec 100644 --- a/main.nf +++ b/main.nf @@ -111,11 +111,11 @@ if(!params.ped_file & !params.hpo_file){ file "${proband_id1}-HPO.txt" into hpo_ch file "${proband_id1}.ped" into ped_ch file "${proband_id1}_ID.txt" into id_ch - file "input.vcf.gz" into vcf_file_ch + file "input.vcf" into vcf_file_ch file "${vcf_index_path1}" into vcf_index_ch script: """ - cp ${vcf_path1} input.vcf.gz + gunzip -c ${vcf_path1} > input.vcf python3 $ped_parser_py --input_family $family_file """ } From 275934d5270015de2033e431f3708499db67e54e Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 13 Dec 2022 11:13:09 +0000 Subject: [PATCH 44/98] tuple input output --- main.nf | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/main.nf b/main.nf index 4d010ec..519fbf9 100644 --- a/main.nf +++ b/main.nf @@ -100,7 +100,6 @@ if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' - stageInMode 'copy' errorStrategy 'retry' maxErrors 5 input: @@ -108,14 +107,9 @@ if(!params.ped_file & !params.hpo_file){ file family_file from ch_vcf.collect() file(ped_parser_py) from ch_ped_parser_py.collect() output: - file "${proband_id1}-HPO.txt" into hpo_ch - file "${proband_id1}.ped" into ped_ch - file "${proband_id1}_ID.txt" into id_ch - file "input.vcf" into vcf_file_ch - file "${vcf_index_path1}" into vcf_index_ch + tuple file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt"), file("${vcf_path1}"), file("${vcf_index_path1}") into exomiser_ch script: """ - gunzip -c ${vcf_path1} > input.vcf python3 $ped_parser_py --input_family $family_file """ } @@ -134,18 +128,14 @@ process exomiser { maxForks 50 input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - file 
vcf_path1 from vcf_file_ch - file vcf_index1 from vcf_index_ch - file hpo_file from hpo_ch - file ped_file from ped_ch - file id_file from id_ch + tuple file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused - file(application_properties) from ch_application_properties - file(auto_config_yml) from ch_auto_config_yml - file(exomiser_data) from ch_exomiser_data + each file(application_properties) from ch_application_properties + each file(auto_config_yml) from ch_auto_config_yml + each file(exomiser_data) from ch_exomiser_data each prioritiser from selected_prioritisers output: From 330319adb15c938b1f6b4818b7dd33587b2b975d Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 13 Dec 2022 12:26:22 +0000 Subject: [PATCH 45/98] debug local --- main.nf | 16 ++++++++-------- nextflow.config | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 519fbf9..9ebaf4f 100644 --- a/main.nf +++ b/main.nf @@ -152,7 +152,7 @@ process exomiser { ls -la echo "Contents in PED" # link the staged/downloaded data to predefined path - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle +# ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` echo \$proband_id1 @@ -173,14 +173,14 @@ process exomiser { ${params.debug_script} cat new_auto_config.yml # Run Exomiser - ${exomiser} \ - --analysis new_auto_config.yml \ - --spring.config.location=$application_properties \ - --exomiser.data-directory='.' + #${exomiser} \ + #--analysis new_auto_config.yml \ + #--spring.config.location=$application_properties \ + #--exomiser.data-directory='.' 
# Create the slot for CloudOS html report preview - mkdir MultiQC - cp *.html MultiQC/multiqc_report.html - sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html + #mkdir MultiQC + #cp *.html MultiQC/multiqc_report.html + #sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html """ }else{ """ diff --git a/nextflow.config b/nextflow.config index 4ee7f7c..95941f0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,7 @@ params { filename_hpo = '' sample_name = null config = 'conf/standard.config' - ped_parser_py = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/ped_module_man.py' + ped_parser_py = '/Users/ryancardenas/Documents/fix_pipelines/exomiser-pipeline-nf/bin/ped_module.py' exomiser_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/hg38' exomiser_phenotype_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/2102_phenotype' cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs' @@ -62,7 +62,7 @@ process { errorStrategy = params.errorStrategy withName: exomiser { container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}" - containerOptions = "--volume ${params.exomiser_data_directory}:/data/exomiser-data-bundle/" +// containerOptions = "--volume ${params.exomiser_data_directory}:/data/exomiser-data-bundle/" memory = 6.GB cpus = 4 } From 049b25c8570647220ef37a2713c0f19e7999fbb3 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Tue, 13 Dec 2022 17:52:08 +0000 Subject: [PATCH 46/98] fork1 --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9ebaf4f..c2215d4 100644 --- a/main.nf +++ b/main.nf @@ -125,7 +125,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' - maxForks 50 + maxForks 1 input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input tuple file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch From fcd92da60a2f05b16616152ffac64b5b6c3434e1 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Tue, 13 Dec 2022 18:02:45 +0000 Subject: [PATCH 47/98] rm ln hash --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index c2215d4..3d973f6 100644 --- a/main.nf +++ b/main.nf @@ -152,7 +152,7 @@ process exomiser { ls -la echo "Contents in PED" # link the staged/downloaded data to predefined path -# ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` echo \$proband_id1 From 4accd6e8750a303623591d05313c4a0e9d275cf1 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Wed, 14 Dec 2022 09:32:10 +0000 Subject: [PATCH 48/98] fork 5 delay submit --- main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 3d973f6..5acc180 100644 --- a/main.nf +++ b/main.nf @@ -125,7 +125,8 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' - maxForks 1 + maxForks 2 + submitRateLimit = '1 / 5 m' input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input tuple 
file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch From 02f3a8939bc0977d62f8a8b9f3a7ef509337e8c7 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Tue, 10 Jan 2023 17:16:19 +0000 Subject: [PATCH 49/98] uncomment out exomiser commands - fork =1 --- main.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 5acc180..497ffac 100644 --- a/main.nf +++ b/main.nf @@ -125,7 +125,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' - maxForks 2 + maxForks 1 submitRateLimit = '1 / 5 m' input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input @@ -174,14 +174,14 @@ process exomiser { ${params.debug_script} cat new_auto_config.yml # Run Exomiser - #${exomiser} \ - #--analysis new_auto_config.yml \ - #--spring.config.location=$application_properties \ - #--exomiser.data-directory='.' + ${exomiser} \ + --analysis new_auto_config.yml \ + --spring.config.location=$application_properties \ + --exomiser.data-directory='.' # Create the slot for CloudOS html report preview - #mkdir MultiQC - #cp *.html MultiQC/multiqc_report.html - #sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html + mkdir MultiQC + cp *.html MultiQC/multiqc_report.html + sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html """ }else{ """ From ef6f2ad1366e9a2d2fae7ac6329dd9a92bb6d244 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Wed, 11 Jan 2023 10:28:17 +0000 Subject: [PATCH 50/98] add container options --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 95941f0..27c3655 100644 --- a/nextflow.config +++ b/nextflow.config @@ -62,7 +62,7 @@ process { errorStrategy = params.errorStrategy withName: exomiser { container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}" -// containerOptions = "--volume ${params.exomiser_data_directory}:/data/exomiser-data-bundle/" + containerOptions = "--volume ${params.exomiser_data_directory}:/data/exomiser-data-bundle/" memory = 6.GB cpus = 4 } From 9e13023a7f82bf9b45b935828dcbc7bcedaf0819 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Wed, 11 Jan 2023 11:34:48 +0000 Subject: [PATCH 51/98] change rate --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 497ffac..59d688a 100644 --- a/main.nf +++ b/main.nf @@ -126,7 +126,7 @@ process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' maxForks 1 - submitRateLimit = '1 / 5 m' + submitRateLimit = '1 / 1 m' input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input tuple file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch From e318df61a9ed3800fc32f46e3e22c750cd9835ba Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Wed, 11 Jan 2023 16:24:10 +0000 Subject: [PATCH 52/98] remove nans from ped --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 59d688a..f0a4ea4 100644 --- a/main.nf +++ b/main.nf @@ -111,6 +111,7 @@ if(!params.ped_file & !params.hpo_file){ script: 
""" python3 $ped_parser_py --input_family $family_file + sed -i 's/nan/0/g' ${proband_id1}.ped """ } } From afd26726adda6fef53bb4f42129825347e0afe2c Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Thu, 12 Jan 2023 09:51:56 +0000 Subject: [PATCH 53/98] rate limit 5m --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index f0a4ea4..9d82e35 100644 --- a/main.nf +++ b/main.nf @@ -127,7 +127,7 @@ process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' maxForks 1 - submitRateLimit = '1 / 1 m' + submitRateLimit = '1 / 5 m' input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input tuple file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch From ecdcb3c38225c4844c33c106522ea5afa4bd0874 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Thu, 12 Jan 2023 10:48:14 +0000 Subject: [PATCH 54/98] retries --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index 9d82e35..feb8032 100644 --- a/main.nf +++ b/main.nf @@ -128,6 +128,8 @@ process exomiser { publishDir "${params.outdir}/${proband_id1}", mode: 'copy' maxForks 1 submitRateLimit = '1 / 5 m' + errorStrategy 'retry' + maxRetries 3 input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input tuple file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch From b70b9043a5520e5aa614be2ee368b2d8b64659bf Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Thu, 12 Jan 2023 12:12:15 +0000 Subject: [PATCH 55/98] rm symbolic link --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index feb8032..a84262f 100644 --- a/main.nf +++ b/main.nf @@ -184,6 +184,7 @@ process exomiser { # Create the slot for CloudOS html report preview mkdir MultiQC cp *.html MultiQC/multiqc_report.html + rm -fr /data/exomiser-data-bundle sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html """ }else{ From 89707a02165b67f857015e7f5775f2d3deb1b133 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Thu, 12 Jan 2023 13:39:45 +0000 Subject: [PATCH 56/98] force symbolic link --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index a84262f..5c58cc3 100644 --- a/main.nf +++ b/main.nf @@ -156,7 +156,7 @@ process exomiser { ls -la echo "Contents in PED" # link the staged/downloaded data to predefined path - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` echo \$proband_id1 @@ -184,7 +184,7 @@ process exomiser { # Create the slot for CloudOS html report preview mkdir MultiQC cp *.html MultiQC/multiqc_report.html - rm -fr /data/exomiser-data-bundle + sed -i "s/Anonymous/\$proband_id1/" MultiQC/multiqc_report.html """ }else{ From e557a63be66bc2b77dead3b8fd0637eaf859f89d Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Thu, 12 Jan 2023 17:30:21 +0000 Subject: [PATCH 57/98] size vcf symbolic --- main.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/main.nf b/main.nf index 5c58cc3..da8b981 100644 --- a/main.nf +++ b/main.nf @@ 
-157,6 +157,9 @@ process exomiser { echo "Contents in PED" # link the staged/downloaded data to predefined path ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + stat -Lc $vcf_path1 + stat -Lc $vcf_path1 > out.txt + cat out.txt proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` echo \$proband_id1 From 353f1fa657c3e25789e3acf1d151be4bd83f83b8 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas <57348405+R-Cardenas@users.noreply.github.com> Date: Thu, 12 Jan 2023 20:10:49 +0000 Subject: [PATCH 58/98] stat symbolic link --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index da8b981..9ed44c5 100644 --- a/main.nf +++ b/main.nf @@ -157,8 +157,8 @@ process exomiser { echo "Contents in PED" # link the staged/downloaded data to predefined path ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - stat -Lc $vcf_path1 - stat -Lc $vcf_path1 > out.txt + stat -L $vcf_path1 + stat -L $vcf_path1 > out.txt cat out.txt proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` From 76fccdb4fe1577503b2cf547effad542ec45b96f Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 16 Jan 2023 11:08:18 +0000 Subject: [PATCH 59/98] join channels --- main.nf | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 9ed44c5..3b9e0e6 100644 --- a/main.nf +++ b/main.nf @@ -58,8 +58,10 @@ Channel .fromPath(params.families_file) .ifEmpty { exit 1, "Cannot find input file : ${params.families_file}" } .splitCsv(header:true, sep:'\t', strip: true) - .map {row -> [ row.run_id, row.proband_id, row.hpo, file(row.vcf_path), file(row.vcf_index_path), row.proband_sex, row.mother_id, row.father_id ] } - .set {ch_input} + .map {row -> [ row.proband_id, row.hpo, file(row.vcf_path), file(row.vcf_index_path)] } + .set {ch_input1} + +ch_input1.into { ch_input; ch_input2 } // Conditional creation of channels, custom if provided else default from bin/ projectDir = workflow.projectDir @@ -103,11 +105,11 @@ if(!params.ped_file & !params.hpo_file){ errorStrategy 'retry' maxErrors 5 input: - set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input + set proband_id1, hpo, file(vcf_path1), file(vcf_index_path1) from ch_input file family_file from ch_vcf.collect() file(ped_parser_py) from ch_ped_parser_py.collect() output: - tuple file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt"), file("${vcf_path1}"), file("${vcf_index_path1}") into exomiser_ch + tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into join_ch script: """ python3 $ped_parser_py --input_family $family_file @@ -120,6 +122,13 @@ if(!params.ped_file & !params.hpo_file){ Run containarised Exomiser ---------------------------------------------------*/ + +combined_channel = ch_input2.join(join_ch, by: 0).view() + +/*-------------------------------------------------- + Run containarised Exomiser +---------------------------------------------------*/ + ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") @@ -132,7 +141,8 @@ process exomiser { maxRetries 3 input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - tuple file(hpo_file),file(ped_file),file(id_file),file(vcf_path1),file(vcf_index1) from exomiser_ch + //set val(proband_id1), hpo, file(vcf_path1), file(vcf_index_path1) + set file(vcf_path1),file(vcf_index1), 
val(proband_id1), file(hpo_file),file(ped_file),file(id_file) from combined_channel //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) From 85617becc3610e5638c87a57b1474963732e3cac Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 16 Jan 2023 12:16:18 +0000 Subject: [PATCH 60/98] realign params --- main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 3b9e0e6..400a55d 100644 --- a/main.nf +++ b/main.nf @@ -58,7 +58,7 @@ Channel .fromPath(params.families_file) .ifEmpty { exit 1, "Cannot find input file : ${params.families_file}" } .splitCsv(header:true, sep:'\t', strip: true) - .map {row -> [ row.proband_id, row.hpo, file(row.vcf_path), file(row.vcf_index_path)] } + .map {row -> [ row.proband_id, file(row.vcf_path), file(row.vcf_index_path)] } .set {ch_input1} ch_input1.into { ch_input; ch_input2 } @@ -132,6 +132,7 @@ combined_channel = ch_input2.join(join_ch, by: 0).view() ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") + process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' @@ -142,7 +143,7 @@ process exomiser { input: //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input //set val(proband_id1), hpo, file(vcf_path1), file(vcf_index_path1) - set file(vcf_path1),file(vcf_index1), val(proband_id1), file(hpo_file),file(ped_file),file(id_file) from combined_channel + set val(proband_id1),file(vcf_path1),file(vcf_index1), file(hpo_file), file(ped_file),file(id_file) from combined_channel //The following is expected when CADD is omitted, // WARN: Input tuple does not match input set cardinality declared by process `exomiser` // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) @@ -166,7 +167,7 @@ process exomiser { ls -la echo "Contents in PED" # link the staged/downloaded data to predefined path - ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle stat -L $vcf_path1 stat -L $vcf_path1 > out.txt cat out.txt From a772ebca5d18acf87117a98292ab708090f886b5 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 16 Jan 2023 12:17:07 +0000 Subject: [PATCH 61/98] ln svf --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 400a55d..caa4abc 100644 --- a/main.nf +++ b/main.nf @@ -167,7 +167,7 @@ process exomiser { ls -la echo "Contents in PED" # link the staged/downloaded data to predefined path - ln -s "\$PWD/$exomiser_data/" /data/exomiser-data-bundle + ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle stat -L $vcf_path1 stat -L $vcf_path1 > out.txt cat out.txt From 103bd00cb554b1db4e66547c1c8463620a6169d1 Mon Sep 17 00:00:00 2001 From: Ryan Cardenas Date: Mon, 16 Jan 2023 16:15:26 +0000 Subject: [PATCH 62/98] update yaml conf --- bin/auto_config.yml | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/bin/auto_config.yml b/bin/auto_config.yml index 60eef5f..9e72def 100644 --- a/bin/auto_config.yml +++ b/bin/auto_config.yml @@ -80,20 +80,35 @@ analysis: #intervalFilter: {intervals: ['chr10:123256200-123256300', 'chr10:123256290-123256350']}, # or using a BED file - NOTE this should be 0-based, Exomiser otherwise uses 1-based coordinates in line 
with VCF #intervalFilter: {bed: /full/path/to/bed_file.bed}, - #failedVariantFilter: {}, + failedVariantFilter: {}, #genePanelFilter: {geneSymbols: ['FGFR1','FGFR2']}, ##################################################################################### - hiPhivePrioritiser: {}, + #hiPhivePrioritiser: {}, #running the prioritiser followed by a priorityScoreFilter will remove genes #which are least likely to contribute to the phenotype defined in hpoIds, this will #dramatically reduce the time and memory required to analyse a genome. # 0.501 is a good compromise to select good phenotype matches and the best protein-protein interactions hits from hiPhive - priorityScoreFilter: {priorityType: HIPHIVE_PRIORITY, minPriorityScore: 0.501}, + #priorityScoreFilter: {priorityType: HIPHIVE_PRIORITY, minPriorityScore: 0.501}, ###################################################################################### #variantEffectFilter: {remove: [SYNONYMOUS_VARIANT]}, #regulatoryFeatureFilter removes all non-regulatory non-coding variants over 20Kb from a known gene. - regulatoryFeatureFilter: {}, + #regulatoryFeatureFilter: {}, #knownVariantFilter: {}, #removes variants represented in the database + variantEffectFilter: { + remove: [ + FIVE_PRIME_UTR_EXON_VARIANT, + FIVE_PRIME_UTR_INTRON_VARIANT, + THREE_PRIME_UTR_EXON_VARIANT, + THREE_PRIME_UTR_INTRON_VARIANT, + NON_CODING_TRANSCRIPT_EXON_VARIANT, + NON_CODING_TRANSCRIPT_INTRON_VARIANT, + CODING_TRANSCRIPT_INTRON_VARIANT, + UPSTREAM_GENE_VARIANT, + DOWNSTREAM_GENE_VARIANT, + INTERGENIC_VARIANT, + REGULATORY_REGION_VARIANT + ] + }, frequencyFilter: {maxFrequency: 2.0}, pathogenicityFilter: {keepNonPathogenic: keep_non_pathogenic_placeholder}, #inheritanceFilter and omimPrioritiser should always run AFTER all other filters have completed @@ -103,12 +118,12 @@ analysis: omimPrioritiser: {}, #Other prioritisers: Only combine omimPrioritiser with one of these. #Don't include any if you only want to filter the variants. 
- #hiPhivePrioritiser: {}, + hiPhivePrioritiser: {} # or run hiPhive in benchmarking mode: #hiPhivePrioritiser: {diseaseId: 'OMIM:101600', candidateGeneSymbol: FGFR2, runParams: 'human,mouse,fish,ppi'}, #phenixPrioritiser: {} #exomeWalkerPrioritiser: {seedGeneIds: [11111, 22222, 33333]} - prioritiser_placeholder : {} + #prioritiser_placeholder : {} ] outputOptions: outputContributingVariantsOnly: false From e475cf8b5ecdba1f2fbc3ed3fa12266934656787 Mon Sep 17 00:00:00 2001 From: Sangram K Sahu Date: Wed, 31 May 2023 04:02:12 +0530 Subject: [PATCH 63/98] Update main.nf --- main.nf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/main.nf b/main.nf index caa4abc..026fec8 100644 --- a/main.nf +++ b/main.nf @@ -114,6 +114,16 @@ if(!params.ped_file & !params.hpo_file){ """ python3 $ped_parser_py --input_family $family_file sed -i 's/nan/0/g' ${proband_id1}.ped + echo "DEBUG: Check the file before - Content" + cat ${proband_id1}.ped + echo "DEBUG: Check the file before - Number of line" + wc -l ${proband_id1}.ped + # remove the last line when nan present + head -n -1 ${proband_id1}.ped > temp.txt ; mv temp.txt ${proband_id1}.ped + echo "DEBUG: Check the file after - Content" + cat ${proband_id1}.ped + echo "DEBUG: Check the file after - Number of line" + wc -l ${proband_id1}.ped """ } } From 22ea0c64c9e106715498c38ebdda8be00a1ad2fd Mon Sep 17 00:00:00 2001 From: Sangram K Sahu Date: Mon, 17 Jul 2023 14:33:38 +0530 Subject: [PATCH 64/98] batch run fix Removes the error `ln: failed to create symbolic link '/data/exomiser-data-bundle': No such file or directory` when run with batch --- main.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.nf b/main.nf index 026fec8..61ebcb8 100644 --- a/main.nf +++ b/main.nf @@ -177,6 +177,8 @@ process exomiser { ls -la echo "Contents in PED" # link the staged/downloaded data to predefined path + mkdir -p /data + mkdir -p /data/exomiser-data-bundle ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle stat -L $vcf_path1 stat -L $vcf_path1 > out.txt From 7df88542e7dc11b778ec47b741e387e16b3d79e1 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Thu, 26 Oct 2023 16:19:55 +0200 Subject: [PATCH 65/98] added the ped_module script in the bin and removed the hardcoded path --- bin/ped_module.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++ nextflow.config | 1 - 2 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 bin/ped_module.py diff --git a/bin/ped_module.py b/bin/ped_module.py new file mode 100644 index 0000000..5d3de65 --- /dev/null +++ b/bin/ped_module.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# RPC 291122 +# Aim take family file and convert to passed + +from ped_parser import Individual, Family +import pandas as pd +import os +import argparse + +# local test +# os.chdir("/Users/ryancardenas/Documents/exomiser-pipeline-nf/bin") +# input_dat = pd.read_csv("familytest.tsv", sep="\t") + +#build arg parser here +parser = argparse.ArgumentParser(description='Create PED file from family file - Exomiser') +parser.add_argument('--input_family', nargs=1, required=True, help='Enter the path for the family TSV file') +args = parser.parse_args() + + +#bamfile set +input = str(args.input_family[0]) +input_dat = pd.read_csv(input, sep="\t") + +# --------------- create function for ped_parser +def PED_function(run_ID, proband_ID, vcf_path, vcf_index, proband_sex, mother_ID, father_ID): + # Extract + output_name = (f"{proband_ID}_tmp.ped") + outfile = open(output_name,'a') + my_individuals = [] + print(f"{output_name}") + # 
extract filename without extention or path + file_name = os.path.basename(f"{vcf_path}") + file_base = os.path.splitext(file_name)[0] + + proband_sex = proband_sex.lower() + + if proband_sex == 'm' or proband_sex == 'male': + proband_sex2 = "1" + elif proband_sex == 'f' or proband_sex == 'female': + proband_sex2 = "2" + elif proband_sex == 'other': + proband_sex2 = "0" + else: + proband_sex2 = proband_sex + + #proband_ID + my_individuals.append(Individual( + f'{proband_ID}', + family=f'{run_ID}', + mother=f'{mother_ID}', + father=f'{father_ID}', + sex=f'{proband_sex2}', + phenotype='2' + )) + #mother + my_individuals.append(Individual( + f'{mother_ID}', + family=f'{run_ID}', + mother='0', + father='0', + sex='2', + phenotype='1' + )) + #father + my_individuals.append(Individual( + f'{father_ID}', + family=f'{run_ID}', + mother='0', + father='0', + sex='1', + phenotype='1' + )) + my_family = Family(family_id=f'{run_ID}') + for individual in my_individuals: + my_family.add_individual(individual) + + # save PED files + my_family.to_ped(outfile) + + + +for index, row in input_dat.iterrows(): + + # define variables + run_id1 = row["run_id"] + proband_id1 = row["proband_id"] + hpo1 = row["hpo"] + mother_id1 = row["mother_id"] + father_id1 = row["father_id"] + vcf_path1 = row["vcf_path"] + vcf_index_path1 = row["vcf_index_path"] + proband_sex1 = row["proband_sex"] + + PED_function(run_id1,proband_id1, vcf_path1, vcf_index_path1, proband_sex1, mother_id1, father_id1) + + # create HPO file here. + os.system(f"rm -fr {proband_id1}-HPO.txt" ) + os.system(f"echo '{hpo1}' > {proband_id1}-HPO.txt") + + #create proband_id into text_file + os.system(f"echo '{proband_id1}' > {proband_id1}_ID.txt") + + # filter PEDs to only have proband_id + # Strangely despite loops the file appends each family is added + cmd_strip = f"grep -A 2 '{proband_id1}' {proband_id1}_tmp.ped > {proband_id1}.ped" + os.system(cmd_strip) diff --git a/nextflow.config b/nextflow.config index 27c3655..dd54333 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,7 +17,6 @@ params { filename_hpo = '' sample_name = null config = 'conf/standard.config' - ped_parser_py = '/Users/ryancardenas/Documents/fix_pipelines/exomiser-pipeline-nf/bin/ped_module.py' exomiser_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/hg38' exomiser_phenotype_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/2102_phenotype' cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs' From d4cf4ec3c385c7130b5f0c7ad7dac35883f46dd0 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Thu, 26 Oct 2023 17:54:36 +0200 Subject: [PATCH 66/98] added testing_family_file --- main.nf | 14 ++++++-------- testdata/fam_file.txt | 2 ++ 2 files changed, 8 insertions(+), 8 deletions(-) create mode 100644 testdata/fam_file.txt diff --git a/main.nf b/main.nf index 61ebcb8..360fc82 100644 --- a/main.nf +++ b/main.nf @@ -44,7 +44,7 @@ log.info "" // ---------------------------------------------------*/ if(params.families_file) { - Channel + Channel .fromPath( "${params.families_file}") .ifEmpty { exit 1, "Family file: ${params.families_file} not found"} .set {ch_vcf} @@ -88,7 +88,6 @@ selected_analysis_mode = params.analysis_mode.split(',').collect{it.trim()} if (!checkParameterList(selected_analysis_mode, analysisModesList)) exit 1, "Unknown analysis mode, the available options are:\n$analysisModesList" ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") -ch_ped_parser_py = 
Channel.fromPath("${params.ped_parser_py}") /*-------------------------------------------------- Create PED and HPO file from design ---------------------------------------------------*/ @@ -107,22 +106,21 @@ if(!params.ped_file & !params.hpo_file){ input: set proband_id1, hpo, file(vcf_path1), file(vcf_index_path1) from ch_input file family_file from ch_vcf.collect() - file(ped_parser_py) from ch_ped_parser_py.collect() output: tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into join_ch script: """ - python3 $ped_parser_py --input_family $family_file + ped_module.py --input_family $family_file sed -i 's/nan/0/g' ${proband_id1}.ped - echo "DEBUG: Check the file before - Content" + #echo "DEBUG: Check the file before - Content" cat ${proband_id1}.ped - echo "DEBUG: Check the file before - Number of line" + #echo "DEBUG: Check the file before - Number of line" wc -l ${proband_id1}.ped # remove the last line when nan present head -n -1 ${proband_id1}.ped > temp.txt ; mv temp.txt ${proband_id1}.ped - echo "DEBUG: Check the file after - Content" + #echo "DEBUG: Check the file after - Content" cat ${proband_id1}.ped - echo "DEBUG: Check the file after - Number of line" + #echo "DEBUG: Check the file after - Number of line" wc -l ${proband_id1}.ped """ } diff --git a/testdata/fam_file.txt b/testdata/fam_file.txt new file mode 100644 index 0000000..aeaadfc --- /dev/null +++ b/testdata/fam_file.txt @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156 /Users/lmansouri/Lifebit/exomiser-pipeline-nf/testdata/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz /Users/lmansouri/Lifebit/exomiser-pipeline-nf/testdata/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz M ERR3989342 ERR3989341 \ No newline at end of file From b8d585e602488685c6e5748ed85b825285dde9c5 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 09:27:12 +0200 Subject: [PATCH 67/98] changed paths in the family file --- testdata/fam_file.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/testdata/fam_file.txt b/testdata/fam_file.txt index aeaadfc..9408f62 100644 --- a/testdata/fam_file.txt +++ b/testdata/fam_file.txt @@ -1,2 +1,2 @@ run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id -EX001 ERR3239334 HP:0001156 /Users/lmansouri/Lifebit/exomiser-pipeline-nf/testdata/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz /Users/lmansouri/Lifebit/exomiser-pipeline-nf/testdata/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz M ERR3989342 ERR3989341 \ No newline at end of file +EX001 ERR3239334 HP:0001156 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file From b87088c2dcce7be328adf8860a7ef131c1700278 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 11:55:59 +0200 Subject: [PATCH 68/98] changed the sep of the family file --- testdata/fam_file.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/testdata/fam_file.txt b/testdata/fam_file.txt index 9408f62..e83485b 100644 --- a/testdata/fam_file.txt +++ b/testdata/fam_file.txt @@ -1,2 +1,2 @@ -run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id -EX001 ERR3239334 HP:0001156 
s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file From 199af4afc10e767220fb6b15c82ba5368c65d71d Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 14:41:27 +0200 Subject: [PATCH 69/98] removed family file as it is in S3 --- testdata/fam_file.txt | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 testdata/fam_file.txt diff --git a/testdata/fam_file.txt b/testdata/fam_file.txt deleted file mode 100644 index e83485b..0000000 --- a/testdata/fam_file.txt +++ /dev/null @@ -1,2 +0,0 @@ -run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id -EX001 ERR3239334 HP:0001156 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file From 48953702304df402072404bb98add046a3a9cb59 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 12:34:37 +0000 Subject: [PATCH 70/98] fixed pipeline bugs in testing --- conf/{test.config => family_test.config} | 4 ++-- conf/single_vcf_test.config | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) rename conf/{test.config => family_test.config} (63%) create mode 100644 conf/single_vcf_test.config diff --git a/conf/test.config b/conf/family_test.config similarity index 63% rename from conf/test.config rename to conf/family_test.config index 7babfcf..b8a0e07 100644 --- a/conf/test.config +++ b/conf/family_test.config @@ -1,8 +1,8 @@ params { - input = 'https://lifebit-featured-datasets.s3-eu-west-1.amazonaws.com/pipelines/parabricks/germline/output/HG001-NA12878-pFDA_S1_L001_20k.vcf' - sample_name = 'HG001_NA12878' + families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file.tsv' hpo_terms_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/hpo_terms_file.txt' prioritisers = 'hiPhivePrioritiser' exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' } diff --git a/conf/single_vcf_test.config b/conf/single_vcf_test.config new file mode 100644 index 0000000..491e0d7 --- /dev/null +++ b/conf/single_vcf_test.config @@ -0,0 +1,8 @@ +params { + families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/single_vcf.tsv' + hpo_terms_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/hpo_terms_file.txt' + prioritisers = 'hiPhivePrioritiser' + exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' +} From 8544610e99943256732c543b734b7d19b6fe6b86 
Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 12:37:18 +0000 Subject: [PATCH 71/98] fixed bugs in testing --- .attach_pid22597 | 0 bin/add_exomiser_fields_to_genotiers.js | 0 bin/genotiers.js | 0 bin/get_hpo_terms_from_barcode.js | 0 bin/ped_module.py | 0 bin/ped_module_man.py | 0 main.nf | 38 +++++++++---------------- nextflow.config | 8 ++++-- testdata/single_vcf.tsv | 2 ++ 9 files changed, 21 insertions(+), 27 deletions(-) create mode 100644 .attach_pid22597 mode change 100644 => 100755 bin/add_exomiser_fields_to_genotiers.js mode change 100644 => 100755 bin/genotiers.js mode change 100644 => 100755 bin/get_hpo_terms_from_barcode.js mode change 100644 => 100755 bin/ped_module.py mode change 100644 => 100755 bin/ped_module_man.py create mode 100644 testdata/single_vcf.tsv diff --git a/.attach_pid22597 b/.attach_pid22597 new file mode 100644 index 0000000..e69de29 diff --git a/bin/add_exomiser_fields_to_genotiers.js b/bin/add_exomiser_fields_to_genotiers.js old mode 100644 new mode 100755 diff --git a/bin/genotiers.js b/bin/genotiers.js old mode 100644 new mode 100755 diff --git a/bin/get_hpo_terms_from_barcode.js b/bin/get_hpo_terms_from_barcode.js old mode 100644 new mode 100755 diff --git a/bin/ped_module.py b/bin/ped_module.py old mode 100644 new mode 100755 diff --git a/bin/ped_module_man.py b/bin/ped_module_man.py old mode 100644 new mode 100755 diff --git a/main.nf b/main.nf index 360fc82..d069ab0 100644 --- a/main.nf +++ b/main.nf @@ -21,6 +21,7 @@ log.info "-${c_teal}config:${c_reset}- ${params.config}" log.info "-${c_teal}filename_design_file:${c_reset}- ${params.families_file}" if(params.hpo_file) log.info "-${c_teal}filename_hpo:${c_reset}- ${params.filename_hpo}" if(params.ped_file) log.info "-${c_teal}filename_ped:${c_reset}- ${params.ped_file}" +if(params.families_file) log.info "-${c_teal}families_file:${c_reset}- ${params.families_file}" log.info "-${c_teal}analysis_mode:${c_reset}- ${params.analysis_mode}" log.info "-${c_teal}exomiser_data:${c_reset}- ${params.exomiser_data}" log.info "-${c_teal}exomiser_phenotype_data:${c_reset}- ${params.exomiser_phenotype_data}" @@ -49,7 +50,7 @@ if(params.families_file) { .ifEmpty { exit 1, "Family file: ${params.families_file} not found"} .set {ch_vcf} } else { - exit 1, "please specify Family file with --family_file parameter" + exit 1, "please specify Family file with --families_file parameter" } @@ -104,24 +105,17 @@ if(!params.ped_file & !params.hpo_file){ errorStrategy 'retry' maxErrors 5 input: - set proband_id1, hpo, file(vcf_path1), file(vcf_index_path1) from ch_input + set proband_id1, file(vcf_path1), file(vcf_index_path1) from ch_input file family_file from ch_vcf.collect() output: tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into join_ch script: """ ped_module.py --input_family $family_file + #to change nan in 0s if there are any sed -i 's/nan/0/g' ${proband_id1}.ped - #echo "DEBUG: Check the file before - Content" - cat ${proband_id1}.ped - #echo "DEBUG: Check the file before - Number of line" - wc -l ${proband_id1}.ped - # remove the last line when nan present - head -n -1 ${proband_id1}.ped > temp.txt ; mv temp.txt ${proband_id1}.ped - #echo "DEBUG: Check the file after - Content" - cat ${proband_id1}.ped - #echo "DEBUG: Check the file after - Number of line" - wc -l ${proband_id1}.ped + #to remove the "parent" line if it's a single sample + sed -i "/0\t0\t0/d" ${proband_id1}.ped """ } } @@ -149,13 +143,7 @@ process exomiser { 
errorStrategy 'retry' maxRetries 3 input: - //set run_id, proband_id1, hpo, file(vcf_path1), file(vcf_index_path1), proband_sex, mother_id, father_id from ch_input - //set val(proband_id1), hpo, file(vcf_path1), file(vcf_index_path1) set val(proband_id1),file(vcf_path1),file(vcf_index1), file(hpo_file), file(ped_file),file(id_file) from combined_channel - //The following is expected when CADD is omitted, - // WARN: Input tuple does not match input set cardinality declared by process `exomiser` - // ch_all_exomiser_data contents can be 1 or 2 folders, (exomiser_data +/- cadd separately) - // this is fine, as when there is no second dir, a fake input.1 is generated that will be unused each file(application_properties) from ch_application_properties each file(auto_config_yml) from ch_auto_config_yml each file(exomiser_data) from ch_exomiser_data @@ -172,18 +160,18 @@ process exomiser { def exomiser_executable = "/exomiser/exomiser-cli-"+"${params.exomiser_version}"+".jar" def exomiser = "java -Xms2g -Xmx4g -jar "+"${exomiser_executable}" """ - ls -la - echo "Contents in PED" + #ls -la + #echo "Contents in PED" # link the staged/downloaded data to predefined path mkdir -p /data mkdir -p /data/exomiser-data-bundle ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle - stat -L $vcf_path1 + #stat -L $vcf_path1 stat -L $vcf_path1 > out.txt - cat out.txt + #cat out.txt proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` - echo \$proband_id1 + #echo \$proband_id1 # Modify auto_config.to pass the params cp ${auto_config_yml} new_auto_config.yml # Swap placeholders with user provided values @@ -198,8 +186,8 @@ process exomiser { sed -i "s/ped_placeholder/${ped_file}/g" new_auto_config.yml sed -i "s/proband_placeholder/\$proband_id1/g" new_auto_config.yml # Printing (ls, see files; cat, injected values validation) - ${params.debug_script} - cat new_auto_config.yml + #${params.debug_script} + #cat new_auto_config.yml # Run Exomiser ${exomiser} \ --analysis new_auto_config.yml \ diff --git a/nextflow.config b/nextflow.config index dd54333..f6759b8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -34,6 +34,9 @@ params { exomiser_version = '12.1.0' exomiser_data_directory = '/data/exomiser-data-bundle' + //inputs + families_file = false + // Debugging related parameters debug_script = "ls -l" echo = false @@ -53,7 +56,8 @@ params { profiles { standard { includeConfig params.config } - test { includeConfig "conf/test.config" } + family_test { includeConfig 'conf/family_test.config' } + single_vcf_test { includeConfig 'conf/single_vcf_test.config' } } process { @@ -61,7 +65,7 @@ process { errorStrategy = params.errorStrategy withName: exomiser { container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}" - containerOptions = "--volume ${params.exomiser_data_directory}:/data/exomiser-data-bundle/" + containerOptions = "--volume ${params.exomiser_data_directory}:/data/" memory = 6.GB cpus = 4 } diff --git a/testdata/single_vcf.tsv b/testdata/single_vcf.tsv new file mode 100644 index 0000000..8acb569 --- /dev/null +++ b/testdata/single_vcf.tsv @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156 s3://lifebit-featured-datasets/pipelines/exomiser-nf/ERR3239334_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/ERR3239334_small.vcf.gz.tbi M nan nan \ No newline at end of file From 71465700b515373916cf5dd60b4a0c4a8ae75aa2 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 14:08:25 
+0000 Subject: [PATCH 72/98] added documentation for the pipeline --- .github/workflows/ci_test.yml | 2 +- README.md | 114 +++++++++++++++++++++++++--------- main.nf | 8 +-- 3 files changed, 90 insertions(+), 34 deletions(-) diff --git a/.github/workflows/ci_test.yml b/.github/workflows/ci_test.yml index fcc8101..b575386 100644 --- a/.github/workflows/ci_test.yml +++ b/.github/workflows/ci_test.yml @@ -23,4 +23,4 @@ # sudo mv nextflow /usr/local/bin/ # - name: Basic workflow tests # run: | -# nextflow run ${GITHUB_WORKSPACE} -profile test +# nextflow run ${GITHUB_WORKSPACE} -profile single_vcf_test diff --git a/README.md b/README.md index ff24814..b91bbed 100644 --- a/README.md +++ b/README.md @@ -1,54 +1,110 @@ # Exomiser +## Pipeline documentation -## Exomiser -> container -> pipeline -> Run at your peril +Table of contents -### Containarizing Exomiser +- [Pipeline documentation](#pipeline-documentation) + - [Pipeline description](#pipeline-description) + - [Pipeline overview](#pipeline-overview) + - [Input](#input) + - [--\](#--name_of_main_input) + - [Processes](#processes) + - [Output](#output) + - [Options](#options) + - [General Options](#general-options) + - [Resource Allocation](#resource-allocation) + - [Usage](#usage) + - [Running with Docker or Singularity](#running-with-docker-or-singularity) + - [Running on CloudOS](#running-on-cloudos) + - [Testing](#testing) + - [Profiles](#profiles) + - [Stress Testing](#stress-testing) -Containarization provides a solution to isolate all software dependencies and make an aplication / workflow reproducible. +## Pipeline description -In order to containirize Exomiser we started from a Docker base image with Ubuntu and Java8, which is the main dependency for Exomiser to be able to run. +### Pipeline overview -The container pulls the latest publicly available release of `Exomiser v12.0.0` from Github releases, and unzips it inside the container. Modifications to the application property are included to match the genome reference data used for running exomiser. The version selecter for the container is `1811`. + - Name: exomiser-pipeline-nf + - Tools: exomiser + - Version: 12.1.0 -Latest version of exomiser accept parameters and configuration from a YAML file. This YAML file defines input such as VCF files, reference genome version, HPO terms and other parameter configuration. In order to create a directly executable application for Exomiser, without the need to write beforehand the YAML file, we added a `Python` handle script that just do that. +It is a fully containerised nextflow pipeline that runs exomisers on either a single sample VCF file or a trio VCF file. -The `Python` script `run.py` takes care of: -* Getting Exomiser input parameters from command line -* Create a YAML file from a template and updating it given parameters received -* Run Exomiser with the YAML file just created +The Exomiser is a tool to perform genome-wide prioritisation of genomic variants including non-coding and regulatory variants using patient phenotypes as a means of differentiating candidate genes. + +To perform an analysis, Exomiser requires the patient's genome/exome in VCF format and their phenotype encoded in [HPO terms](https://hpo.jax.org/app/). The exomiser is also capable of analysing trios/small family genomes. -The script also takes care of handling the pointer to the genome reference dataset used for running Exomiser. 
-Since reference data is quite large, doesn't make much sense to include it in the container, as it would turn into a +50 GB image... +The main input of the pipeline (`families_file`) is a TSV file and the main output of the pipeline is an HTML file containing pathogenicity score of the called variants. -Instead the fetching of the data, which includes: -* VCF -* Reference Exomiser genome dataset -Will be taken care by a simple Nextflow pipeline. +### Input -## Containarized Exomiser pipeline +#### --families_file -In order to run Exomiser with a VCF file, reference genome datasets have to be fetched and unzipped for Exomiser to run. We separated the data and file staging part from the container into a pipeline (Nextflow) which will take care of pulling the data into the working directory for Exomiser to run successfully. +This is a TSV file that contains the following info tab separated -The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow pipeline will automatically take care of fetching the data without having to add any logic to Exomiser process/script. +|run_id |proband_id |hpo |vcf_path |vcf_index_path |proband_sex |mother_id |father_id | +| :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | +| | | | | | | | | | +The vcf_path column can contain the path to either a multiVCF(trio) or a single-sample VCF. +In the case of a single-sample VCF, the last 2 columns must contain `nan` as a value. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/fam_file.tsv) -## Test the pipeline +### --application_properties -### Download local copy of exomiser-data-bundle (~120GB) +This is a file needed by exomiser to run. It contains information on where to find the reference data as well as the versioning of the reference genome. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/application.properties) -```bash -sudo aws s3 sync s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle /data/exomiser-data-bundle --no-sign-request -``` +### --auto_config_yml + +This is a file needed by exomiser to run. It contains placeholders in the text that get filled in by the second process of the pipeline just before running exomiser. The one used for testing can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/auto_config.yml) + +### --exomiser_data + +This path refers to the reference data bundle needed by exomiser (~120 GB!). A copy of such files can be found [here](s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle). The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow will automatically take care of fetching the data without having to add anything to the pipeline itself. + +There are other parameters that can be tweaked to personalize the behaviour of the pipeline. These are referenced in `nextflow.config` + +### Processes + +Here is the list of steps performed by this pipeline. + +1. `process ped_hpo_creation` - this process produces the pedigree (PED) file needed for exomiser to run using a python script. +2. `process 2` - this process is where the autoconfig file for exomiser is generated and exomiser is run. 
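For reference, the PED that `process ped_hpo_creation` writes for the example `fam_file.tsv` linked above should look roughly like the sketch below (tab-separated, assuming the standard six-column PED layout produced by `ped_parser`: family, individual, father, mother, sex, affection status; the proband is coded male/affected and the parents unaffected). In a single-sample run the parent IDs arrive as `nan`, are rewritten to `0`, and the resulting all-zero parent rows are then dropped by the `sed "/0\t0\t0/d"` step.

```
EX001    ERR3239334    ERR3989341    ERR3989342    1    2
EX001    ERR3989342    0             0             2    1
EX001    ERR3989341    0             0             1    1
```

The matching `ERR3239334-HPO.txt` simply echoes the `hpo` column (here `HP:0001156`), and `ERR3239334_ID.txt` holds the proband ID that later replaces `proband_placeholder` in the auto-config.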
+ +### Output + +- a html and a json file containing a report on the analysis +- the autoconfig file, for reproducibility purpose +- a vcf file with the called variants that are identified as causative + + +### Testing + +To run the pipeline with `docker` (used by default), type the following commands: + +To test the pipeline on a multi-VCF: -Or if in a EC2 from HK region download faster with HKGI AWS credentials ``` -sudo aws s3 sync s3://lifebit-featured-datasets-hkgi/pipelines/exomiser-data-bundle /data/exomiser-data-bundle +nextflow run main.nf -profile family_test ``` -### Test run (With a HOP term file, no fetch from database) +To test the pipeline on a single-sample VCF: -```bash -nextflow run main.nf -profile test ``` +nextflow run main.nf -profile single_vcf_test +``` + +Be careful when running this, as the pipeline requires the staging of 120 GB of reference data, required by exomiser, so only that takes a while! + +### Running on CloudOS + + +### Profiles + +| profile name | Run locally | Run on CloudOS | description | +| :----: | :----: | :----: | :----: | +|family_test | the data required is so big, it was tested on a c5.4xlarge EC2 machine | | this test is designed to test the pipeine on a multi-VCF with trio information | +|single_vcf_test | the data required is so big, it was tested on a c5.4xlarge EC2 machine | | this test is designed to test the pipeine on a single-sample-VCF | + + diff --git a/main.nf b/main.nf index d069ab0..0af8895 100644 --- a/main.nf +++ b/main.nf @@ -95,10 +95,10 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") //remove //ch_vcf_inspect.dump(tag:'ch_vcf') -if (params.ped_file) ped_ch = Channel.value(file(params.ped_file)) -if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) +// if (params.ped_file) ped_ch = Channel.value(file(params.ped_file)) +// if (params.hpo_file) hpo_ch = Channel.value(file(params.hpo_file)) -if(!params.ped_file & !params.hpo_file){ +// if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { container 'quay.io/lifebitaiorg/ped_parser:latest' publishDir "${params.outdir}/familyfile/", mode: 'copy' @@ -118,7 +118,7 @@ if(!params.ped_file & !params.hpo_file){ sed -i "/0\t0\t0/d" ${proband_id1}.ped """ } -} +// } /*-------------------------------------------------- Run containarised Exomiser From 0b7191de153bef4dd9f9e7f3197c88c9e4fc468a Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Fri, 27 Oct 2023 14:27:23 +0000 Subject: [PATCH 73/98] fixed url in documentation for the pipeline --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b91bbed..5b8f7d6 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ This is a file needed by exomiser to run. It contains placeholders in the text t ### --exomiser_data -This path refers to the reference data bundle needed by exomiser (~120 GB!). A copy of such files can be found [here](s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle). The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow will automatically take care of fetching the data without having to add anything to the pipeline itself. +This path refers to the reference data bundle needed by exomiser (~120 GB!). A copy of such files can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-data-bundle/) . 
The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow will automatically take care of fetching the data without having to add anything to the pipeline itself. There are other parameters that can be tweaked to personalize the behaviour of the pipeline. These are referenced in `nextflow.config` From 6b41db0039d8d3f0ae42a90601220b8264129fdb Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Mon, 30 Oct 2023 10:48:45 +0100 Subject: [PATCH 74/98] added default paths to files in S3 bucket --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index f6759b8..34a464d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,8 +21,8 @@ params { exomiser_phenotype_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/2102_phenotype' cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs' phenix_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/phenix' - application_properties = false - auto_config_yml = false + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' hpo_terms_file = false modes_of_inheritance = 'AUTOSOMAL_DOMINANT,AUTOSOMAL_RECESSIVE,X_RECESSIVE,UNDEFINED' prioritisers = 'hiPhivePrioritiser,phivePrioritiser,phenixPrioritiser' From ba522c6e6e153e8449bacc9ea500f62841c6c641 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Mon, 30 Oct 2023 11:00:37 +0100 Subject: [PATCH 75/98] fixed typo --- conf/family_test.config | 2 +- conf/single_vcf_test.config | 2 +- testdata/fam_file.tsv | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 testdata/fam_file.tsv diff --git a/conf/family_test.config b/conf/family_test.config index b8a0e07..3ea336e 100644 --- a/conf/family_test.config +++ b/conf/family_test.config @@ -4,5 +4,5 @@ params { prioritisers = 'hiPhivePrioritiser' exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' - auto_config_yml 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' } diff --git a/conf/single_vcf_test.config b/conf/single_vcf_test.config index 491e0d7..19eff49 100644 --- a/conf/single_vcf_test.config +++ b/conf/single_vcf_test.config @@ -4,5 +4,5 @@ params { prioritisers = 'hiPhivePrioritiser' exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' - auto_config_yml 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' } diff --git a/testdata/fam_file.tsv b/testdata/fam_file.tsv new file mode 100644 index 0000000..e83485b --- /dev/null +++ b/testdata/fam_file.tsv @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 
ERR3989341 \ No newline at end of file From 2223fa28650df7d8c5d34227f1eccee13537bd7b Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Mon, 30 Oct 2023 16:09:11 +0100 Subject: [PATCH 76/98] changed the docker for bug on platform --- containers/ped_parser/Dockerfile | 23 +++++++++++++++++++++++ main.nf | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 containers/ped_parser/Dockerfile diff --git a/containers/ped_parser/Dockerfile b/containers/ped_parser/Dockerfile new file mode 100644 index 0000000..201a3a3 --- /dev/null +++ b/containers/ped_parser/Dockerfile @@ -0,0 +1,23 @@ +FROM continuumio/miniconda3@sha256:77f9119def83d94b7afb654b39a1c21aaa7f255518aba57de08321760c27c86a + +ENV VERSION="1.6.6-py_2" + +ARG ENV_NAME="ped-parser" + +LABEL description="Docker containing the ped_parser python package." \ + software.version="${VERSION}" \ + maintainer="Leila Mansouri: leila.mansouri@lifebit.ai" \ + name="quay.io/lifebitaiorg/ped_parser:1.6.6-py_2" + +#needed as per the documentation +RUN apt-get update -y &&\ + apt-get install -y procps \ + zlib1g &&\ + rm -rf /var/lib/apt/lists/* + +#installing the tool and its dependencies +RUN pip install ped_parser + +RUN pip install pandas + +ENTRYPOINT [ ] diff --git a/main.nf b/main.nf index 0af8895..7781aa3 100644 --- a/main.nf +++ b/main.nf @@ -100,7 +100,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") // if(!params.ped_file & !params.hpo_file){ process ped_hpo_creation { - container 'quay.io/lifebitaiorg/ped_parser:latest' + container 'quay.io/lifebitaiorg/ped_parser:1.6.6' publishDir "${params.outdir}/familyfile/", mode: 'copy' errorStrategy 'retry' maxErrors 5 From 8151ee9da066c1e34e4ddb5b40de25192a56c088 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 11:12:08 +0100 Subject: [PATCH 77/98] changed bundle data path --- bin/application.properties | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/application.properties b/bin/application.properties index 30f0af7..3b2e7fb 100644 --- a/bin/application.properties +++ b/bin/application.properties @@ -21,7 +21,7 @@ #root path where data is to be downloaded and worked on #it is assumed that all the files required by exomiser listed in this properties file #will be found in the data directory unless specifically overridden here. -exomiser.data-directory=/data/exomiser-data-bundle/ +exomiser.data-directory=/data/exomiser-data-bundle/exomiser-data-bundle/ ### hg19 assembly ### # exomiser.hg19.data-version=2102 @@ -49,7 +49,7 @@ exomiser.hg38.data-version=2102 #transcript source will default to ensembl. Can define as ucsc/ensembl/refseq exomiser.hg38.transcript-source=ensembl -exomiser.hg38.data-directory=/data/exomiser-data-bundle/2102_hg38 +exomiser.hg38.data-directory=/data/exomiser-data-bundle/exomiser-data-bundle/2102_hg38 #location of CADD/REMM Tabix files - you will need these for analysis of non-coding variants. #CADD can be downloaded from http://cadd.gs.washington.edu/download - v1.3 has been tested. #REMM can be downloaded from https://charite.github.io/software-remm-score.html @@ -57,17 +57,17 @@ exomiser.hg38.data-directory=/data/exomiser-data-bundle/2102_hg38 # #You will require the tsv.gz and tsv.gz.tbi (tabix) file pairs. #Un-comment and add the full path to the relevant tsv.gz files if you want to enable these. 
-exomiser.hg38.cadd-snv-path=/data/exomiser-data-bundle/cadd_snvs/whole_genome_SNVs.tsv.gz -exomiser.hg38.cadd-in-del-path=/data/exomiser-data-bundle/2102_hg38/gnomad.genomes.r3.0.indel.tsv.gz +exomiser.hg38.cadd-snv-path=/data/exomiser-data-bundle/exomiser-data-bundle/cadd_snvs/whole_genome_SNVs.tsv.gz +exomiser.hg38.cadd-in-del-path=/data/exomiser-data-bundle/exomiser-data-bundle/2102_hg38/gnomad.genomes.r3.0.indel.tsv.gz #exomiser.hg38.remm-path=${exomiser.hg38.data-directory}/ReMM.v0.3.1.tsv.gz #exomiser.hg38.local-frequency-path=${exomiser.hg38.data-directory}/local_frequency_test.tsv.gz -exomiser.hg38.variant-white-list-path=/data/exomiser-data-bundle/2102_hg38/2102_hg38_clinvar_whitelist.tsv.gz +exomiser.hg38.variant-white-list-path=/data/exomiser-data-bundle/exomiser-data-bundle/2102_hg38/2102_hg38_clinvar_whitelist.tsv.gz ### phenotypes ### exomiser.phenotype.data-version=2102 -exomiser.phenotype.data-directory=/data/exomiser-data-bundle/2102_phenotype +exomiser.phenotype.data-directory=/data/exomiser-data-bundle/exomiser-data-bundle/2102_phenotype #String random walk data file -exomiser.phenotype.random-walk-file-name=/data/exomiser-data-bundle/2102_phenotype/rw_string_10.mv +exomiser.phenotype.random-walk-file-name=/data/exomiser-data-bundle/exomiser-data-bundle/2102_phenotype/rw_string_10.mv #exomiser.phenotype.random-walk-index-file-name=rw_string_9_05_id2index.gz ### caching ### From 3c05507d6d88254d48c4bd2851021e488957d463 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 11:13:15 +0100 Subject: [PATCH 78/98] commented debug code --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7781aa3..f99d763 100644 --- a/main.nf +++ b/main.nf @@ -167,7 +167,7 @@ process exomiser { mkdir -p /data/exomiser-data-bundle ln -svf "\$PWD/$exomiser_data/" /data/exomiser-data-bundle #stat -L $vcf_path1 - stat -L $vcf_path1 > out.txt + #stat -L $vcf_path1 > out.txt #cat out.txt proband_id1=`cat ${id_file}` hpo_band1=`cat ${hpo_file}` From 1c98cfa92fe5be6f7ae4d2c415f82b795132ec84 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 16:53:10 +0100 Subject: [PATCH 79/98] removed_js_files --- .attach_pid22597 | 0 bin/add_exomiser_fields_to_genotiers.js | 320 ------------------------ bin/genotiers.js | 213 ---------------- bin/get_hpo_terms_from_barcode.js | 119 --------- main.nf | 6 +- 5 files changed, 2 insertions(+), 656 deletions(-) delete mode 100644 .attach_pid22597 delete mode 100755 bin/add_exomiser_fields_to_genotiers.js delete mode 100755 bin/genotiers.js delete mode 100755 bin/get_hpo_terms_from_barcode.js diff --git a/.attach_pid22597 b/.attach_pid22597 deleted file mode 100644 index e69de29..0000000 diff --git a/bin/add_exomiser_fields_to_genotiers.js b/bin/add_exomiser_fields_to_genotiers.js deleted file mode 100755 index 6122a34..0000000 --- a/bin/add_exomiser_fields_to_genotiers.js +++ /dev/null @@ -1,320 +0,0 @@ -// ADD EXOMISER FIELDS TO GENOTIERS -// takes a TSV file, extracts `fullLocation` (i.e takes `#CHROM`, ``POS`, `REF` and `ALT` and produces `fullLocation`), -// looks for matching `fullLocation` in the `genotiers` collection in MongoDB and -// updates the matching document with the rest of the exomiser fields - -// To run: -// minimum: -// node --no-warnings ./bin/add_exomiser_fields_to_genotiers.js -t ./testdata/exomiserVariantsHG001-NA12878-pFDA_S1_L001_100k_AR.variants.tsv -// full set: -// node --no-warnings ./bin/add_exomiser_fields_to_genotiers.js --tsvFile 
./testdata/exomiserVariantsHG001-NA12878-pFDA_S1_L001_100k_AR.variants.tsv --databaseName clinical-dev --uri mongodb://localhost:27017 - -// functions -// closeConnection function closes the connection to MongoDB -function closeConnection() { - mongoose.connection.close(function() { - // console.log('Mongoose disconnected '); - }); -} -// makeFullLocation function creates `fullLocation` from `#CHROM`, ``POS`, `REF` and `ALT` -function makeFullLocation (chromosome, position, reference, alternative){ - var fullLocation; - if(chromosome && position && reference && alternative){ - fullLocation = chromosome + ':' + position + '-' + reference + '-' + alternative; - } - return fullLocation; -} - -// required packages -const { program } = require('commander'); -const fs = require('fs') -const mongoose = require('mongoose'); -require('./genotiers'); - -// Take inputs from command line -program - .description('A script that, given a tsv file, retrieves the corresponding genotier from database and updates it with tsv file results') - .option('-t, --tsvFile ', 'Tsv file with exomiser results') - .option('-d, --databaseName ', 'Database name', "clinical-dev") - .option('-u, --uri ', 'Uri to database', "mongodb://localhost:27017"); -program.parse(); -var tsvFilePath = program.opts().tsvFile -var dbName = program.opts().databaseName -var uri = program.opts().uri - -// Extract file name from file path -var tsvFileName=tsvFilePath.split('/')[tsvFilePath.split('/').length-1].split('.')[0] -// Initiate log file -var logFileName = tsvFileName+'_addToDatabase.log' -fs.writeFileSync(logFileName, ''); - -// Connect to mongoDB -mongoose.connect(uri + '/' + dbName, {useNewUrlParser: true, useUnifiedTopology: true}); - -// If cannot connect beacause of error -mongoose.connection.on('error', function(err) { - console.log('Mongoose connection error: ' + err); -}); - -// Read and split tsv file -var tsvFileContent = fs.readFileSync(tsvFilePath, 'utf8'); -var tsvFileContentSplit = tsvFileContent.split('\n') -var header = tsvFileContentSplit[0].split('\t') -var i; -var count = 0; -// Iterate through header to get chromosome, pos, alt and ref -for(i=0;i { - if(err){ - console.log(err) - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else if(!genotier){ - // console.log('No results') - count = count+1; - fs.appendFileSync(logFileName, 'Genotier for location: '+fullLocation+' not found \n') - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else { - if(genotier.length==0){ - // console.log('No results') - count = count+1; - fs.appendFileSync(logFileName, 'Genotier for location: '+fullLocation+' not found \n') - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else { - // if genotier found add all fields - var k=0; - for(k=0;k0.483)')){ - if(row[k]=='.'){ - genotier.exomiserCadd = '' - } else { - genotier.exomiserCadd = row[k] - } - } else if(value.includes('POLYPHEN(>0.956|>0.446)')){ - if(row[k]=='.'){ - genotier.exomiserPolyphen = '' - } else { - genotier.exomiserPolyphen = row[k] - } - } else if(value.includes('MUTATIONTASTER(>0.94)')){ - if(row[k]=='.'){ - genotier.exomiserMutationTaster = '' - } else { - genotier.exomiserMutationTaster = row[k] - } - } else if(value.includes('SIFT(<0.06)')){ - if(row[k]=='.'){ - genotier.exomiserSift = '' - } else { - genotier.exomiserSift = row[k] - } - } else if(value.includes('REMM')){ - if(row[k]=='.'){ - genotier.exomiserRemm = '' - } else { - genotier.exomiserRemm = row[k] - } - } else 
if(value.includes('DBSNP_ID')){ - if(row[k]=='.'){ - genotier.exomiserDbsnp = '' - } else { - genotier.exomiserDbsnp = row[k] - } - } else if(value.includes('MAX_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserMaxFreq = '' - } else { - genotier.exomiserMaxFreq = row[k] - } - } else if(value.includes('DBSNP_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserDbsnpFreq = '' - } else { - genotier.exomiserDbsnpFreq = row[k] - } - } else if(value.includes('EVS_EA_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserEvsEaFreq = '' - } else { - genotier.exomiserEvsEaFreq = row[k] - } - } else if(value.includes('EVS_AA_FREQUENCY')){ - if(row[k]=='.'){ - genotier.exomiserEvsAaFreq = '' - } else { - genotier.exomiserEvsAaFreq = row[k] - } - } else if(value.includes('EXAC_AFR_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacAfrFreq = '' - } else { - genotier.exomiserExacAfrFreq = row[k] - } - } else if(value.includes('EXAC_AMR_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacAmrFreq = '' - } else { - genotier.exomiserExacAmrFreq = row[k] - } - } else if(value.includes('EXAC_EAS_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacEasFreq = '' - } else { - genotier.exomiserExacEasFreq = row[k] - } - } else if(value.includes('EXAC_FIN_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacFinFreq = '' - } else { - genotier.exomiserExacFinFreq = row[k] - } - } else if(value.includes('EXAC_NFE_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacNfeFreq = '' - } else { - genotier.exomiserExacNfeFreq = row[k] - } - } else if(value.includes('EXAC_SAS_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacSasFreq = '' - } else { - genotier.exomiserExacSasFreq = row[k] - } - } else if(value.includes('EXAC_OTH_FREQ')){ - if(row[k]=='.'){ - genotier.exomiserExacOthFreq = '' - } else { - genotier.exomiserExacOthFreq = row[k] - } - } else if(value.includes('CONTRIBUTING_VARIANT')){ - if(row[k]=='.'){ - genotier.exomiserContributingVariant = '' - } else { - genotier.exomiserContributingVariant = row[k] - } - } - } - // save modified field - genotier.save((err,genotier) =>{ - if(err) { - console.log(err); - count = count+1; - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } else { - fs.appendFileSync(logFileName, 'Genotier for location: '+genotier.fullLocation+' had been changed \n') - count = count+1; - if(count == tsvFileContentSplit.length-1){ - closeConnection() - } - } - }) - } - } - }) - } else { - console.log('No location found') - } - } -}) \ No newline at end of file diff --git a/bin/genotiers.js b/bin/genotiers.js deleted file mode 100755 index d383a52..0000000 --- a/bin/genotiers.js +++ /dev/null @@ -1,213 +0,0 @@ -const mongoose = require('mongoose'); - -const genotiersSchema = new mongoose.Schema({ - acmgAppxScore: { - type: Number - }, - acmgBenignSubscore: { - type: String - }, - acmgCodingImpact: { - type: String - }, - acmgGeneId: { - type: Number - }, - acmgPathogenicSubscore: { - type: String - }, - acmgUserExplain: { - type: Array - }, - acmgVerdict: { - type: String - }, - acmgVersion: { - type: String - }, - allelicBalance: { - type: Number - }, - alternative: { - type: String - }, - all1: { - type: String - }, - all2: { - type: String - }, - ampApproxScore: { - type: String - }, - ampClassifications: { - type: String - }, - ampClassificationsTier: { - type: String - }, - ampName: { - type: String - }, - ampTier1: { - type: String - }, - ampTier2: { - type: String - }, - ampTier3: { - type: String - }, - ampTier4: { - type: String - }, - ampVerdict: { - type: String - }, - 
ampVerdictTier: { - type: String - }, - ampVersion: { - type: String - }, - cbio: { - type: String - }, - chromosome: { - type: String, - required: true - }, - coverage: { - type: Number - }, - fullLocation: { - type: String, - required: true - }, - gene: { - type: String - }, - genotype: { - type: String - }, - i: { - type: String, - required: true - }, - location: { - type: String, - required: true - }, - notes: { - type: String - }, - position: { - type: Number, - required: true - }, - reference: { - type: String - }, - barcode: { - type: String - }, - variantType: { - type: String - }, - vcfSampleId: { - type: String - }, - zygosity: { - type: String - }, - exomiserQual: { - type: String - }, - exomiserFilter: { - type: String - }, - exomiserGenotype: { - type: String - }, - exomiserHgvs: { - type: String - }, - exomiserCoverage: { - type: String - }, - exomiserFunctionalClass: { - type: String - }, - exomiserVariantScore: { - type: String - }, - exomiserGene: { - type: String - }, - exomiserGenePhenoScore: { - type: String - }, - exomiserGeneVariantScore: { - type: String - }, - exomiserGeneCombinedScore: { - type: String - }, - exomiserCadd: { - type: String - }, - exomiserPolyphen: { - type: String - }, - exomiserMutationTaster: { - type: String - }, - exomiserSift: { - type: String - }, - exomiserRemm: { - type: String - }, - exomiserDbsnp: { - type: String - }, - exomiserMaxFreq: { - type: String - }, - exomiserDbsnpFreq: { - type: String - }, - exomiserEvsEaFreq: { - type: String - }, - exomiserEvsAaFreq: { - type: String - }, - exomiserExacAfrFreq: { - type: String - }, - exomiserExacAmrFreq: { - type: String - }, - exomiserExacEasFreq: { - type: String - }, - exomiserExacFinFreq: { - type: String - }, - exomiserExacNfeFreq: { - type: String - }, - exomiserExacSasFreq: { - type: String - }, - exomiserExacOthFreq: { - type: String - }, - exomiserContributingVariant: { - type: String - } -}) - -genotiersSchema.index({fullLocation: 1, i: 1 }, { unique: true }) -mongoose.model('Genotiers', genotiersSchema, 'genotiers'); \ No newline at end of file diff --git a/bin/get_hpo_terms_from_barcode.js b/bin/get_hpo_terms_from_barcode.js deleted file mode 100755 index 38bcb37..0000000 --- a/bin/get_hpo_terms_from_barcode.js +++ /dev/null @@ -1,119 +0,0 @@ -// GET HPO TERMS FROM SAMPLE ID is a script -// which looks for participant ID (more specifically the field `i`) in first collection specified and -// then uses found id ('i') to find HP terms in second collection specified - -// To run: -// minimum: -// node --no-warnings ./bin/get_hpo_terms_from_barcode.js --barcode 000000001 -// full set: -// node ./bin/get_hpo_terms_from_barcode.js --barcode '000000001' --dbWithPatientSamples clinical-portal --uriWithPatientSamples mongodb://localhost:27017 --collectionPatientSamples patientsamples --dbWithParticipants cohort-browser --collectionParticipants participants --uriWithParticipants mongodb://localhost:27017 - -// required packages -const { program } = require('commander'); -const fs = require('fs') - -// Take inputs from command line -program - .description('A script that, given a barcode, retrieves the corresponding participant ID (more specifically `i`) and HPO terms') - .option('-b, --barcode ', '547300000450') - .option('-ds, --dbWithPatientSamples ', 'Database containing the collection called "patientsamples"', "clinical-portal") - .option('-us, --uriWithPatientSamples ', 'Uri to database containing the collection called "patientsamples"', "mongodb://localhost:27017") - .option('-cs, 
--collectionPatientSamples ', 'Collection name for samples', "patientsamples") - .option('-dp, --dbWithParticipants ', 'Database containing the collection called "participants"', "cohort-browser") - .option('-up, --uriWithParticipants ', 'Uri to database containing the collection called "participants"', "mongodb://localhost:27017") - .option('-cp, --collectionParticipants ', 'Collection name for participants', "participants") -program.parse(); - -var barcode = program.opts().barcode - -var dbNameSamples = program.opts().dbWithPatientSamples -var uriSamples = program.opts().uriWithPatientSamples -var collectionSamples = program.opts().collectionPatientSamples - -var dbNameParticipants = program.opts().dbWithParticipants -var uriParticipants = program.opts().uriWithParticipants -var collectionParticipants = program.opts().collectionParticipants - -// initiate variables -var fileName = barcode+'_hpo_list.txt' - -// initialise output file -fs.writeFileSync(fileName, ''); -fs.appendFileSync(fileName, '# '+barcode+' \n') - -// connect to mongoDB -var MongoClient = require('mongodb',{ useUnifiedTopology: true }).MongoClient; - -// use promise to find first query results -var promise = new Promise((resolve, reject) => { - MongoClient.connect(uriSamples,{poolSize: 1000}, function(err, db) { - if (err) throw err; - console.log('connected to database 1') - var database = db.db(dbNameSamples); - // prepare first query - var query = { 'barcode': barcode }; - // query database - database.collection(collectionSamples).find(query).toArray(function(err, results) { - if(err) { - reject(err) - db.close() - } else if(!results){ - reject('No results') - db.close(); - } else if(Array.isArray(results)){ - if(results.length==0){ - reject('No results') - db.close() - } else { - // when results are created resolve promise - resolve(results[0].i) - db.close() - } - } - }); - }) - }); -// use results from first promise to query again -promise.then(firstQueryResults =>{ - MongoClient.connect(uriParticipants,{poolSize: 1000}, (err, db2) =>{ - if (err) throw err; - console.log('connected to database 2') - var database2 = db2.db(dbNameParticipants); - // create new query - var newQuery = {'i': firstQueryResults} - // query database - database2.collection(collectionParticipants).find(newQuery).toArray((err, results2)=> { - if(err){ - console.log(err) - db2.close() - } else if(!results2){ - console.log('No results') - db2.close() - } else { - if(results2.length == 0){ - console.log('No results') - db2.close() - } else { - results2.forEach(instance=>{ - for (var key in instance) { - if (instance.hasOwnProperty(key)) { - // check if values have 'HP' in value - if(JSON.stringify(instance[key]).includes('HP')){ - var hpTerm = JSON.stringify(instance[key]) - if((JSON.stringify(instance[key]).includes('(')) && (JSON.stringify(instance[key]).includes(')'))){ - hpTerm = hpTerm.split('(')[1].split(')')[0] - } - // save results in output file - fs.appendFileSync(fileName, hpTerm.replace(/['"]+/g, '')+'\n') - } - } - } - }) - db2.close() - } - } - }) - }) -}).catch(err=>{ - console.log(err) -}) diff --git a/main.nf b/main.nf index f99d763..afb6e11 100644 --- a/main.nf +++ b/main.nf @@ -1,5 +1,6 @@ #!/usr/bin/env nextflow -import groovy.json.* +import groovy. +on.* /* ======================================================================================== @@ -69,9 +70,6 @@ projectDir = workflow.projectDir ch_application_properties = params.application_properties ? 
Channel.value(file(params.application_properties)) : Channel.fromPath("${projectDir}/bin/application.properties") ch_auto_config_yml = params.auto_config_yml ? Channel.value(file(params.auto_config_yml)) : Channel.fromPath("${projectDir}/bin/auto_config.yml") -// Stage scripts from bin -ch_add_exomiser_fields_script = Channel.value(file("${projectDir}/bin/add_exomiser_fields_to_genotiers.js")) -ch_get_hpo_terms_script = Channel.value(file("${projectDir}/bin/get_hpo_terms_from_barcode.js")) // set exomiser specific flags pathogenicitySourcesList= definePathogenicitySources() From 596ac1666d9439ae9efc220e069b640e870ded88 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 16:55:21 +0100 Subject: [PATCH 80/98] change name of the profiles --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 22d9ca5..f82b9dc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,8 +58,8 @@ params { profiles { standard { includeConfig params.config } - family_test { includeConfig 'conf/family_test.config' } - single_vcf_test { includeConfig 'conf/single_vcf_test.config' } + test_family { includeConfig 'conf/family_test.config' } + test_single_vcf { includeConfig 'conf/single_vcf_test.config' } awsbatch { includeConfig 'conf/executors/awsbatch.config' } eu_west_1 { includeConfig 'conf/cloud-region/eu_west_1.config' } eu_west_2 { includeConfig 'conf/cloud-region/eu_west_2.config' } From 5672654934238d449c3355a4a439115f9787b8c1 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 16:56:39 +0100 Subject: [PATCH 81/98] fixed typo in documentation --- README.md | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 5b8f7d6..738b8b7 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -# Exomiser +# Exomiser + ## Pipeline documentation Table of contents @@ -7,7 +8,7 @@ Table of contents - [Pipeline description](#pipeline-description) - [Pipeline overview](#pipeline-overview) - [Input](#input) - - [--\](#--name_of_main_input) + - [--\](#--name_of_main_input) - [Processes](#processes) - [Output](#output) - [Options](#options) @@ -24,29 +25,27 @@ Table of contents ### Pipeline overview - - Name: exomiser-pipeline-nf - - Tools: exomiser - - Version: 12.1.0 +- Name: exomiser-pipeline-nf +- Tools: exomiser +- Version: 12.1.0 -It is a fully containerised nextflow pipeline that runs exomisers on either a single sample VCF file or a trio VCF file. +It is a fully containerised nextflow pipeline that runs exomisers on either a single sample VCF file or a trio VCF file. The Exomiser is a tool to perform genome-wide prioritisation of genomic variants including non-coding and regulatory variants using patient phenotypes as a means of differentiating candidate genes. - -To perform an analysis, Exomiser requires the patient's genome/exome in VCF format and their phenotype encoded in [HPO terms](https://hpo.jax.org/app/). The exomiser is also capable of analysing trios/small family genomes. +To perform an analysis, Exomiser requires the patient's genome/exome in VCF format and their phenotype encoded in [HPO terms](https://hpo.jax.org/app/). The exomiser is also capable of analysing trios/small family genomes. The main input of the pipeline (`families_file`) is a TSV file and the main output of the pipeline is an HTML file containing pathogenicity score of the called variants. 
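
As a purely illustrative example, a single-sample families file can be assembled as below. The VCF and index paths are hypothetical placeholders (real runs need existing bgzipped VCF/TBI files), and the parent columns carry `nan`, as described under `--families_file` in the next section.

```bash
# Illustrative single-sample families file; replace the placeholder paths
# with real bgzipped VCF/TBI locations before running the pipeline.
printf 'run_id\tproband_id\thpo\tvcf_path\tvcf_index_path\tproband_sex\tmother_id\tfather_id\n' > my_families.tsv
printf 'EX002\tSAMPLE01\tHP:0001156\t/path/to/sample01.vcf.gz\t/path/to/sample01.vcf.gz.tbi\tM\tnan\tnan\n' >> my_families.tsv
```
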
- ### Input #### --families_file This is a TSV file that contains the following info tab separated -|run_id |proband_id |hpo |vcf_path |vcf_index_path |proband_sex |mother_id |father_id | -| :----: | :----: | :----: | :----: | :----: | :----: | :----: | :----: | -| | | | | | | | | | +| run_id | proband_id | hpo | vcf_path | vcf_index_path | proband_sex | mother_id | father_id | +| :----: | :--------: | :-: | :------: | :------------: | :---------: | :-------: | :-------: | --- | +| | | | | | | | | | The vcf_path column can contain the path to either a multiVCF(trio) or a single-sample VCF. In the case of a single-sample VCF, the last 2 columns must contain `nan` as a value. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/fam_file.tsv) @@ -67,18 +66,17 @@ There are other parameters that can be tweaked to personalize the behaviour of t ### Processes -Here is the list of steps performed by this pipeline. +Here is the list of steps performed by this pipeline. 1. `process ped_hpo_creation` - this process produces the pedigree (PED) file needed for exomiser to run using a python script. -2. `process 2` - this process is where the autoconfig file for exomiser is generated and exomiser is run. - +2. `process 2` - this process is where the autoconfig file for exomiser is generated and exomiser is run. + ### Output -- a html and a json file containing a report on the analysis +- a html and a json file containing a report on the analysis - the autoconfig file, for reproducibility purpose - a vcf file with the called variants that are identified as causative - ### Testing To run the pipeline with `docker` (used by default), type the following commands: @@ -99,12 +97,9 @@ Be careful when running this, as the pipeline requires the staging of 120 GB of ### Running on CloudOS - ### Profiles -| profile name | Run locally | Run on CloudOS | description | -| :----: | :----: | :----: | :----: | -|family_test | the data required is so big, it was tested on a c5.4xlarge EC2 machine | | this test is designed to test the pipeine on a multi-VCF with trio information | -|single_vcf_test | the data required is so big, it was tested on a c5.4xlarge EC2 machine | | this test is designed to test the pipeine on a single-sample-VCF | - - +| profile name | Run locally | Run on CloudOS | description | +| :-------------: | :--------------------------------------------------------------------: | :------------: | :-----------------------------------------------------------------------------: | +| test_family | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful | this test is designed to test the pipeline on a multi-VCF with trio information | +| test_single_vcf | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful | this test is designed to test the pipeline on a single-sample-VCF | From 39cb1e1b712beb00482a35b418e810fbe0d9bec3 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 17:01:15 +0100 Subject: [PATCH 82/98] changed channel names --- main.nf | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/main.nf b/main.nf index 3c7683e..d77bcfa 100644 --- a/main.nf +++ b/main.nf @@ -49,7 +49,7 @@ if(params.families_file) { Channel .fromPath( "${params.families_file}") .ifEmpty { exit 1, "Family file: ${params.families_file} not found"} - .set {ch_vcf} + .set {ch_families_file} } else { exit 1, "please specify Family file with --families_file parameter" } 
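
Because the channel above is later consumed with `splitCsv(header:true, sep:'\t', strip: true)` and mapped by column name, a malformed header is the most common failure mode. A quick sanity check outside the pipeline might look like the following sketch, assuming the families file is the bundled `testdata/fam_file.tsv`:

```bash
# Print the header columns one per line; they should match the names used in
# the .map{} step: run_id, proband_id, hpo, vcf_path, vcf_index_path,
# proband_sex, mother_id, father_id.
head -n 1 testdata/fam_file.tsv | tr '\t' '\n'
```
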
@@ -61,9 +61,7 @@ Channel .ifEmpty { exit 1, "Cannot find input file : ${params.families_file}" } .splitCsv(header:true, sep:'\t', strip: true) .map {row -> [ row.proband_id, file(row.vcf_path), file(row.vcf_index_path)] } - .set {ch_input1} - -ch_input1.into { ch_input; ch_input2 } + .into {ch_vcf_paths; ch_vcf_paths2} // Conditional creation of channels, custom if provided else default from bin/ projectDir = workflow.projectDir @@ -103,10 +101,10 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") errorStrategy 'retry' maxErrors 5 input: - set proband_id1, file(vcf_path1), file(vcf_index_path1) from ch_input + set proband_id1, file(vcf_path1), file(vcf_index_path1) from ch_vcf_paths file family_file from ch_vcf.collect() output: - tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into join_ch + tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into ch_to_join script: """ ped_module.py --input_family $family_file @@ -123,7 +121,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") ---------------------------------------------------*/ -combined_channel = ch_input2.join(join_ch, by: 0).view() +ch_combined = ch_vcf_paths2.join(ch_to_join, by: 0).view() /*-------------------------------------------------- Run containarised Exomiser @@ -141,7 +139,7 @@ process exomiser { errorStrategy 'retry' maxRetries 3 input: - set val(proband_id1),file(vcf_path1),file(vcf_index1), file(hpo_file), file(ped_file),file(id_file) from combined_channel + set val(proband_id1),file(vcf_path1),file(vcf_index1), file(hpo_file), file(ped_file),file(id_file) from ch_combined each file(application_properties) from ch_application_properties each file(auto_config_yml) from ch_auto_config_yml each file(exomiser_data) from ch_exomiser_data From 028b6956436452ab249d176fc68bd2e4a21af956 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 17:10:18 +0100 Subject: [PATCH 83/98] outputs MultiQC html into outdir --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index d77bcfa..fe33f8a 100644 --- a/main.nf +++ b/main.nf @@ -134,6 +134,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' + publishDir "${params.outdir}/", mode: 'copy', pattern: "MultiQC/multiqc_report.html" maxForks 1 submitRateLimit = '1 / 5 m' errorStrategy 'retry' From 6d381aee768ba9fa8a879a324c628ddd2776dddd Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 17:19:59 +0100 Subject: [PATCH 84/98] removing directive from process and adding them to the config file --- main.nf | 5 +---- nextflow.config | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index fe33f8a..40462ed 100644 --- a/main.nf +++ b/main.nf @@ -135,10 +135,7 @@ process exomiser { tag "${vcf_path1}" publishDir "${params.outdir}/${proband_id1}", mode: 'copy' publishDir "${params.outdir}/", mode: 'copy', pattern: "MultiQC/multiqc_report.html" - maxForks 1 - submitRateLimit = '1 / 5 m' - errorStrategy 'retry' - maxRetries 3 + input: set val(proband_id1),file(vcf_path1),file(vcf_index1), file(hpo_file), file(ped_file),file(id_file) from ch_combined each file(application_properties) from ch_application_properties diff --git a/nextflow.config b/nextflow.config index f82b9dc..9aa2d59 100644 --- a/nextflow.config +++ b/nextflow.config @@ -76,5 
+76,9 @@ process { containerOptions = "--volume ${params.exomiser_data_directory}:/data/" memory = 6.GB cpus = 4 + maxForks 1 + submitRateLimit = '1 / 5 m' + errorStrategy 'retry' + maxRetries 3 } } \ No newline at end of file From d15b8bde45d9636642a64d4f727b2f8ae5a5fb73 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 17:54:40 +0100 Subject: [PATCH 85/98] added usage command line with params and defaults --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 738b8b7..ae89a6c 100644 --- a/README.md +++ b/README.md @@ -77,6 +77,19 @@ Here is the list of steps performed by this pipeline. - the autoconfig file, for reproducibility purpose - a vcf file with the called variants that are identified as causative +### Usage + +The pipeline can be run like: + +``` +nextflow run main.nf --families_file 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file.tsv' \ + --hpo_terms_file 's3://lifebit-featured-datasets/pipelines/exomiser-nf/hpo_terms_file.txt' \ + --prioritisers 'hiPhivePrioritiser' \ + --exomiser_data 's3://lifebit-featured-datasets/pipelines/exomiser-data-bundle' \ + --application_properties 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' \ + --auto_config_yml 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' +``` + ### Testing To run the pipeline with `docker` (used by default), type the following commands: From 668993dc871752737f28dec5bde02823b124e643 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 18:26:37 +0100 Subject: [PATCH 86/98] parametrized resources --- nextflow.config | 20 ++++++++++++++------ testdata/fam_file_multi_hpo.tsv | 2 ++ 2 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 testdata/fam_file_multi_hpo.tsv diff --git a/nextflow.config b/nextflow.config index 9aa2d59..6b3bbb9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -53,6 +53,14 @@ params { aws_batch_cli_path = '/home/ec2-user/miniconda/bin/aws' aws_batch_fetch_instance_type = true aws_region = 'ap-east-1' + + //process resources + memory = 6.GB + cpus = 4 + maxForks = 1 + submitRateLimit = '1 / 5 m' + errorStrategy = 'retry' + maxRetries = 3 } @@ -74,11 +82,11 @@ process { withName: exomiser { container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}" containerOptions = "--volume ${params.exomiser_data_directory}:/data/" - memory = 6.GB - cpus = 4 - maxForks 1 - submitRateLimit = '1 / 5 m' - errorStrategy 'retry' - maxRetries 3 + memory = params.memory + cpus = params.cpus + maxForks = params.maxForks + submitRateLimit = params.submitRateLimit + errorStrategy = params.errorStrategy + maxRetries = params.maxRetries } } \ No newline at end of file diff --git a/testdata/fam_file_multi_hpo.tsv b/testdata/fam_file_multi_hpo.tsv new file mode 100644 index 0000000..88e86b1 --- /dev/null +++ b/testdata/fam_file_multi_hpo.tsv @@ -0,0 +1,2 @@ +run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id +EX001 ERR3239334 HP:0001156; HP:0001363; HP:0011304 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file From 7e5ce7edeb7ce6572311bc754dae5daaf48a17ae Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 18:33:28 +0100 Subject: [PATCH 87/98] fix typo --- main.nf | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 40462ed..bb38031 100644 --- a/main.nf +++ b/main.nf @@ -1,6 +1,6 @@ #!/usr/bin/env nextflow -import groovy. -on.* +import groovy.json.* + /* ======================================================================================== From 02de25bae58bed8405d7bf4b06c16273ae047ac6 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 18:39:28 +0100 Subject: [PATCH 88/98] fix typo --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index bb38031..9f11d44 100644 --- a/main.nf +++ b/main.nf @@ -102,7 +102,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") maxErrors 5 input: set proband_id1, file(vcf_path1), file(vcf_index_path1) from ch_vcf_paths - file family_file from ch_vcf.collect() + file family_file from ch_families_file.collect() output: tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into ch_to_join script: From 4abc11d2d7bafbab0e457c365275f47e060a6ec8 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 31 Oct 2023 18:45:44 +0100 Subject: [PATCH 89/98] fix typo --- README.md | 1 - nextflow.config | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ae89a6c..5f22775 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,6 @@ The pipeline can be run like: ``` nextflow run main.nf --families_file 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file.tsv' \ - --hpo_terms_file 's3://lifebit-featured-datasets/pipelines/exomiser-nf/hpo_terms_file.txt' \ --prioritisers 'hiPhivePrioritiser' \ --exomiser_data 's3://lifebit-featured-datasets/pipelines/exomiser-data-bundle' \ --application_properties 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' \ diff --git a/nextflow.config b/nextflow.config index 6b3bbb9..63f405b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -85,7 +85,7 @@ process { memory = params.memory cpus = params.cpus maxForks = params.maxForks - submitRateLimit = params.submitRateLimit + //submitRateLimit = params.submitRateLimit errorStrategy = params.errorStrategy maxRetries = params.maxRetries } From 6c77128e582da9db90dde90b89f7a36efade76be Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Thu, 2 Nov 2023 09:18:53 +0100 Subject: [PATCH 90/98] fix typo --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 63f405b..1d13efc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ params { memory = 6.GB cpus = 4 maxForks = 1 - submitRateLimit = '1 / 5 m' + submitRateLimit = '1/5min' errorStrategy = 'retry' maxRetries = 3 } @@ -85,7 +85,7 @@ process { memory = params.memory cpus = params.cpus maxForks = params.maxForks - //submitRateLimit = params.submitRateLimit + submitRateLimit = params.submitRateLimit errorStrategy = params.errorStrategy maxRetries = params.maxRetries } From 1491d41fbae803948e1e48b317ee2c392442cac3 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Thu, 2 Nov 2023 09:20:18 +0100 Subject: [PATCH 91/98] fix typo --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 9f11d44..543eae3 100644 --- a/main.nf +++ b/main.nf @@ -107,7 +107,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") tuple val(proband_id1), file("${proband_id1}-HPO.txt"), file("${proband_id1}.ped"), file("${proband_id1}_ID.txt") into ch_to_join script: """ - 
ped_module.py --input_family $family_file + ped_module.py --input_family ${family_file} #to change nan in 0s if there are any sed -i 's/nan/0/g' ${proband_id1}.ped #to remove the "parent" line if it's a single sample From 5ef94b81cb1b625918c7e96031a0e86530b6a041 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Thu, 2 Nov 2023 09:36:11 +0100 Subject: [PATCH 92/98] moved submitRateLimit back to the process --- main.nf | 1 + nextflow.config | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 543eae3..a64bb00 100644 --- a/main.nf +++ b/main.nf @@ -133,6 +133,7 @@ ch_exomiser_data = Channel.fromPath("${params.exomiser_data}") process exomiser { tag "${vcf_path1}" + submitRateLimit = '1 / 5 m' publishDir "${params.outdir}/${proband_id1}", mode: 'copy' publishDir "${params.outdir}/", mode: 'copy', pattern: "MultiQC/multiqc_report.html" diff --git a/nextflow.config b/nextflow.config index 1d13efc..6d24337 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,7 @@ params { memory = 6.GB cpus = 4 maxForks = 1 - submitRateLimit = '1/5min' + //submitRateLimit = '1/5min' errorStrategy = 'retry' maxRetries = 3 } @@ -85,7 +85,7 @@ process { memory = params.memory cpus = params.cpus maxForks = params.maxForks - submitRateLimit = params.submitRateLimit + //submitRateLimit = params.submitRateLimit errorStrategy = params.errorStrategy maxRetries = params.maxRetries } From b266fee3733d755102513e9d3c6370fb3d3609a7 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Thu, 2 Nov 2023 11:25:56 +0100 Subject: [PATCH 93/98] added testing profile for multi_hpo --- conf/family_test.config | 1 - conf/multi_hpo_test.config | 7 +++++++ nextflow.config | 1 + testdata/fam_file_multi_hpo.tsv | 2 +- 4 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 conf/multi_hpo_test.config diff --git a/conf/family_test.config b/conf/family_test.config index 3ea336e..5f2e5af 100644 --- a/conf/family_test.config +++ b/conf/family_test.config @@ -1,6 +1,5 @@ params { families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file.tsv' - hpo_terms_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/hpo_terms_file.txt' prioritisers = 'hiPhivePrioritiser' exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' diff --git a/conf/multi_hpo_test.config b/conf/multi_hpo_test.config new file mode 100644 index 0000000..38e67c3 --- /dev/null +++ b/conf/multi_hpo_test.config @@ -0,0 +1,7 @@ +params { + families_file = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/fam_file_multi_hpo.tsv' + prioritisers = 'hiPhivePrioritiser' + exomiser_data = "s3://lifebit-featured-datasets/pipelines/exomiser-data-bundle" + application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties' + auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml' +} diff --git a/nextflow.config b/nextflow.config index 6d24337..abb64f0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -68,6 +68,7 @@ profiles { standard { includeConfig params.config } test_family { includeConfig 'conf/family_test.config' } test_single_vcf { includeConfig 'conf/single_vcf_test.config' } + test_multi_hpo { includeConfig 'conf/multi_hpo_test.config' } awsbatch { includeConfig 'conf/executors/awsbatch.config' } eu_west_1 { includeConfig 'conf/cloud-region/eu_west_1.config' } eu_west_2 { includeConfig 
'conf/cloud-region/eu_west_2.config' } diff --git a/testdata/fam_file_multi_hpo.tsv b/testdata/fam_file_multi_hpo.tsv index 88e86b1..6287f81 100644 --- a/testdata/fam_file_multi_hpo.tsv +++ b/testdata/fam_file_multi_hpo.tsv @@ -1,2 +1,2 @@ run_id proband_id hpo vcf_path vcf_index_path proband_sex mother_id father_id -EX001 ERR3239334 HP:0001156; HP:0001363; HP:0011304 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file +EX001 ERR3239334 HP:0001156,HP:0001363,HP:0011304 s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz s3://lifebit-featured-datasets/pipelines/exomiser-nf/family_ERR3239334_ERR3989341_ERR3989342_small.vcf.gz.tbi M ERR3989342 ERR3989341 \ No newline at end of file From 0e617b613214ef0f08bb1f8e33d561bc04d20ef7 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Mon, 6 Nov 2023 14:49:48 +0100 Subject: [PATCH 94/98] fixed README --- .gitignore | 1 + README.md | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 4fc773e..1410098 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ nextflow_schema.json *fq **/2102_* exo +*.mt/* // Ignore node modules bin/node_modules/ diff --git a/README.md b/README.md index 5f22775..96ad979 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,14 @@ The main input of the pipeline (`families_file`) is a TSV file and the main outp This is a TSV file that contains the following info tab separated | run_id | proband_id | hpo | vcf_path | vcf_index_path | proband_sex | mother_id | father_id | -| :----: | :--------: | :-: | :------: | :------------: | :---------: | :-------: | :-------: | --- | -| | | | | | | | | | +| :----: | :--------: | :-: | :------: | :------------: | :---------: | :-------: | :-------: | +| | | | | | | | | The vcf_path column can contain the path to either a multiVCF(trio) or a single-sample VCF. In the case of a single-sample VCF, the last 2 columns must contain `nan` as a value. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/fam_file.tsv) +In the hpo column, multiple comma-separated HPO terms can be present. + ### --application_properties This is a file needed by exomiser to run. It contains information on where to find the reference data as well as the versioning of the reference genome. An example can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-nf/application.properties) @@ -69,7 +71,7 @@ There are other parameters that can be tweaked to personalize the behaviour of t Here is the list of steps performed by this pipeline. 1. `process ped_hpo_creation` - this process produces the pedigree (PED) file needed for exomiser to run using a python script. -2. `process 2` - this process is where the autoconfig file for exomiser is generated and exomiser is run. +2. `process exomiser` - this process is where the autoconfig file for exomiser is generated and exomiser is run. 
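
The exact script of `process exomiser` is not shown in these patches, but conceptually it fills the placeholders in the auto config template and then calls the Exomiser CLI. A rough, hypothetical sketch is shown below; the placeholder names are taken from `bin/auto_config_V2.yml`, the sample file names are made up, and the real process also substitutes the pathogenicity/output placeholders and points Exomiser at the staged `application.properties` and reference bundle.

```bash
# Hypothetical illustration of the placeholder substitution performed before
# running Exomiser; not the literal process script.
sed -e 's|vcf_placeholder|SAMPLE01.vcf.gz|' \
    -e 's|ped_placeholder|SAMPLE01.ped|' \
    -e 's|proband_placeholder|SAMPLE01|' \
    -e "s|hpo_ids_placeholder|'HP:0001156'|" \
    auto_config_V2.yml > analysis.yml
# Jar path and memory flags depend on the container/installation.
java -jar exomiser-cli-12.1.0.jar --analysis analysis.yml
```
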
### Output From fe46b3f9df5d8f3748c10c0dacd36d5f41d1d468 Mon Sep 17 00:00:00 2001 From: Leila Mansouri Date: Tue, 7 Nov 2023 10:18:18 +0100 Subject: [PATCH 95/98] fix typo --- bin/auto_config_V2.yml | 138 +++++++++++++++++++++++++++++++++++++++++ nextflow.config | 2 +- 2 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 bin/auto_config_V2.yml diff --git a/bin/auto_config_V2.yml b/bin/auto_config_V2.yml new file mode 100644 index 0000000..9e72def --- /dev/null +++ b/bin/auto_config_V2.yml @@ -0,0 +1,138 @@ +## Exomiser Analysis Template. +# These are all the possible options for running exomiser. Use this as a template for +# your own set-up. +--- +analysis: + # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt. + genomeAssembly: hg38 + vcf: vcf_placeholder + ped: ped_placeholder + proband: proband_placeholder + hpoIds: [hpo_ids_placeholder] + # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an + # allele to be considered as a causative candidate under that mode of inheritance. + # If you just want to analyse a sample under a single inheritance mode, delete/comment-out the others. For AUTOSOMAL_RECESSIVE + # or X_RECESSIVE ensure *both* relevant HOM_ALT and COMP_HET modes are present. + # In cases where you do not want any cut-offs applied an empty map should be used e.g. inheritanceModes: {} + inheritanceModes: { + AUTOSOMAL_DOMINANT: 0.1, + AUTOSOMAL_RECESSIVE_HOM_ALT: 0.1, + AUTOSOMAL_RECESSIVE_COMP_HET: 2.0, + X_DOMINANT: 0.1, + X_RECESSIVE_HOM_ALT: 0.1, + X_RECESSIVE_COMP_HET: 2.0, + MITOCHONDRIAL: 0.2 + } + #FULL or PASS_ONLY + analysisMode: PASS_ONLY + #Possible frequencySources: + #Thousand Genomes project - http://www.1000genomes.org/ (THOUSAND_GENOMES) + #TOPMed - https://www.nhlbi.nih.gov/science/precision-medicine-activities (TOPMED) + #UK10K - http://www.uk10k.org/ (UK10K) + #ESP project - http://evs.gs.washington.edu/EVS/ (ESP_) + # ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL, + #ExAC project http://exac.broadinstitute.org/about (EXAC_) + # EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN, + # EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN, + # EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN, + # EXAC_OTHER + #gnomAD - http://gnomad.broadinstitute.org/ (GNOMAD_E, GNOMAD_G) + frequencySources: [ + THOUSAND_GENOMES, + TOPMED, + UK10K, + + ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL, + + EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN, + EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN, + EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN, + EXAC_OTHER, + + GNOMAD_E_AFR, + GNOMAD_E_AMR, +# GNOMAD_E_ASJ, + GNOMAD_E_EAS, + GNOMAD_E_FIN, + GNOMAD_E_NFE, + GNOMAD_E_OTH, + GNOMAD_E_SAS, + + GNOMAD_G_AFR, + GNOMAD_G_AMR, +# GNOMAD_G_ASJ, + GNOMAD_G_EAS, + GNOMAD_G_FIN, + GNOMAD_G_NFE, + GNOMAD_G_OTH, + GNOMAD_G_SAS + ] + #Possible pathogenicitySources: POLYPHEN, MUTATION_TASTER, SIFT, CADD, REMM + #REMM is trained on non-coding regulatory regions + #*WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files + #and updated their location in the application.properties. Exomiser will not run without this. + pathogenicitySources: [pathogenicity_sources_placeholder] + #this is the recommended order for a genome-sized analysis. 
+ #all steps are optional + steps: [ + #intervalFilter: {interval: 'chr10:123256200-123256300'}, + # or for multiple intervals: + #intervalFilter: {intervals: ['chr10:123256200-123256300', 'chr10:123256290-123256350']}, + # or using a BED file - NOTE this should be 0-based, Exomiser otherwise uses 1-based coordinates in line with VCF + #intervalFilter: {bed: /full/path/to/bed_file.bed}, + failedVariantFilter: {}, + #genePanelFilter: {geneSymbols: ['FGFR1','FGFR2']}, + ##################################################################################### + #hiPhivePrioritiser: {}, + #running the prioritiser followed by a priorityScoreFilter will remove genes + #which are least likely to contribute to the phenotype defined in hpoIds, this will + #dramatically reduce the time and memory required to analyse a genome. + # 0.501 is a good compromise to select good phenotype matches and the best protein-protein interactions hits from hiPhive + #priorityScoreFilter: {priorityType: HIPHIVE_PRIORITY, minPriorityScore: 0.501}, + ###################################################################################### + #variantEffectFilter: {remove: [SYNONYMOUS_VARIANT]}, + #regulatoryFeatureFilter removes all non-regulatory non-coding variants over 20Kb from a known gene. + #regulatoryFeatureFilter: {}, + #knownVariantFilter: {}, #removes variants represented in the database + variantEffectFilter: { + remove: [ + FIVE_PRIME_UTR_EXON_VARIANT, + FIVE_PRIME_UTR_INTRON_VARIANT, + THREE_PRIME_UTR_EXON_VARIANT, + THREE_PRIME_UTR_INTRON_VARIANT, + NON_CODING_TRANSCRIPT_EXON_VARIANT, + NON_CODING_TRANSCRIPT_INTRON_VARIANT, + CODING_TRANSCRIPT_INTRON_VARIANT, + UPSTREAM_GENE_VARIANT, + DOWNSTREAM_GENE_VARIANT, + INTERGENIC_VARIANT, + REGULATORY_REGION_VARIANT + ] + }, + frequencyFilter: {maxFrequency: 2.0}, + pathogenicityFilter: {keepNonPathogenic: keep_non_pathogenic_placeholder}, + #inheritanceFilter and omimPrioritiser should always run AFTER all other filters have completed + #they will analyse genes according to the specified modeOfInheritance above- UNDEFINED will not be analysed. + inheritanceFilter: {}, + #omimPrioritiser isn't mandatory. + omimPrioritiser: {}, + #Other prioritisers: Only combine omimPrioritiser with one of these. + #Don't include any if you only want to filter the variants. + hiPhivePrioritiser: {} + # or run hiPhive in benchmarking mode: + #hiPhivePrioritiser: {diseaseId: 'OMIM:101600', candidateGeneSymbol: FGFR2, runParams: 'human,mouse,fish,ppi'}, + #phenixPrioritiser: {} + #exomeWalkerPrioritiser: {seedGeneIds: [11111, 22222, 33333]} + #prioritiser_placeholder : {} + ] +outputOptions: + outputContributingVariantsOnly: false + #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results + numGenes: 0 + #outputPrefix options: specify the path/filename without an extension and this will be added + # according to the outputFormats option. If unspecified this will default to the following: + # {exomiserDir}/results/input-vcf-name-exomiser-results.html + # alternatively, specify a fully qualifed path only. e.g. 
diff --git a/nextflow.config b/nextflow.config
index abb64f0..acb2791 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -24,7 +24,7 @@ params {
   cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs'
   phenix_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/phenix'
   application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties'
-  auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config.yml'
+  auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config_v2.yml'
   hpo_terms_file = false
   modes_of_inheritance = 'AUTOSOMAL_DOMINANT,AUTOSOMAL_RECESSIVE,X_RECESSIVE,UNDEFINED'
   prioritisers = 'hiPhivePrioritiser,phivePrioritiser,phenixPrioritiser'

From 88b3466d70de27edbb7930a8e74f377764c39df6 Mon Sep 17 00:00:00 2001
From: Leila Mansouri
Date: Fri, 17 Nov 2023 11:32:14 +0100
Subject: [PATCH 96/98] changed configs for template standardization

---
 conf/containers/dockerhub.config            |  3 ++
 conf/containers/ecr.config                  |  5 ++
 conf/containers/quay.config                 |  4 +-
 conf/customised_pipeline_resources.config   | 37 ++++++++++++++
 conf/data/data.config                       |  6 +++
 conf/executors/singularity.config           | 20 ++++++++
 conf/{data => tests/ci}/ci_test_data.config |  0
 .../test_full_family.config}                |  0
 .../test_full_multi_hpo.config}             |  0
 .../test_full_single_vcf.config}            |  0
 nextflow.config                             | 49 ++++++++++++++-----
 11 files changed, 109 insertions(+), 15 deletions(-)
 create mode 100644 conf/containers/dockerhub.config
 create mode 100644 conf/containers/ecr.config
 create mode 100644 conf/customised_pipeline_resources.config
 create mode 100644 conf/executors/singularity.config
 rename conf/{data => tests/ci}/ci_test_data.config (100%)
 rename conf/{family_test.config => tests/test_full_family.config} (100%)
 rename conf/{multi_hpo_test.config => tests/test_full_multi_hpo.config} (100%)
 rename conf/{single_vcf_test.config => tests/test_full_single_vcf.config} (100%)

diff --git a/conf/containers/dockerhub.config b/conf/containers/dockerhub.config
new file mode 100644
index 0000000..d447553
--- /dev/null
+++ b/conf/containers/dockerhub.config
@@ -0,0 +1,3 @@
+// params {
+//     main_container = 'dockerhub.io/lifebitaiorg/report:latest'
+// }
diff --git a/conf/containers/ecr.config b/conf/containers/ecr.config
new file mode 100644
index 0000000..c2d7d00
--- /dev/null
+++ b/conf/containers/ecr.config
@@ -0,0 +1,5 @@
+// params {
+//     main_container = 'https://${params.aws_account_id}.dkr.${params.aws_region}.amazonaws.com/lifebitaiorg/report:latest'
+// }
+// ECR pattern:
+// https://aws_account_id.dkr.ecr.region.amazonaws.com/lifebitaiorg/tool:version
diff --git a/conf/containers/quay.config b/conf/containers/quay.config
index b1601bc..d7a13ee 100644
--- a/conf/containers/quay.config
+++ b/conf/containers/quay.config
@@ -1,3 +1,3 @@
 params {
-  main_container = 'quay.io/lifebitai/exomiser:12.1.0'
-}
\ No newline at end of file
+  main_container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}"
+}
\ No newline at end of file
diff --git a/conf/customised_pipeline_resources.config b/conf/customised_pipeline_resources.config
new file mode 100644
index 0000000..3ee5858
--- /dev/null
+++ b/conf/customised_pipeline_resources.config
@@ -0,0 +1,37 @@
+params {
+
+    // process resources default
+    cpus = 1
+    memory = 2.GB
+    time = 8.h // do not change
+
+    // max resources limits defaults
+    max_cpus = 8
+    max_memory = 60.GB
+    max_time = 300.h // do not change
+
+    // process_micro defaults
+    micro_memory = 2.GB
+    micro_cpus = 1
+
+    // process_small defaults
+    small_memory = 4.GB
+    small_cpus = 2
+
+    // process_medium defaults
+    medium_memory = 6.GB
+    medium_cpus = 4
+
+    // process_large defaults
+    large_memory = 15.GB
+    large_cpus = 4
+
+    // other parameters
+    echo = false
+    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'terminate' }
+    maxErrors = -1
+    maxRetries = 3
+    maxForks = 200
+    queueSize = 200
+
+}
diff --git a/conf/data/data.config b/conf/data/data.config
index 4cb134e..fa26105 100644
--- a/conf/data/data.config
+++ b/conf/data/data.config
@@ -1 +1,7 @@
 // If there is any data that needs to be included in the config, it should be placed here using "${params.reference_data_bucket}/path/to/data"
+params {
+    exomiser_data = "${params.reference_data_bucket}/pipelines/exomiser-data-bundle"
+    application_properties = "${params.reference_data_bucket}/pipelines/exomiser-nf/application.properties"
+    auto_config_yml = "${params.reference_data_bucket}/pipelines/exomiser-nf/auto_config_v2.yml"
+
+}
\ No newline at end of file
diff --git a/conf/executors/singularity.config b/conf/executors/singularity.config
new file mode 100644
index 0000000..0141add
--- /dev/null
+++ b/conf/executors/singularity.config
@@ -0,0 +1,20 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running pipeline with Singularity locally
+ * -------------------------------------------------
+ * Base config needed for running with -profile singularity
+ */
+
+params {
+  singularity_cache = "local_singularity_cache"
+}
+
+singularity {
+  enabled = true
+  cacheDir = params.singularity_cache
+  autoMounts = true
+}
+
+docker {
+  enabled = false
+}
\ No newline at end of file
diff --git a/conf/data/ci_test_data.config b/conf/tests/ci/ci_test_data.config
similarity index 100%
rename from conf/data/ci_test_data.config
rename to conf/tests/ci/ci_test_data.config
diff --git a/conf/family_test.config b/conf/tests/test_full_family.config
similarity index 100%
rename from conf/family_test.config
rename to conf/tests/test_full_family.config
diff --git a/conf/multi_hpo_test.config b/conf/tests/test_full_multi_hpo.config
similarity index 100%
rename from conf/multi_hpo_test.config
rename to conf/tests/test_full_multi_hpo.config
diff --git a/conf/single_vcf_test.config b/conf/tests/test_full_single_vcf.config
similarity index 100%
rename from conf/single_vcf_test.config
rename to conf/tests/test_full_single_vcf.config
diff --git a/nextflow.config b/nextflow.config
index acb2791..3f24384 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -2,12 +2,16 @@ manifest {
   name = 'lifebit-ai/exomiser-nf'
   description = 'A pipeline to perform variant prioritisation'
   mainScript = 'main.nf'
-  version = 'v1.0'
+  version = 'v2.0'
 }
 
+includeConfig 'conf/customised_pipeline_resources.config'
+
 docker.enabled = true
 
 params {
+  raci_owner = "Lifebit"
+
   // Exomiser specific parameters
   reference_data_bucket = "s3://lifebit-featured-datasets"
   bucket_pattern = "lifebit-featured-datasets"
@@ -23,8 +27,8 @@ params {
   exomiser_phenotype_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/2102_phenotype'
   cadd_snvs = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/cadd_snvs'
   phenix_data = 's3://lifebit-featured-datasets/pipelines/exomiser/very_fake/phenix'
-  application_properties = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/application.properties'
-  auto_config_yml = 's3://lifebit-featured-datasets/pipelines/exomiser-nf/auto_config_v2.yml'
+  application_properties = "${params.reference_data_bucket}/pipelines/exomiser-nf/application.properties"
+  auto_config_yml = "${params.reference_data_bucket}/pipelines/exomiser-nf/auto_config_v2.yml"
   hpo_terms_file = false
   modes_of_inheritance = 'AUTOSOMAL_DOMINANT,AUTOSOMAL_RECESSIVE,X_RECESSIVE,UNDEFINED'
   prioritisers = 'hiPhivePrioritiser,phivePrioritiser,phenixPrioritiser'
@@ -48,11 +52,16 @@ params {
   exomiser_container_tag = '12.1.0'
   cloudos_cli_container_tag = '0.0.2'
 
-  // awsbatch specific
-  aws_batch_process_queue = null
-  aws_batch_cli_path = '/home/ec2-user/miniconda/bin/aws'
-  aws_batch_fetch_instance_type = true
-  aws_region = 'ap-east-1'
+  queueSize = 200
+  executor = false
+
+  // AWS batch
+  aws_region = 'eu-west-1'
+  aws_batch_default_queue = "optimal-instance-1tb-ami-on-demand-queue"
+  aws_batch_cli_path = '/home/ec2-user/miniconda/bin/aws'
+  aws_batch_fetch_instance_type = true
+  aws_batch_max_parallel_transfers = 2
+  aws_batch_volumes = '/home/ec2-user/.aws:/root/.aws'
 
   //process resources
   memory = 6.GB
@@ -63,25 +72,33 @@ params {
   maxRetries = 3
 }
 
+includeConfig 'conf/containers/quay.config'
+includeConfig 'conf/data/data.config' // Loads in data
+
 profiles {
   standard { includeConfig params.config }
-  test_family { includeConfig 'conf/family_test.config' }
-  test_single_vcf { includeConfig 'conf/single_vcf_test.config' }
-  test_multi_hpo { includeConfig 'conf/multi_hpo_test.config' }
   awsbatch { includeConfig 'conf/executors/awsbatch.config' }
   eu_west_1 { includeConfig 'conf/cloud-region/eu_west_1.config' }
   eu_west_2 { includeConfig 'conf/cloud-region/eu_west_2.config' }
   test_full { includeConfig "conf/tests/full/test_full.config" }
-  ci_test_data { includeConfig "conf/data/ci_test_data.config" }
+  test_full_family { includeConfig 'conf/tests/test_full_family.config' }
+  test_full_single_vcf { includeConfig 'conf/tests/test_full_single_vcf.config' }
+  test_full_multi_hpo { includeConfig 'conf/tests/test_full_multi_hpo.config' }
+  ci_test_data { includeConfig "conf/tests/ci/ci_test_data.config" }
+  singularity { includeConfig 'conf/executors/singularity.config' }
+  dockerhub { includeConfig 'conf/containers/dockerhub.config' }
   quay { includeConfig 'conf/containers/quay.config' }
+  ecr { includeConfig 'conf/containers/ecr.config' }
 }
 
+includeConfig 'conf/resources.config'
+
 process {
   echo = params.echo
   errorStrategy = params.errorStrategy
   withName: exomiser {
-    container = "quay.io/lifebitai/exomiser:${params.exomiser_container_tag}"
+    container = params.main_container
     containerOptions = "--volume ${params.exomiser_data_directory}:/data/"
     memory = params.memory
     cpus = params.cpus
@@ -90,4 +107,10 @@ process {
     errorStrategy = params.errorStrategy
     maxRetries = params.maxRetries
   }
+}
+
+
+executor {
+  name = params.executor
+  queueSize = params.queueSize
 }
\ No newline at end of file
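The registry profiles added above (`dockerhub`, `ecr`, `quay`) and the `singularity` executor profile are meant to be composed with a test or data profile on the command line. As an illustration only (profile names as declared in `nextflow.config`; the container tag and cache path are arbitrary examples, not recommended values):

```
# Illustrative profile composition; adjust the values to your environment.
nextflow run main.nf \
  -profile test_full_family,singularity,quay \
  --exomiser_container_tag 12.1.0 \
  --singularity_cache /path/to/singularity_cache
```

Profiles are applied left to right, with later profiles overriding overlapping settings, which is why the container-registry profile is listed last here.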
From 84e1fbc48198b851f23d277a86f04827b8a7c28a Mon Sep 17 00:00:00 2001
From: Leila Mansouri
Date: Fri, 17 Nov 2023 11:35:17 +0100
Subject: [PATCH 97/98] moved test config files

---
 conf/tests/{ => full}/test_full_family.config     | 0
 conf/tests/{ => full}/test_full_multi_hpo.config  | 0
 conf/tests/{ => full}/test_full_single_vcf.config | 0
 nextflow.config                                   | 6 +++---
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename conf/tests/{ => full}/test_full_family.config (100%)
 rename conf/tests/{ => full}/test_full_multi_hpo.config (100%)
 rename conf/tests/{ => full}/test_full_single_vcf.config (100%)

diff --git a/conf/tests/test_full_family.config b/conf/tests/full/test_full_family.config
similarity index 100%
rename from conf/tests/test_full_family.config
rename to conf/tests/full/test_full_family.config
diff --git a/conf/tests/test_full_multi_hpo.config b/conf/tests/full/test_full_multi_hpo.config
similarity index 100%
rename from conf/tests/test_full_multi_hpo.config
rename to conf/tests/full/test_full_multi_hpo.config
diff --git a/conf/tests/test_full_single_vcf.config b/conf/tests/full/test_full_single_vcf.config
similarity index 100%
rename from conf/tests/test_full_single_vcf.config
rename to conf/tests/full/test_full_single_vcf.config
diff --git a/nextflow.config b/nextflow.config
index 3f24384..2629451 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -82,9 +82,9 @@ profiles {
   eu_west_1 { includeConfig 'conf/cloud-region/eu_west_1.config' }
   eu_west_2 { includeConfig 'conf/cloud-region/eu_west_2.config' }
   test_full { includeConfig "conf/tests/full/test_full.config" }
-  test_full_family { includeConfig 'conf/tests/test_full_family.config' }
-  test_full_single_vcf { includeConfig 'conf/tests/test_full_single_vcf.config' }
-  test_full_multi_hpo { includeConfig 'conf/tests/test_full_multi_hpo.config' }
+  test_full_family { includeConfig 'conf/tests/full/test_full_family.config' }
+  test_full_single_vcf { includeConfig 'conf/tests/full/test_full_single_vcf.config' }
+  test_full_multi_hpo { includeConfig 'conf/tests/full/test_full_multi_hpo.config' }
   ci_test_data { includeConfig "conf/tests/ci/ci_test_data.config" }
   singularity { includeConfig 'conf/executors/singularity.config' }
   dockerhub { includeConfig 'conf/containers/dockerhub.config' }

From e2db14b2b029f7cdca66d9442591380f1f3a549e Mon Sep 17 00:00:00 2001
From: Leila Mansouri
Date: Fri, 17 Nov 2023 11:44:36 +0100
Subject: [PATCH 98/98] changed README to reflect changes

---
 README.md | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 96ad979..5e7e7b6 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ This is a file needed by exomiser to run. It contains placeholders in the text t
 
 ### --exomiser_data
 
-This path refers to the reference data bundle needed by exomiser (~120 GB!). A copy of such files can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-data-bundle/) . The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (i.e. cloud, local storage, ftp, ...) and Nextlfow will automatically take care of fetching the data without having to add anything to the pipeline itself.
+This path refers to the reference data bundle needed by exomiser (~120 GB!). A copy of such files can be found [here](https://lifebit-featured-datasets.s3.eu-west-1.amazonaws.com/pipelines/exomiser-data-bundle/). The reference dataset has been added as a parameter, allowing flexibility to pull the data from any resource (e.g. cloud, local storage, FTP, ...), and Nextflow will automatically take care of fetching the data without having to add anything to the pipeline itself.
 
 There are other parameters that can be tweaked to personalize the behaviour of the pipeline.
 These are referenced in `nextflow.config`
@@ -98,13 +98,19 @@ To run the pipeline with `docker` (used by default), type the following commands
 
 To test the pipeline on a multi-VCF:
 
 ```
-nextflow run main.nf -profile family_test
+nextflow run main.nf -profile test_full_family
 ```
+
+or
+
+```
+nextflow run main.nf -profile test_full_multi_hpo
+```
 
 To test the pipeline on a single-sample VCF:
 
 ```
-nextflow run main.nf -profile single_vcf_test
+nextflow run main.nf -profile test_full_single_vcf
 ```
 
 Be careful when running this, as the pipeline requires the staging of 120 GB of reference data, required by exomiser, so only that takes a while!
@@ -113,7 +119,8 @@ Be careful when running this, as the pipeline requires the staging of 120 GB of
 
 ### Profiles
 
-| profile name    | Run locally                                                            | Run on CloudOS | description                                                                      |
-| :-------------: | :--------------------------------------------------------------------: | :------------: | :------------------------------------------------------------------------------: |
-| test_family     | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful     | this test is designed to test the pipeline on a multi-VCF with trio information  |
-| test_single_vcf | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful     | this test is designed to test the pipeline on a single-sample-VCF                |
+| profile name         | Run locally                                                            | Run on CloudOS | description                                                                                               |
+| :------------------: | :--------------------------------------------------------------------: | :------------: | :-------------------------------------------------------------------------------------------------------: |
+| test_full_family     | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful     | this test is designed to test the pipeline on a multi-VCF with trio information                          |
+| test_full_single_vcf | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful     | this test is designed to test the pipeline on a single-sample-VCF                                        |
+| test_full_multi_hpo  | the data required is so big, it was tested on a c5.4xlarge EC2 machine | Successful     | this test is designed to test the pipeline on a multi-VCF with trio information using multiple HPO terms |
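Since the reference data locations now resolve from `params.reference_data_bucket` (see `conf/data/data.config`), a run against a copy of the bundle that you host yourself might look like the following. The bucket name and local path are placeholders, not values shipped with the pipeline:

```
# Illustrative only: replace the bucket and path with locations you control.
nextflow run main.nf \
  -profile test_full_single_vcf \
  --reference_data_bucket s3://my-reference-bucket \
  --exomiser_data /absolute/path/to/exomiser-data-bundle
```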