From 312c09ca3a482bfa428262a6a81e478544a7709c Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 29 Aug 2024 15:50:29 +0100 Subject: [PATCH 1/7] extend template preprocessing to subgraph_io sections (in subgraphs) when pruning, only remove ports that do not appear in splice edges (aka replacement edges) --- bin/vtfp.pl | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/bin/vtfp.pl b/bin/vtfp.pl index d7c7ea44d..914e8d763 100755 --- a/bin/vtfp.pl +++ b/bin/vtfp.pl @@ -451,6 +451,12 @@ sub apply_subst { $ewi->{removelabel}->(); } + if($cfg->{subgraph_io}) { + $ewi->{addlabel}->(q{subgraph_io}); + $cfg->{subgraph_io} = subst_walk($cfg->{subgraph_io}, $params, $ewi); + $ewi->{removelabel}->(); + } + return; } @@ -1412,9 +1418,9 @@ sub validate_splice_candidates { } } - # all edge termini must be unique (over replacement and pruning edges) except for STDIN/STDOUT + # all edge termini must be unique (over replacement edges) except for STDIN/STDOUT my %endpoints; - for my $edge (@{$splice_candidates->{replacement_edges}}, @{$prune_edges}) { + for my $edge (@{$splice_candidates->{replacement_edges}}) { my $from_end = $edge->{from}; if($from_end and $from_end !~ /:/) { $from_end .= q[:STDOUT] }; @@ -1472,15 +1478,33 @@ sub final_splice { # add new edges push @{$flat_graph->{edges}}, @{$splice_candidates->{replacement_edges}}; - # remove pruned ports - prune edges are not required to be two-ended; just disregard undefined to/from attributes + # remove pruned ports - prune edges are not required to be two-ended; just disregard undefined to/from attributes; only remove ports + # that do not appear in splice edges (aka replacement edges) for my $prune_edge (@{$splice_candidates->{prune_edges}}) { - if($prune_edge->{from}) { remove_port($prune_edge->{from}, $SRC, $flat_graph); } - if($prune_edge->{to}) { remove_port($prune_edge->{to}, $DST, $flat_graph); } + if($prune_edge->{from} and not 
_in_replacement_edges($prune_edge->{from}, $splice_candidates, $SRC)) { remove_port($prune_edge->{from}, $SRC, $flat_graph); } + if($prune_edge->{to} and not _in_replacement_edges($prune_edge->{to}, $splice_candidates, $DST)) { remove_port($prune_edge->{to}, $DST, $flat_graph); } } return $flat_graph; } +sub _in_replacement_edges { + my ($port_spec, $splice_candidates, $type) = @_; + # returns 1 if $port_spec is the "from" end ($type is $SRC) or the "to" end (otherwise) of any replacement edge, else 0 + my $direction = ($type == $SRC)? q[from]: q[to]; + my $std_port = ($type == $SRC)? q[STDOUT]: q[STDIN]; + # a "from" end without an explicit port means STDOUT (as in validate_splice_candidates), so a "to" end means STDIN + for my $edge (@{$splice_candidates->{replacement_edges}}) { + my $end = $edge->{$direction}; + if($end and $end !~ /:/) { $end .= qq[:$std_port] }; + # an edge with no value at this end cannot match; skip it rather than compare undef with eq + if($end and $end eq $port_spec) { return 1; } + } + + return 0; + +} + ################################################################################################ # resolve_ports: # given a splice_pair specification, fully determine the [set of] source and destination ports From f79df226dbba8192e76d1bd27d63bfcbfeb42e6b Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 29 Aug 2024 15:57:21 +0100 Subject: [PATCH 2/7] allow specification of input processing for stage2 analysis --- .../vtlib/alignment_wtsi_stage2_template.json | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/data/vtlib/alignment_wtsi_stage2_template.json b/data/vtlib/alignment_wtsi_stage2_template.json index 306b4f649..bbb29e453 100644 --- a/data/vtlib/alignment_wtsi_stage2_template.json +++ b/data/vtlib/alignment_wtsi_stage2_template.json @@ -95,24 +95,20 @@ ], "nodes":[ { - "id":"crammerge", - "type":"EXEC", - "use_STDIN": false, - "use_STDOUT": true, - "cmd": [ - "samtools", - "merge", - "-n", - "-O", "BAM", - "-l", "0", - {"select":"s2_input_format", "default":"cram", "select_range":[1], "cases":{ - "cram":["--input-fmt-option", "no_ref=1"], - "bam":["--input-fmt", "bam"] + "id":"preprocess_inputs", + "type":"VTFILE", + "name":{"subst":"s2_preprocess_inputs_method", "required":true, + "ifnull":{ + "select":"s2_ppi_switch", 
"default":"crammerge","select_range":[1], + "cases":{ + "crammerge":"crammerge.json", + "s2_ppi":"stage2_preprocess_inputs.json" + } }}, - "-", - {"subst":"incrams"} - ], - "description":"merge individual cram files from a sample into one bam file" + "subst_map":{"input_format":{"subst":"s2_input_format"}}, + "comment":"inputs: NONE; outputs: _stdout_ (bam), subst_map_parameters:[input_format]", + "node_prefix":"ppi_", + "description":"subgraph to preprocess inputs. Default: merge individual cram files from a sample into one bam file" }, { "id":"spatial_filter", @@ -264,7 +260,7 @@ } ], "edges":[ - { "id":"src_to_bc2", "from":"crammerge", "to":{"subst":"post_cm","required":true} }, + { "id":"src_to_bc2", "from":"preprocess_inputs", "to":{"subst":"post_cm","required":true} }, {"select":"spatial_filter_switch", "required":true, "select_range":[1], "default":"on", "allow_unspec_keys":true, "cases":{ "on": [ From 4547e20ab243bbc840750f19ac3f5190f4d447b9 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 29 Aug 2024 15:59:33 +0100 Subject: [PATCH 3/7] allow selection of subsample methods (for QC) --- data/vtlib/subsample.json | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/data/vtlib/subsample.json b/data/vtlib/subsample.json index 271cf6ca2..b6f1ed928 100644 --- a/data/vtlib/subsample.json +++ b/data/vtlib/subsample.json @@ -36,22 +36,15 @@ }, { "id":"subsample", - "type":"EXEC", - "subtype":"STRINGIFY", - "use_STDIN": true, - "use_STDOUT": true, - "cmd":[ - "bash -c '", - {"subst_constructor":{"vals":["tmfs=\"", {"subst":"tag_metrics_files", "required":true}, "\""],"postproc":{"op":"concat","pad":""}}}, "; if [ ! 
-z \"${tmfs}\" ]; then for tag_metrics_file in ${tmfs}; do reads_count=`jq", {"subst":"jqkey", "ifnull":{"subst_constructor":{"vals":["'\"'\"'.reads_count.\"", {"subst":"s2_tag_index", "required":true}, "\"'\"'\"'"],"postproc":{"op":"concat","pad":""}}}}, "${tag_metrics_file}`; reads_count=`echo ${reads_count} | tr -cd [:digit:]`; reads_count_total=$((${reads_count_total}+${reads_count})); done; if [[ $reads_count_total -eq 0 ]]; then reads_count_total=1; fi; frac=`echo \"10000/${reads_count_total}\" | bc -l`; fi;", - "if [ ! -z $frac ]; then", - "samtools", - "view", - "-s", {"subst":"seed_frac", "required":true, "ifnull": {"subst_constructor":{"vals":[ {"subst":"subsample_seed", "ifnull":{"subst":"s2_id_run", "required":true}}, "${frac}" ],"postproc":{"op":"concat","pad":""}}}}, - "-b", - "-", - ";", - "else >&2 printf \"No tag metrics, no subsample\"; fi;'" - ] + "type":"VTFILE", + "name":{"subst":"s2_subsample_method", "required":true, + "ifnull":{ + "select":"s2_subsample_method_switch", "default":"tmf","select_range":[1], + "cases":{ + "tmf":"subsample_tmf.json", + "spec_frac":"subsample_spec_frac.json"} + }}, + "node_prefix":"ssm_" }, { "id":"bamtofastq_ss", From f76d0a982fc037ece98dc5fd03b079509b3f422b Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 29 Aug 2024 16:01:47 +0100 Subject: [PATCH 4/7] alternate subsample methods --- data/vtlib/subsample_spec_frac.json | 31 ++++++++++++++++++++++++ data/vtlib/subsample_tmf.json | 37 +++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 data/vtlib/subsample_spec_frac.json create mode 100644 data/vtlib/subsample_tmf.json diff --git a/data/vtlib/subsample_spec_frac.json b/data/vtlib/subsample_spec_frac.json new file mode 100644 index 000000000..838a763aa --- /dev/null +++ b/data/vtlib/subsample_spec_frac.json @@ -0,0 +1,31 @@ +{ +"version":"2.0", +"description":"produce 10k subsample fastq files", +"subgraph_io":{ + "ports":{ + "inputs":{ + "_stdin_":"subsample" + }, + "outputs":{ 
+ "_stdout_":"subsample" + } + } +}, +"subst_params":[], +"nodes":[ + { + "id":"subsample", + "type":"EXEC", + "use_STDIN": true, + "use_STDOUT": true, + "cmd":[ + "samtools", + "view", + "-s", {"subst":"seed_frac", "required":true, "ifnull": {"subst_constructor":{"vals":[ {"subst":"subsample_seed", "ifnull":{"subst":"s2_id_run", "required":true}}, {"subst":"ss_frac", "required":true}],"postproc":{"op":"concat","pad":"."}}}}, + "-b", + "-" + ] + } +], +"edges":[] +} diff --git a/data/vtlib/subsample_tmf.json b/data/vtlib/subsample_tmf.json new file mode 100644 index 000000000..5a18bf928 --- /dev/null +++ b/data/vtlib/subsample_tmf.json @@ -0,0 +1,37 @@ +{ +"version":"2.0", +"description":"produce 10k subsample fastq files", +"subgraph_io":{ + "ports":{ + "inputs":{ + "_stdin_":"subsample" + }, + "outputs":{ + "_stdout_":"subsample" + } + } +}, +"subst_params":[], +"nodes":[ + { + "id":"subsample", + "type":"EXEC", + "subtype":"STRINGIFY", + "use_STDIN": true, + "use_STDOUT": true, + "cmd":[ + "bash -c '", + {"subst_constructor":{"vals":["tmfs=\"", {"subst":"tag_metrics_files", "required":true}, "\""],"postproc":{"op":"concat","pad":""}}}, "; if [ ! -z \"${tmfs}\" ]; then for tag_metrics_file in ${tmfs}; do reads_count=`jq", {"subst":"jqkey", "ifnull":{"subst_constructor":{"vals":["'\"'\"'.reads_count.\"", {"subst":"s2_tag_index", "required":true}, "\"'\"'\"'"],"postproc":{"op":"concat","pad":""}}}}, "${tag_metrics_file}`; reads_count=`echo ${reads_count} | tr -cd [:digit:]`; reads_count_total=$((${reads_count_total}+${reads_count})); done; if [[ $reads_count_total -eq 0 ]]; then reads_count_total=1; fi; frac=`echo \"10000/${reads_count_total}\" | bc -l`; fi;", + "if [ ! 
-z $frac ]; then", + "samtools", + "view", + "-s", {"subst":"seed_frac", "required":true, "ifnull": {"subst_constructor":{"vals":[ {"subst":"subsample_seed", "ifnull":{"subst":"s2_id_run", "required":true}}, "${frac}" ],"postproc":{"op":"concat","pad":""}}}}, + "-b", + "-", + ";", + "else >&2 printf \"No tag metrics, no subsample\"; fi;'" + ] + } +], +"edges":[] +} From f3405d393583f27b7b6bcdd56219c172e61ef45b Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 29 Aug 2024 16:03:02 +0100 Subject: [PATCH 5/7] templates to support alternate input processing methods --- data/vtlib/crammerge.json | 37 +++++++++++ data/vtlib/read2tags.json | 54 ++++++++++++++++ data/vtlib/stage2_preprocess_inputs.json | 81 ++++++++++++++++++++++++ 3 files changed, 172 insertions(+) create mode 100644 data/vtlib/crammerge.json create mode 100644 data/vtlib/read2tags.json create mode 100644 data/vtlib/stage2_preprocess_inputs.json diff --git a/data/vtlib/crammerge.json b/data/vtlib/crammerge.json new file mode 100644 index 000000000..de8333488 --- /dev/null +++ b/data/vtlib/crammerge.json @@ -0,0 +1,37 @@ +{ +"version":"2.0", +"description":"merge individual cram files from a sample into one bam file", +"subgraph_io":{ + "ports":{ + "inputs":{ + }, + "outputs":{ + "_stdout_":"crammerge" + } + } +}, +"subst_params":[], +"nodes":[ + { + "id":"crammerge", + "type":"EXEC", + "use_STDIN": false, + "use_STDOUT": true, + "cmd": [ + "samtools", + "merge", + "-n", + "-O", "BAM", + "-l", "0", + {"select":"input_format", "default":"cram", "select_range":[1], "cases":{ + "cram":["--input-fmt-option", "no_ref=1"], + "bam":["--input-fmt", "bam"] + }}, + "-", + {"subst":"incrams", "required":true} + ], + "description":"merge individual cram files from a sample into one bam file" + } +], +"edges":[] +} diff --git a/data/vtlib/read2tags.json b/data/vtlib/read2tags.json new file mode 100644 index 000000000..58424c491 --- /dev/null +++ b/data/vtlib/read2tags.json @@ -0,0 +1,54 @@ +{ +"version":"2.0", 
+"description":"read2tags for NanoSeq processing, including preparatory collation and reset", +"subgraph_io":{ + "ports":{ + "inputs":{"_stdin_":"collate"}, + "outputs":{ "_stdout_":"read2tags" } + } +}, +"nodes":[ + { + "id":"collate", + "type": "EXEC", + "use_STDIN": true, + "use_STDOUT": true, + "cmd": [ + {"subst":"samtools_executable", "required":true, "ifnull":"samtools"}, "collate", + "--threads", {"subst":"s2_r2t_coll_threads","required":true,"ifnull":2}, + "-u", + "-O", + "-" + ] + }, + { + "id":"reset", + "type": "EXEC", + "use_STDIN": true, + "use_STDOUT": true, + "cmd": [ + {"subst":"samtools_executable", "required":true, "ifnull":"samtools"}, "reset", + "--threads", {"subst":"s2_r2t_rs_threads","required":true,"ifnull":4}, + "--output-fmt", "BAM,level=0" + ] + }, + { + "id":"read2tags", + "type": "EXEC", + "use_STDIN": true, + "use_STDOUT": true, + "cmd": [ + {"subst":"bambi_executable", "required":true, "ifnull":"bambi"}, "read2tags", + "--tags", "rb,mb,br,rb,mb,br", + "--qtags", "rq,mq,bq,rq,mq,bq", + "--positions", "1:1:1:3,1:2:1:3,1:1:4:7,2:2:1:3,2:1:1:3,2:2:4:7", + "--compression-level", 0, + "--output-fmt", "bam" + ] + } +], +"edges":[ + { "id":"collate_to_reset", "from":"collate","to":"reset" }, + { "id":"reset_to_read2tags", "from":"reset", "to":"read2tags" } +] +} diff --git a/data/vtlib/stage2_preprocess_inputs.json b/data/vtlib/stage2_preprocess_inputs.json new file mode 100644 index 000000000..32039b93e --- /dev/null +++ b/data/vtlib/stage2_preprocess_inputs.json @@ -0,0 +1,81 @@ +{ +"version":"2.0", +"description":"alternate pre-processing method for stage2 inputs accepting FASTQ input (for e.g. 
Elembio NanoSeq)", +"subgraph_io":{ + "ports":{ + "inputs":{}, + "outputs":{ "_stdout_": + {"select":"pp_read2tags", "required":true, "default":"off", + "cases":{ + "off": "import", + "on": "read2tags" + } + } + } + } +}, +"nodes":[ + { + "id":"import", + "type":"EXEC", + "use_STDIN": false, + "use_STDOUT": true, + "cmd": { + "select":"pp_import_method", + "required":true, + "select_range":[1], + "default":"crammerge", + "cases":{ + "crammerge": + [ + {"subst":"samtools_executable", "required":true, "ifnull":"samtools"}, "merge", + "-n", + "-O", "BAM", + "-l", "0", + {"select":"input_format", "default":"cram", "select_range":[1], "cases":{ + "cram":["--input-fmt-option", "no_ref=1"], + "bam":["--input-fmt", "bam"] + }}, + "-", + {"subst":"incrams", "required":true} + ], + "fastq": + [ + {"subst":"samtools_executable", "required":true, "ifnull":"samtools"}, "import", + "-R", {"subst":"fastq_s2_pi_RG_ID","required":true, "comment":"readgroup"}, + "-1", {"subst":"fastq_s2_pi_fq1","required":true, "comment":"FASTQ read 1"}, + "-2", {"subst":"fastq_s2_pi_fq2","required":true, "comment":"FASTQ read 2"}, + {"select":"parse_casava_id", "default":"on", "select_range":[1], "cases":{ "on":["-i"], "off":[] }}, + {"subst":"parse_import_tags_flag", "ifnull":["-T", {"subst":"parse_import_tags","required":true,"ifnull":"*"}]}, + "-u", + "-O", "bam" + ] + } + } + }, + { + "id":"read2tags", + "type":{ + "select":"pp_read2tags", + "required":true, + "select_range":[1], + "default":"off", + "cases":{ + "on":"VTFILE", + "off":"INACTIVE" + } + }, + "use_STDIN": true, + "use_STDOUT": true, + "name":"read2tags.json", + "node_prefix":"r2t_" + } +], +"edges":[ + {"select":"pp_read2tags", "required":true, "default":"off", "cases":{ + "off": [], + "on": [ { "id":"import_to_read2tags", "from":"import", "to":"read2tags" }] + } + } +] +} From 7379a7ab540e57cc57ad9058cb9b7891423ed76d Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 29 Aug 2024 16:04:29 +0100 Subject: [PATCH 6/7] static 
parameter files for stage2 reanalyses (initial set targeting NanoSeq) --- .../stage2_reanalysis/align_bwa_mem2.json | 13 +++++++ .../base_params_duplexseq_cram.json | 37 +++++++++++++++++++ .../base_params_duplexseq_fastq.json | 35 ++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 data/static_params/stage2_reanalysis/align_bwa_mem2.json create mode 100644 data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json create mode 100644 data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json diff --git a/data/static_params/stage2_reanalysis/align_bwa_mem2.json b/data/static_params/stage2_reanalysis/align_bwa_mem2.json new file mode 100644 index 000000000..477d97126 --- /dev/null +++ b/data/static_params/stage2_reanalysis/align_bwa_mem2.json @@ -0,0 +1,13 @@ +{ + "assign": [ + { + "alignment_method": "bwa_mem", + "bwa_executable": "bwa-mem2" + } + ], + "assign_local": {}, + "ops": { + "splice": [], + "prune": [] + } +} diff --git a/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json b/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json new file mode 100644 index 000000000..bafe27534 --- /dev/null +++ b/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json @@ -0,0 +1,37 @@ +{ + "assign": [ + { + "spatial_filter_switch":"off", + "markdup_optical_distance_value": "100", + "s2_se_pe": "pe", + "samtools_executable": "samtools", + "s2_input_format": "cram", + "markdup_method": "duplexseq", + "s2_ppi_switch":"s2_ppi", + "pp_read2tags":"on", + "pp_import_method":"crammerge", + "fastq_s2_pi_fq1": "DUMMY", + "fastq_s2_pi_fq2": "DUMMY", + "fastq_s2_pi_RG_ID": "DUMMY", + "s2_filter_files": "DUMMY", + "spatial_filter_file": "DUMMY", + "phix_reference_genome_fasta":"DUMMY", + "realignment_switch":1 + } + ], + "assign_local": {}, + "ops": { + "splice": [ + "aln_bam12auxmerge:-foptgt_000_fixmate:", + "foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk" + ], + "prune": [ + 
"foptgt.*_bmd_multiway:calibration_pu-", + "foptgt_cram_tee:c2a-", + "foptgt.*samtools_stats_F0.*_target.*-", + "foptgt.*samtools_stats_F0.*00_bait.*-", + "aln_tee3_tee3:to_phix_aln-scs_cmp_seqchksum:outputchk", + "ssfqc_tee_ssfqc:subsample-" + ] + } +} diff --git a/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json b/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json new file mode 100644 index 000000000..ab9f55385 --- /dev/null +++ b/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json @@ -0,0 +1,35 @@ +{ + "assign": [ + { + "spatial_filter_switch":"off", + "markdup_optical_distance_value": "100", + "s2_se_pe": "pe", + "samtools_executable": "samtools", + "s2_input_format": "cram", + "markdup_method": "duplexseq", + "s2_ppi_switch":"s2_ppi", + "pp_read2tags":"on", + "pp_import_method":"fastq", + "incrams": "DUMMY", + "s2_filter_files": "DUMMY", + "spatial_filter_file": "DUMMY", + "phix_reference_genome_fasta":"DUMMY", + "realignment_switch":1 + } + ], + "assign_local": {}, + "ops": { + "splice": [ + "aln_bam12auxmerge:-foptgt_000_fixmate:", + "foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk" + ], + "prune": [ + "foptgt.*_bmd_multiway:calibration_pu-", + "foptgt_cram_tee:c2a-", + "foptgt.*samtools_stats_F0.*_target.*-", + "foptgt.*samtools_stats_F0.*00_bait.*-", + "aln_tee3_tee3:to_phix_aln-scs_cmp_seqchksum:outputchk", + "ssfqc_tee_ssfqc:subsample-" + ] + } +} From bc8b2d243b3fcf5e54e69f49ec0c0ceb53b0c56c Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 5 Sep 2024 10:57:07 +0100 Subject: [PATCH 7/7] set realignment_switch:0,pp_read2tags:on as as default in base_params_duplexseq param files --- .../stage2_reanalysis/base_params_duplexseq_cram.json | 2 +- .../stage2_reanalysis/base_params_duplexseq_fastq.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json 
b/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json index bafe27534..ccb8c0d9d 100644 --- a/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json +++ b/data/static_params/stage2_reanalysis/base_params_duplexseq_cram.json @@ -16,7 +16,7 @@ "s2_filter_files": "DUMMY", "spatial_filter_file": "DUMMY", "phix_reference_genome_fasta":"DUMMY", - "realignment_switch":1 + "realignment_switch":0 } ], "assign_local": {}, diff --git a/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json b/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json index ab9f55385..763115bd2 100644 --- a/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json +++ b/data/static_params/stage2_reanalysis/base_params_duplexseq_fastq.json @@ -14,7 +14,7 @@ "s2_filter_files": "DUMMY", "spatial_filter_file": "DUMMY", "phix_reference_genome_fasta":"DUMMY", - "realignment_switch":1 + "realignment_switch":0 } ], "assign_local": {},