From 81015cea484a005425e69ff7825e7c1abb66fb81 Mon Sep 17 00:00:00 2001 From: "(major) john (major)" Date: Tue, 26 Aug 2025 03:20:12 -0700 Subject: [PATCH 01/13] Generate sample identifiers from manifest fields --- giab_30x_hg38_analysis_manifest.csv | 16 ++++----- workflow/rules/rule_common.smk | 33 +++++++++++++++++-- .../schemas/analysis_manifest.schema.yaml | 12 ++++--- 3 files changed, 46 insertions(+), 15 deletions(-) mode change 100755 => 100644 giab_30x_hg38_analysis_manifest.csv diff --git a/giab_30x_hg38_analysis_manifest.csv b/giab_30x_hg38_analysis_manifest.csv old mode 100755 new mode 100644 index e8ad19d..62b3ed6 --- a/giab_30x_hg38_analysis_manifest.csv +++ b/giab_30x_hg38_analysis_manifest.csv @@ -1,9 +1,9 @@ -samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 -R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 -R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 -R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 -R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 -R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 -R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 +SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer +D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 +D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 +D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 +D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 +D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 +D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 +D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 diff --git a/workflow/rules/rule_common.smk b/workflow/rules/rule_common.smk index ca65da2..f432898 100755 --- a/workflow/rules/rule_common.smk +++ b/workflow/rules/rule_common.smk @@ -241,9 +241,36 @@ os.system( ) # IMPORTANT: initialize the samples dataframe from the analysis_manifest.csv -samples = pd.read_table(analysis_manifest, ",").set_index( - ["sample", "sample_lane"], drop=False -) +samples = pd.read_table(analysis_manifest, ",") + +# Derive sample_lane and sample identifiers if they are not provided. +required_cols = {"RU", "EX", "SQ", "LANE"} +missing = required_cols - set(samples.columns) +if missing: + raise WorkflowError(f"Missing required columns in analysis manifest: {missing}") + +samples["LANE"] = samples["LANE"].astype(int) + +if "sample_lane" not in samples.columns: + samples["sample_lane"] = ( + samples["RU"].astype(str) + + "_" + + samples["EX"].astype(str) + + "_" + + samples["SQ"].astype(str) + + "_" + + samples["LANE"].astype(str) + ) + +if "sample" not in samples.columns: + samples["sample"] = samples.apply( + lambda r: f"{r['RU']}_{r['EX']}_{r['SQ']}_0" + if str(r.get("merge_single", "")).lower() == "merge" + else r["sample_lane"], + axis=1, + ) + +samples = samples.set_index(["sample", "sample_lane"], drop=False) # Ensure tum_nrm_sampleid_match exists and treat missing values as 'na' if "tum_nrm_sampleid_match" in samples.columns: diff --git a/workflow/schemas/analysis_manifest.schema.yaml b/workflow/schemas/analysis_manifest.schema.yaml index 6e2cfd9..b9c234d 100755 --- a/workflow/schemas/analysis_manifest.schema.yaml +++ b/workflow/schemas/analysis_manifest.schema.yaml @@ -5,10 +5,16 @@ description: Analysis Samples Manifest Validator. See README in config section properties: sample: type: string - description: format RU#_EX_SQ#_LANE where LANE leading zeros removed, and if merged lane =0 + description: > + Combination of RU, EX, SQ and lane used internally by the + workflow. If omitted, it will be generated automatically from + other fields. sample_lane: type: string - description: RU#_EX_SQ#_LANE with no lane substitution. Identifes the sub patterns to merge to create sample if merging is enabled. if merge_single=single then sample==sample_lane. if merge, then sample_lane is unique, sample is the aggregation pattern, which may be 1 to 1, but if set to merge should be 1(sample) pattern to >1(sample_lane) patterns.<--- this was implemented, then scaled back, and now is largely only really working for merging. + description: > + Fully qualified sample identifier including lane. When not + provided, it will be derived automatically from RU, EX, SQ and + LANE. SQ: type: string description: Sample Type @@ -58,8 +64,6 @@ properties: ont_cram_snv_caller: type: string required: - - sample - - sample_lane - SQ - RU - EX From 89a61f85aa0d7cec4f32e411c2ed177448fa0a33 Mon Sep 17 00:00:00 2001 From: AWS ParallelCluster user Date: Tue, 26 Aug 2025 10:30:04 +0000 Subject: [PATCH 02/13] X --- .../data/0.01x_3_wgs_HG002_hg38.samplesheet.csv | 6 +++--- .test_data/data/0.01xwgs_HG002_b37.samplesheet.csv | 2 +- .../data/0.01xwgs_HG002_hg38.samplesheet.csv | 2 +- .test_data/data/giab_30x_b37_analysis_manifest.csv | 14 +++++++------- .../data/giab_30x_hg38_analysis_manifest.csv | 14 +++++++------- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv b/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv index 918882f..94d4dd3 100755 --- a/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv +++ b/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv @@ -1,4 +1,4 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer,subsample_pct -RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19,0.95 -RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_2,RIH0_ANA0-HG002_DBC1_2,DBC2,RIH0,ANA0-HG002,2,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19,0.85 -RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_1,RIH0_ANA0-HG002_DBC1_1,DBC1,RIH0,ANA0-HG002,1,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19,0.75 +RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19,0.95 +RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_2,RIH0_ANA0-HG002_DBC1_2,DBC2,RIH0,ANA0-HG002,2,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19,0.85 +RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_1,RIH0_ANA0-HG002_DBC1_1,DBC1,RIH0,ANA0-HG002,1,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,na,HG002,HG002,NOVASEQ,PCR-FREE,19,0.75 diff --git a/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv b/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv index 72ba6c3..a65cb87 100755 --- a/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv +++ b/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv @@ -1,2 +1,2 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 +RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 diff --git a/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv b/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv index d96d481..8399c88 100755 --- a/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv +++ b/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv @@ -1,2 +1,2 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 +RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 diff --git a/.test_data/data/giab_30x_b37_analysis_manifest.csv b/.test_data/data/giab_30x_b37_analysis_manifest.csv index 62bb2b2..a5f9ec5 100755 --- a/.test_data/data/giab_30x_b37_analysis_manifest.csv +++ b/.test_data/data/giab_30x_b37_analysis_manifest.csv @@ -1,8 +1,8 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 -R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 -R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 -R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 -R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 -R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 -R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 +R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,na,HG001,NOVASEQ,PCR-FREE,19 +R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 +R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,na,HG003,NOVASEQ,PCR-FREE,19 +R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,na,HG004,NOVASEQ,PCR-FREE,19 +R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,na,HG005,NOVASEQ,PCR-FREE,19 +R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,na,HG006,NOVASEQ,PCR-FREE,19 +R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,na,HG007,NOVASEQ,PCR-FREE,19 diff --git a/.test_data/data/giab_30x_hg38_analysis_manifest.csv b/.test_data/data/giab_30x_hg38_analysis_manifest.csv index ddf4b7c..f2a41ce 100755 --- a/.test_data/data/giab_30x_hg38_analysis_manifest.csv +++ b/.test_data/data/giab_30x_hg38_analysis_manifest.csv @@ -1,8 +1,8 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 -R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 -R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 -R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 -R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 -R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 -R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 +R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,na,HG001,NOVASEQ,PCR-FREE,19 +R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 +R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,na,HG003,NOVASEQ,PCR-FREE,19 +R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,na,HG004,NOVASEQ,PCR-FREE,19 +R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,na,HG005,NOVASEQ,PCR-FREE,19 +R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,na,HG006,NOVASEQ,PCR-FREE,19 +R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,na,HG007,NOVASEQ,PCR-FREE,19 From d76bbee09454bbf842796db053d5d39df376cd74 Mon Sep 17 00:00:00 2001 From: AWS ParallelCluster user Date: Tue, 9 Sep 2025 04:24:07 +0000 Subject: [PATCH 03/13] X --- workflow/envs/vpot_v0.1.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/workflow/envs/vpot_v0.1.yaml b/workflow/envs/vpot_v0.1.yaml index e456502..378e504 100644 --- a/workflow/envs/vpot_v0.1.yaml +++ b/workflow/envs/vpot_v0.1.yaml @@ -1,11 +1,7 @@ channels: - conda-forge - bioconda - - anaconda - - r - - defaults dependencies: - - python + - python=3.9 + - numpy - pip - - pip: - - git+https://github.com/VCCRI/VPOT.git From 9acdf5550e1d7a2fbc3fc3150d79f5615cb24d01 Mon Sep 17 00:00:00 2001 From: "(major) john (major)" Date: Mon, 8 Sep 2025 21:32:23 -0700 Subject: [PATCH 04/13] Add strelka2 variant caller rules --- .../local/templates/rule_config.yaml | 9 +- .../slurm/templates/rule_config.yaml | 9 +- workflow/Snakefile | 1 + workflow/rules/strelka2.smk | 145 ++++++++++++++++++ 4 files changed, 162 insertions(+), 2 deletions(-) create mode 100644 workflow/rules/strelka2.smk diff --git a/config/day_profiles/local/templates/rule_config.yaml b/config/day_profiles/local/templates/rule_config.yaml index 6d9c26d..e69bff1 100755 --- a/config/day_profiles/local/templates/rule_config.yaml +++ b/config/day_profiles/local/templates/rule_config.yaml @@ -15,7 +15,7 @@ valid_biome: AWSPC # ###### Tool Selection CONFIG ---------------------------------------- -# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2'] sv_callers=['manta','tiddit','dysgu'] +# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2','strelka2'] sv_callers=['manta','tiddit','dysgu'] # and will override these defaults # Or you can set them here @@ -196,6 +196,13 @@ deepsomatic: numa: " OMP_NUM_THREADS=8 OMP_PROC_BIND=close OMP_PLACES=threads OMP_PROC_BIND=TRUE OMP_DYNAMIC=TRUE OMP_MAX_ACTIVE_LEVELS=1 OMP_SCHEDULE=dynamic OMP_WAIT_POLICY=ACTIVE " dvsom_conda: "../envs/vanilla_v0.1.yaml" +strelka2: + threads: 7 + container: "docker://mgibio/strelka-cwl:2.9.9" + partition: "i8" + mem_mb: 60000 + numa: " " + duphold: threads: 7 env_yaml: "../envs/duphold_v0.1.yaml" diff --git a/config/day_profiles/slurm/templates/rule_config.yaml b/config/day_profiles/slurm/templates/rule_config.yaml index 4f34c4b..b6e7373 100755 --- a/config/day_profiles/slurm/templates/rule_config.yaml +++ b/config/day_profiles/slurm/templates/rule_config.yaml @@ -14,7 +14,7 @@ valid_biome: AWSPC # ###### Tool Selection CONFIG ---------------------------------------- -# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2'] sv_callers=['manta','tiddit','dysgu'] +# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2','strelka2'] sv_callers=['manta','tiddit','dysgu'] # and will override these defaults @@ -195,6 +195,13 @@ deepsomatic: numa: " OMP_THREADS=64 OMP_PROC_BIND=close OMP_PLACES=threads OMP_DYNAMIC=true OMP_MAX_ACTIVE_LEVELS=1 OMP_SCHEDULE=dynamic OMP_WAIT_POLICY=ACTIVE " dvsom_conda: "../envs/vanilla_v0.1.yaml" +strelka2: + threads: 64 + container: "docker://mgibio/strelka-cwl:2.9.9" + partition: "i192,i128,i192mem" + mem_mb: 85000 + numa: " OMP_THREADS=64 OMP_PROC_BIND=close OMP_PLACES=threads OMP_DYNAMIC=true OMP_MAX_ACTIVE_LEVELS=1 OMP_SCHEDULE=dynamic OMP_WAIT_POLICY=ACTIVE " + duphold: diff --git a/workflow/Snakefile b/workflow/Snakefile index 49bbadb..fd048b7 100755 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -265,6 +265,7 @@ include: "rules/deepvariant_1_5.smk" include: "rules/deepvariant_1_9.smk" include: "rules/deepvariant_ug.smk" include: "rules/deepsomatic.smk" +include: "rules/strelka2.smk" include: "rules/doppel_mrkdups.smk" include: "rules/duphold.smk" include: "rules/dysgu_sv.smk" diff --git a/workflow/rules/strelka2.smk b/workflow/rules/strelka2.smk new file mode 100644 index 0000000..4204462 --- /dev/null +++ b/workflow/rules/strelka2.smk @@ -0,0 +1,145 @@ +import sys +import os + +##### strelka2 +# --------------------------- + +rule strelka2_germline: + input: + cram=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram", + crai=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", + ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"], + output: + vcfgz=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz", + vcftbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", + log: + MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.germline.log", + threads: config['strelka2']['threads'] + container: + config['strelka2']['container'] + resources: + vcpu=config['strelka2']['threads'], + threads=config['strelka2']['threads'], + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], + params: + run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.germline", + cluster_sample=ret_sample, + shell: + r""" + set -euo pipefail + mkdir -p {params.run_dir} + configureStrelkaGermlineWorkflow.py \ + --bam {input.cram} \ + --referenceFasta {input.ref_fa} \ + --runDir {params.run_dir} >> {log} 2>&1 + {params.run_dir}/runWorkflow.py -m local -j {threads} >> {log} 2>&1 + cp {params.run_dir}/results/variants/variants.vcf.gz {output.vcfgz} + cp {params.run_dir}/results/variants/variants.vcf.gz.tbi {output.vcftbi} + """ + + +rule strelka2_somatic: + wildcard_constraints: + sample=TUMORS_REGEX + input: + tumor_cram=get_somcall_tumor_cram, + tumor_crai=get_somcall_tumor_crai, + normal_cram=get_somcall_normal_cram, + normal_crai=get_somcall_normal_crai, + ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"], + output: + snv=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", + snvtbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", + indel=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", + indeltbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + log: + MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.somatic.log", + threads: config['strelka2']['threads'] + container: + config['strelka2']['container'] + resources: + vcpu=config['strelka2']['threads'], + threads=config['strelka2']['threads'], + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], + params: + run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.somatic", + cluster_sample=ret_sample, + shell: + r""" + set -euo pipefail + mkdir -p {params.run_dir} + configureStrelkaSomaticWorkflow.py \ + --tumorBam {input.tumor_cram} \ + --normalBam {input.normal_cram} \ + --referenceFasta {input.ref_fa} \ + --runDir {params.run_dir} >> {log} 2>&1 + {params.run_dir}/runWorkflow.py -m local -j {threads} >> {log} 2>&1 + cp {params.run_dir}/results/variants/somatic.snvs.vcf.gz {output.snv} + cp {params.run_dir}/results/variants/somatic.snvs.vcf.gz.tbi {output.snvtbi} + cp {params.run_dir}/results/variants/somatic.indels.vcf.gz {output.indel} + cp {params.run_dir}/results/variants/somatic.indels.vcf.gz.tbi {output.indeltbi} + """ + + +rule produce_strelka2_germline_vcf: + input: + vcftb=expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz", + sample=SSAMPS, + alnr=ALIGNERS, + ), + vcftbi=expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", + sample=SSAMPS, + alnr=ALIGNERS, + ), + output: + "gatheredall.strelka2.germline", + threads: 4 + log: + "gatheredall.strelka2.germline.log", + params: + cluster_sample=ret_sample, + resources: + vcpu=4, + threads=4, + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], + + +rule produce_strelka2_somatic_vcf: + input: + snv=expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + snvtbi=expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + indel=expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + indeltbi=expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + output: + "gatheredall.strelka2.somatic", + threads: 4 + log: + "gatheredall.strelka2.somatic.log", + params: + cluster_sample=ret_sample, + resources: + vcpu=4, + threads=4, + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], From 86a953e5657fa6ca95907568e46dcf9b133887e2 Mon Sep 17 00:00:00 2001 From: "(major) john (major)" Date: Mon, 8 Sep 2025 23:06:20 -0700 Subject: [PATCH 05/13] Add chromosome chunking for Strelka2 --- .../local/templates/rule_config.yaml | 3 + .../slurm/templates/rule_config.yaml | 3 + workflow/rules/rule_common.smk | 25 +++ workflow/rules/strelka2.smk | 170 ++++++++++++++++-- 4 files changed, 191 insertions(+), 10 deletions(-) diff --git a/config/day_profiles/local/templates/rule_config.yaml b/config/day_profiles/local/templates/rule_config.yaml index e69bff1..b16263e 100755 --- a/config/day_profiles/local/templates/rule_config.yaml +++ b/config/day_profiles/local/templates/rule_config.yaml @@ -202,6 +202,9 @@ strelka2: partition: "i8" mem_mb: 60000 numa: " " + hg38_strelka2_chrms: "21" + hg38_broad_strelka2_chrms: "21" + b37_strelka2_chrms: "19,21" duphold: threads: 7 diff --git a/config/day_profiles/slurm/templates/rule_config.yaml b/config/day_profiles/slurm/templates/rule_config.yaml index b6e7373..d5eb084 100755 --- a/config/day_profiles/slurm/templates/rule_config.yaml +++ b/config/day_profiles/slurm/templates/rule_config.yaml @@ -201,6 +201,9 @@ strelka2: partition: "i192,i128,i192mem" mem_mb: 85000 numa: " OMP_THREADS=64 OMP_PROC_BIND=close OMP_PLACES=threads OMP_DYNAMIC=true OMP_MAX_ACTIVE_LEVELS=1 OMP_SCHEDULE=dynamic OMP_WAIT_POLICY=ACTIVE " + hg38_strelka2_chrms: "21" + hg38_broad_strelka2_chrms: "21" + b37_strelka2_chrms: "19,21" diff --git a/workflow/rules/rule_common.smk b/workflow/rules/rule_common.smk index d6b7c85..603ec43 100755 --- a/workflow/rules/rule_common.smk +++ b/workflow/rules/rule_common.smk @@ -91,6 +91,7 @@ CLAIR3_CHRMS = config["clair3"][f"{config['genome_build']}_clair3_chrms"].split( LOFREQ_CHRMS = config["lofreq2"][f"{config['genome_build']}_lofreq_chrms"].split(",") DVSOM_CHRMS = config["deepsomatic"][f"{config['genome_build']}_dvsom_chrms"].split(",") SENTTN_CHRMS = config["senttn"][f"{config['genome_build']}_senttn_chrms"].split(",") +STRELKA2_CHRMS = config["strelka2"][f"{config['genome_build']}_strelka2_chrms"].split(",") VARN_CHRMS = ( [] @@ -944,6 +945,30 @@ def get_dvsom_chrm_day(wildcards): return ret_mod_chrm(ret_str) +def get_strelka_chrm_day(wildcards): + pchr="" # prefix handled already + ret_str = "" + sl = wildcards.strelkachrm.replace('chr', '').split("-") + sl2 = wildcards.strelkachrm.replace('chr', '').split("~") + + if len(sl2) == 2: + ret_str = pchr + wildcards.strelkachrm + elif len(sl) == 1: + ret_str = pchr + sl[0] + elif len(sl) == 2: + start = int(sl[0]) + end = int(sl[1]) + while start <= end: + ret_str = str(ret_str) + " " + pchr + str(start) + start = start + 1 + else: + raise Exception( + "strelka2 chunks can only be one contiguous range per chunk : ie: 1-4 with the non numerical chrms assigned 23=X, 24=Y,25=MT" + ) + + return ret_mod_chrm(ret_str) + + def get_senttn_chrm_day(wildcards): pchr="" # prefix handled already ret_str = "" diff --git a/workflow/rules/strelka2.smk b/workflow/rules/strelka2.smk index 4204462..7937349 100644 --- a/workflow/rules/strelka2.smk +++ b/workflow/rules/strelka2.smk @@ -4,16 +4,35 @@ import os ##### strelka2 # --------------------------- +rule strelka2_germline_chunkdirs: + input: + b=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram", + i=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", + output: + expand( + MDIR + "{{sample}}/align/{{alnr}}/snv/strelka2/vcfs/{strelkachrm}/{{sample}}.ready", + strelkachrm=STRELKA2_CHRMS, + ), + threads: 1 + log: + MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.chunkdirs.log", + shell: + """ + ( echo {output}; mkdir -p $(dirname {output}); touch {output}; ls {output}; ) > {log} 2>&1; + """ + + rule strelka2_germline: input: cram=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram", crai=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"], + d=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.ready", output: - vcfgz=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz", - vcftbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", + vcfgz=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz", + vcftbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz.tbi", log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.germline.log", + MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.germline.log", threads: config['strelka2']['threads'] container: config['strelka2']['container'] @@ -23,15 +42,29 @@ rule strelka2_germline: partition=config['strelka2']['partition'], mem_mb=config['strelka2']['mem_mb'], params: - run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.germline", + run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.germline.{strelkachrm}", + schrm=get_strelka_chrm_day, cluster_sample=ret_sample, + cpre="" if "b37" == config['genome_build'] else "chr", + mito_code="MT" if "b37" == config['genome_build'] else "M", shell: r""" set -euo pipefail mkdir -p {params.run_dir} + + vchr=$(echo {params.cpre}{params.schrm} | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/' ) + vchr=${vchr%:} + IFS=':' read -r vcontig vstart vend <<< "$vchr" + if [ -z "${vend:-}" ]; then + vstart=0 + vend=$(awk -v c="$vcontig" '$1==c{print $2; exit}' {input.ref_fa}.fai) + fi + echo -e "$vcontig\t$vstart\t$vend" > {params.run_dir}/region.bed + configureStrelkaGermlineWorkflow.py \ --bam {input.cram} \ --referenceFasta {input.ref_fa} \ + --callRegions {params.run_dir}/region.bed \ --runDir {params.run_dir} >> {log} 2>&1 {params.run_dir}/runWorkflow.py -m local -j {threads} >> {log} 2>&1 cp {params.run_dir}/results/variants/variants.vcf.gz {output.vcfgz} @@ -39,6 +72,59 @@ rule strelka2_germline: """ +rule strelka2_germline_concat: + input: + vcfs=lambda wildcards: expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz", + sample=wildcards.sample, + alnr=wildcards.alnr, + strelkachrm=STRELKA2_CHRMS, + ), + output: + vcfgz=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz", + vcfgztbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", + threads: 4 + resources: + vcpu=4, + threads=4, + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], + conda: "../envs/vanilla_v0.1.yaml" + log: + MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.germline.merge.log", + params: + cluster_sample=ret_sample, + shell: + """ + bcftools concat -a -d all --threads {threads} -O z -o {output.vcfgz}.tmp {input.vcfs} >> {log} 2>&1; + oldname=$(bcftools query -l {output.vcfgz}.tmp | head -n1) >> {log} 2>&1; + echo -e "${oldname}\t{params.cluster_sample}" > {output.vcfgz}.rename.txt; + bcftools reheader -s {output.vcfgz}.rename.txt -o {output.vcfgz} {output.vcfgz}.tmp >> {log} 2>&1; + bcftools index -f -t --threads {threads} {output.vcfgz} >> {log} 2>&1; + rm -f {output.vcfgz}.tmp {output.vcfgz}.rename.txt; + """ + + +rule strelka2_somatic_chunkdirs: + wildcard_constraints: + sample=TUMORS_REGEX + input: + b=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram", + i=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", + output: + expand( + MDIR + "{{sample}}/align/{{alnr}}/snv/strelka2/vcfs/{strelkachrm}/{{sample}}.ready", + strelkachrm=STRELKA2_CHRMS, + ), + threads: 1 + log: + MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.somatic.chunkdirs.log", + shell: + """ + ( echo {output}; mkdir -p $(dirname {output}); touch {output}; ls {output}; ) > {log} 2>&1; + """ + + rule strelka2_somatic: wildcard_constraints: sample=TUMORS_REGEX @@ -48,13 +134,14 @@ rule strelka2_somatic: normal_cram=get_somcall_normal_cram, normal_crai=get_somcall_normal_crai, ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"], + d=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.ready", output: - snv=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", - snvtbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", - indel=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", - indeltbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + snv=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz", + snvtbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz.tbi", + indel=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz", + indeltbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz.tbi", log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.somatic.log", + MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.log", threads: config['strelka2']['threads'] container: config['strelka2']['container'] @@ -64,16 +151,30 @@ rule strelka2_somatic: partition=config['strelka2']['partition'], mem_mb=config['strelka2']['mem_mb'], params: - run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.somatic", + run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.somatic.{strelkachrm}", + schrm=get_strelka_chrm_day, cluster_sample=ret_sample, + cpre="" if "b37" == config['genome_build'] else "chr", + mito_code="MT" if "b37" == config['genome_build'] else "M", shell: r""" set -euo pipefail mkdir -p {params.run_dir} + + vchr=$(echo {params.cpre}{params.schrm} | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/' ) + vchr=${vchr%:} + IFS=':' read -r vcontig vstart vend <<< "$vchr" + if [ -z "${vend:-}" ]; then + vstart=0 + vend=$(awk -v c="$vcontig" '$1==c{print $2; exit}' {input.ref_fa}.fai) + fi + echo -e "$vcontig\t$vstart\t$vend" > {params.run_dir}/region.bed + configureStrelkaSomaticWorkflow.py \ --tumorBam {input.tumor_cram} \ --normalBam {input.normal_cram} \ --referenceFasta {input.ref_fa} \ + --callRegions {params.run_dir}/region.bed \ --runDir {params.run_dir} >> {log} 2>&1 {params.run_dir}/runWorkflow.py -m local -j {threads} >> {log} 2>&1 cp {params.run_dir}/results/variants/somatic.snvs.vcf.gz {output.snv} @@ -83,6 +184,55 @@ rule strelka2_somatic: """ +rule strelka2_somatic_concat: + wildcard_constraints: + sample=TUMORS_REGEX + input: + snv_vcfs=lambda wildcards: expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz", + sample=wildcards.sample, + alnr=wildcards.alnr, + strelkachrm=STRELKA2_CHRMS, + ), + indel_vcfs=lambda wildcards: expand( + MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz", + sample=wildcards.sample, + alnr=wildcards.alnr, + strelkachrm=STRELKA2_CHRMS, + ), + output: + snv=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", + snvtbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", + indel=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", + indeltbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + threads: 4 + resources: + vcpu=4, + threads=4, + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], + conda: "../envs/vanilla_v0.1.yaml" + log: + MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.somatic.merge.log", + params: + cluster_sample=ret_sample, + shell: + """ + bcftools concat -a -d all --threads {threads} -O z -o {output.snv}.tmp {input.snv_vcfs} >> {log} 2>&1; + oldname=$(bcftools query -l {output.snv}.tmp | head -n1) >> {log} 2>&1; + echo -e "${oldname}\t{params.cluster_sample}" > {output.snv}.rename.txt; + bcftools reheader -s {output.snv}.rename.txt -o {output.snv} {output.snv}.tmp >> {log} 2>&1; + bcftools index -f -t --threads {threads} {output.snv} >> {log} 2>&1; + rm -f {output.snv}.tmp {output.snv}.rename.txt; + + bcftools concat -a -d all --threads {threads} -O z -o {output.indel}.tmp {input.indel_vcfs} >> {log} 2>&1; + oldname=$(bcftools query -l {output.indel}.tmp | head -n1) >> {log} 2>&1; + echo -e "${oldname}\t{params.cluster_sample}" > {output.indel}.rename.txt; + bcftools reheader -s {output.indel}.rename.txt -o {output.indel} {output.indel}.tmp >> {log} 2>&1; + bcftools index -f -t --threads {threads} {output.indel} >> {log} 2>&1; + rm -f {output.indel}.tmp {output.indel}.rename.txt; + """ + rule produce_strelka2_germline_vcf: input: vcftb=expand( From e039229b6547d78845a85b420f126a06f32e7f95 Mon Sep 17 00:00:00 2001 From: AWS ParallelCluster user Date: Tue, 9 Sep 2025 06:20:48 +0000 Subject: [PATCH 06/13] goteam --- workflow/rules/strelka2.smk | 84 ++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/workflow/rules/strelka2.smk b/workflow/rules/strelka2.smk index 7937349..b2cdfc7 100644 --- a/workflow/rules/strelka2.smk +++ b/workflow/rules/strelka2.smk @@ -10,12 +10,12 @@ rule strelka2_germline_chunkdirs: i=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", output: expand( - MDIR + "{{sample}}/align/{{alnr}}/snv/strelka2/vcfs/{strelkachrm}/{{sample}}.ready", + MDIR + "{{sample}}/align/{{alnr}}/snv/slk2g/vcfs/{strelkachrm}/{{sample}}.ready", strelkachrm=STRELKA2_CHRMS, ), threads: 1 log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.chunkdirs.log", + MDIR + "{sample}/align/{alnr}/snv/slk2g/log/{sample}.{alnr}.chunkdirs.log", shell: """ ( echo {output}; mkdir -p $(dirname {output}); touch {output}; ls {output}; ) > {log} 2>&1; @@ -27,12 +27,12 @@ rule strelka2_germline: cram=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram", crai=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"], - d=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.ready", + d=MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.ready", output: - vcfgz=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz", - vcftbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz.tbi", + vcfgz=MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz", + vcftbi=MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz.tbi", log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.germline.log", + MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.germline.log", threads: config['strelka2']['threads'] container: config['strelka2']['container'] @@ -42,7 +42,7 @@ rule strelka2_germline: partition=config['strelka2']['partition'], mem_mb=config['strelka2']['mem_mb'], params: - run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.germline.{strelkachrm}", + run_dir=MDIR + "{sample}/align/{alnr}/snv/slk2g/work/{sample}.germline.{strelkachrm}", schrm=get_strelka_chrm_day, cluster_sample=ret_sample, cpre="" if "b37" == config['genome_build'] else "chr", @@ -53,11 +53,11 @@ rule strelka2_germline: mkdir -p {params.run_dir} vchr=$(echo {params.cpre}{params.schrm} | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/' ) - vchr=${vchr%:} + vchr=${{vchr%:}} IFS=':' read -r vcontig vstart vend <<< "$vchr" - if [ -z "${vend:-}" ]; then + if [ -z "${{vend:-}}" ]; then vstart=0 - vend=$(awk -v c="$vcontig" '$1==c{print $2; exit}' {input.ref_fa}.fai) + vend=$(awk -v c="$vcontig" '$1==c{{print $2; exit}}' {input.ref_fa}.fai) fi echo -e "$vcontig\t$vstart\t$vend" > {params.run_dir}/region.bed @@ -75,14 +75,14 @@ rule strelka2_germline: rule strelka2_germline_concat: input: vcfs=lambda wildcards: expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz", + MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz", sample=wildcards.sample, alnr=wildcards.alnr, strelkachrm=STRELKA2_CHRMS, ), output: - vcfgz=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz", - vcfgztbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", + vcfgz=MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz", + vcfgztbi=MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", threads: 4 resources: vcpu=4, @@ -91,14 +91,14 @@ rule strelka2_germline_concat: mem_mb=config['strelka2']['mem_mb'], conda: "../envs/vanilla_v0.1.yaml" log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.germline.merge.log", + MDIR + "{sample}/align/{alnr}/snv/slk2g/log/{sample}.{alnr}.strelka2.germline.merge.log", params: cluster_sample=ret_sample, shell: """ bcftools concat -a -d all --threads {threads} -O z -o {output.vcfgz}.tmp {input.vcfs} >> {log} 2>&1; oldname=$(bcftools query -l {output.vcfgz}.tmp | head -n1) >> {log} 2>&1; - echo -e "${oldname}\t{params.cluster_sample}" > {output.vcfgz}.rename.txt; + echo -e "${{oldname}}\t{params.cluster_sample}" > {output.vcfgz}.rename.txt; bcftools reheader -s {output.vcfgz}.rename.txt -o {output.vcfgz} {output.vcfgz}.tmp >> {log} 2>&1; bcftools index -f -t --threads {threads} {output.vcfgz} >> {log} 2>&1; rm -f {output.vcfgz}.tmp {output.vcfgz}.rename.txt; @@ -113,12 +113,12 @@ rule strelka2_somatic_chunkdirs: i=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", output: expand( - MDIR + "{{sample}}/align/{{alnr}}/snv/strelka2/vcfs/{strelkachrm}/{{sample}}.ready", + MDIR + "{{sample}}/align/{{alnr}}/snv/slk2s/vcfs/{strelkachrm}/{{sample}}.ready", strelkachrm=STRELKA2_CHRMS, ), threads: 1 log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.somatic.chunkdirs.log", + MDIR + "{sample}/align/{alnr}/snv/slk2s/log/{sample}.{alnr}.somatic.chunkdirs.log", shell: """ ( echo {output}; mkdir -p $(dirname {output}); touch {output}; ls {output}; ) > {log} 2>&1; @@ -134,14 +134,14 @@ rule strelka2_somatic: normal_cram=get_somcall_normal_cram, normal_crai=get_somcall_normal_crai, ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"], - d=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.ready", + d=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.ready", output: - snv=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz", - snvtbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz.tbi", - indel=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz", - indeltbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz.tbi", + snv=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz", + snvtbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz.tbi", + indel=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz", + indeltbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz.tbi", log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.log", + MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.log", threads: config['strelka2']['threads'] container: config['strelka2']['container'] @@ -151,7 +151,7 @@ rule strelka2_somatic: partition=config['strelka2']['partition'], mem_mb=config['strelka2']['mem_mb'], params: - run_dir=MDIR + "{sample}/align/{alnr}/snv/strelka2/work/{sample}.somatic.{strelkachrm}", + run_dir=MDIR + "{sample}/align/{alnr}/snv/slk2s/work/{sample}.somatic.{strelkachrm}", schrm=get_strelka_chrm_day, cluster_sample=ret_sample, cpre="" if "b37" == config['genome_build'] else "chr", @@ -162,11 +162,11 @@ rule strelka2_somatic: mkdir -p {params.run_dir} vchr=$(echo {params.cpre}{params.schrm} | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/' ) - vchr=${vchr%:} + vchr=${{vchr%:}} IFS=':' read -r vcontig vstart vend <<< "$vchr" - if [ -z "${vend:-}" ]; then + if [ -z "${{vend:-}}" ]; then vstart=0 - vend=$(awk -v c="$vcontig" '$1==c{print $2; exit}' {input.ref_fa}.fai) + vend=$(awk -v c="$vcontig" '$1==c{{print $2; exit}}' {input.ref_fa}.fai) fi echo -e "$vcontig\t$vstart\t$vend" > {params.run_dir}/region.bed @@ -189,22 +189,22 @@ rule strelka2_somatic_concat: sample=TUMORS_REGEX input: snv_vcfs=lambda wildcards: expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz", + MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz", sample=wildcards.sample, alnr=wildcards.alnr, strelkachrm=STRELKA2_CHRMS, ), indel_vcfs=lambda wildcards: expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz", + MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz", sample=wildcards.sample, alnr=wildcards.alnr, strelkachrm=STRELKA2_CHRMS, ), output: - snv=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", - snvtbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", - indel=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", - indeltbi=MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + snv=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", + snvtbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", + indel=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", + indeltbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", threads: 4 resources: vcpu=4, @@ -213,21 +213,21 @@ rule strelka2_somatic_concat: mem_mb=config['strelka2']['mem_mb'], conda: "../envs/vanilla_v0.1.yaml" log: - MDIR + "{sample}/align/{alnr}/snv/strelka2/log/{sample}.{alnr}.strelka2.somatic.merge.log", + MDIR + "{sample}/align/{alnr}/snv/slk2s/log/{sample}.{alnr}.strelka2.somatic.merge.log", params: cluster_sample=ret_sample, shell: """ bcftools concat -a -d all --threads {threads} -O z -o {output.snv}.tmp {input.snv_vcfs} >> {log} 2>&1; oldname=$(bcftools query -l {output.snv}.tmp | head -n1) >> {log} 2>&1; - echo -e "${oldname}\t{params.cluster_sample}" > {output.snv}.rename.txt; + echo -e "${{oldname}}\t{params.cluster_sample}" > {output.snv}.rename.txt; bcftools reheader -s {output.snv}.rename.txt -o {output.snv} {output.snv}.tmp >> {log} 2>&1; bcftools index -f -t --threads {threads} {output.snv} >> {log} 2>&1; rm -f {output.snv}.tmp {output.snv}.rename.txt; bcftools concat -a -d all --threads {threads} -O z -o {output.indel}.tmp {input.indel_vcfs} >> {log} 2>&1; oldname=$(bcftools query -l {output.indel}.tmp | head -n1) >> {log} 2>&1; - echo -e "${oldname}\t{params.cluster_sample}" > {output.indel}.rename.txt; + echo -e "${{oldname}}\t{params.cluster_sample}" > {output.indel}.rename.txt; bcftools reheader -s {output.indel}.rename.txt -o {output.indel} {output.indel}.tmp >> {log} 2>&1; bcftools index -f -t --threads {threads} {output.indel} >> {log} 2>&1; rm -f {output.indel}.tmp {output.indel}.rename.txt; @@ -236,12 +236,12 @@ rule strelka2_somatic_concat: rule produce_strelka2_germline_vcf: input: vcftb=expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz", + MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz", sample=SSAMPS, alnr=ALIGNERS, ), vcftbi=expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", + MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi", sample=SSAMPS, alnr=ALIGNERS, ), @@ -262,22 +262,22 @@ rule produce_strelka2_germline_vcf: rule produce_strelka2_somatic_vcf: input: snv=expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", + MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz", sample=TN_TUMOR_SAMPS, alnr=ALIGNERS, ), snvtbi=expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", + MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", sample=TN_TUMOR_SAMPS, alnr=ALIGNERS, ), indel=expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", + MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", sample=TN_TUMOR_SAMPS, alnr=ALIGNERS, ), indeltbi=expand( - MDIR + "{sample}/align/{alnr}/snv/strelka2/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", sample=TN_TUMOR_SAMPS, alnr=ALIGNERS, ), From d511d07dda48fd13e08c4cd59db4226f939e435d Mon Sep 17 00:00:00 2001 From: "(major) john (major)" Date: Tue, 9 Sep 2025 00:14:49 -0700 Subject: [PATCH 07/13] fix mutect2 sample name reheader --- workflow/rules/mutect2.smk | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index 50736d2..08bf590 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -86,17 +86,14 @@ rule mutect2_bams: outbam=$2 sm=$3 tmphdr=$(mktemp) - OFS="\t" samtools view -H "$inbam" \ - | awk -v sm="$sm" 'BEGIN{OFS="\t"} - /^@RG/ { - hasSM=0 - for (i=1;i<=NF;i++){ - if ($i ~ /^SM:/) { $i="SM:" sm; hasSM=1 } - } - if (!hasSM){ $0 = $0 OFS "SM:" sm } - } - { print } + | awk -v sm="$sm" 'BEGIN{{FS=OFS="\t"}} + /^@RG/ {{ + found=0 + for (i=1;i<=NF;i++) if ($i ~ /^SM:/) {{ $i="SM:" sm; found=1 }} + if (!found) {{ $0 = $0 OFS "SM:" sm }} + }} + {{ print }} ' > "$tmphdr" # Reheader and replace atomically samtools reheader "$tmphdr" "$inbam" > "$outbam" From e6f4e6bd223fa2303391a74c47e8ded9d4c8ded8 Mon Sep 17 00:00:00 2001 From: John Major Date: Tue, 9 Sep 2025 00:43:48 -0700 Subject: [PATCH 08/13] x --- workflow/rules/mutect2.smk | 87 +++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index 08bf590..12b7cff 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -38,86 +38,85 @@ rule mutect2_bams: r""" set -euo pipefail ulimit -n 65536 || true - - mkdir -p "$(dirname {output.tumor_bam}" + + mkdir -p "$(dirname {output.tumor_bam})" # Build interval token; map 23→X, 24→Y, 25→{params.mito_code}; strip trailing colon. tchr=$(echo {params.cpre}{params.chrm} \ | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/') - tchr=${{tchr}} + tchr=${{tchr%:}} IFS=':' read -r tcontig tstart tend <<< "$tchr" # Look up contig length early contig_len=$(awk -v c="$tcontig" '$1==c{{print $2; exit}}' {input.ref_fai}) if [ -z "${{contig_len}}" ]; then - echo "ERROR: Contig '$tcontig' not found in {input.ref_fai}" >&2 - exit 1 + echo "ERROR: Contig '$tcontig' not found in {input.ref_fai}" >&2 + exit 1 fi - if [ -z "${{tend}}" ]; then - # Whole contig - region="$tcontig" + if [ -z "${{tend:-}}" ]; then + # Whole contig + region="$tcontig" else - # Normalize to 1-based inclusive and clamp to [1, contig_len] - if [ -z "${{tstart}}" ] || [ "$tstart" -lt 1 ]; then tstart=1; fi - if [ "$tend" -gt "$contig_len" ]; then tend="$contig_len"; fi - if [ "$tstart" -gt "$tend" ]; then + # Normalize to 1-based inclusive and clamp to [1, contig_len] + : "${{tstart:=1}}" + if [ "$tstart" -lt 1 ]; then tstart=1; fi + if [ "$tend" -gt "$contig_len" ]; then tend="$contig_len"; fi + if [ "$tstart" -gt "$tend" ]; then echo "ERROR: Empty/invalid interval after normalization: $tcontig:$tstart-$tend" >&2 exit 1 - fi - region="$tcontig:$tstart-$tend" fi - + region="$tcontig:$tstart-$tend" + fi + # Tumor samtools view -@ {threads} -T {input.ref_fa} -b {input.tumor_cram} "$region" \ - | samtools sort -@ {threads} -o {output.tumor_bam} - >> {log} 2>&1 - samtools index -@ {threads} {output.tumor_bam} >> {log} 2>&1 - + | samtools sort -@ {threads} -o {output.tumor_bam} - >> {log} 2>&1 + samtools index -@ {threads} {output.tumor_bam} >> {log} 2>&1 + # Normal samtools view -@ {threads} -T {input.ref_fa} -b {input.normal_cram} "$region" \ - | samtools sort -@ {threads} -o {output.normal_bam} - >> {log} 2>&1 - samtools index -@ {threads} {output.normal_bam} >> {log} 2>&1 + | samtools sort -@ {threads} -o {output.normal_bam} - >> {log} 2>&1 + samtools index -@ {threads} {output.normal_bam} >> {log} 2>&1 - - # ---- Fix SM in headers (preserve all other @RG fields) ---- + # ---- Fix SM in headers (preserve other @RG fields) ---- fix_sm () {{ - inbam=$1 - outbam=$2 - sm=$3 - tmphdr=$(mktemp) - samtools view -H "$inbam" \ + inbam=$1 + outbam=$2 + sm=$3 + tmphdr=$(mktemp) + samtools view -H "$inbam" \ | awk -v sm="$sm" 'BEGIN{{FS=OFS="\t"}} /^@RG/ {{ - found=0 - for (i=1;i<=NF;i++) if ($i ~ /^SM:/) {{ $i="SM:" sm; found=1 }} - if (!found) {{ $0 = $0 OFS "SM:" sm }} + found=0 + for (i=1;i<=NF;i++) if ($i ~ /^SM:/) {{ $i="SM:" sm; found=1 }} + if (!found) {{ $0 = $0 OFS "SM:" sm }} }} {{ print }} - ' > "$tmphdr" - # Reheader and replace atomically - samtools reheader "$tmphdr" "$inbam" > "$outbam" - rm -f "$tmphdr" - samtools index -@ {threads} "$outbam" >/dev/null 2>&1 || true + ' > "$tmphdr" + samtools reheader "$tmphdr" "$inbam" > "$outbam" + rm -f "$tmphdr" + samtools index -@ {threads} "$outbam" >/dev/null 2>&1 || true }} - - # Distinct names for tumor/normal (required by Mutect2) + T_SM="{params.cluster_sample}-T" N_SM="{params.cluster_sample}-N" - + fix_sm "{output.tumor_bam}" "{output.tumor_bam}.smfix" "$T_SM" mv "{output.tumor_bam}.smfix" "{output.tumor_bam}" - samtools index -@ {threads} -f "{output.tumor_bam}" >> {log} 2>&1 - + samtools index -@ {threads} -f "{output.tumor_bam}" >> {log} 2>&1 + fix_sm "{output.normal_bam}" "{output.normal_bam}.smfix" "$N_SM" mv "{output.normal_bam}.smfix" "{output.normal_bam}" - samtools index -@ {threads} -f "{output.normal_bam}" >> {log} 2>&1 - - # Optional: log the final names for sanity - gatk GetSampleName -I {output.tumor_bam} 2>>{log} | sed "s/^/Tumor SM: /" >> {log} + samtools index -@ {threads} -f "{output.normal_bam}" >> {log} 2>&1 + + # Sanity log + gatk GetSampleName -I {output.tumor_bam} 2>>{log} | sed "s/^/Tumor SM: /" >> {log} gatk GetSampleName -I {output.normal_bam} 2>>{log} | sed "s/^/Normal SM: /" >> {log} """ + rule mutect2: wildcard_constraints: sample=TUMORS_REGEX From 95dafbcef029e47321a3fa244da1b1bfe32d077b Mon Sep 17 00:00:00 2001 From: John Major Date: Tue, 9 Sep 2025 00:46:02 -0700 Subject: [PATCH 09/13] x --- workflow/rules/mutect2.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index 12b7cff..b1e2588 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -105,11 +105,11 @@ rule mutect2_bams: fix_sm "{output.tumor_bam}" "{output.tumor_bam}.smfix" "$T_SM" mv "{output.tumor_bam}.smfix" "{output.tumor_bam}" - samtools index -@ {threads} -f "{output.tumor_bam}" >> {log} 2>&1 + samtools index -@ {threads} "{output.tumor_bam}" >> {log} 2>&1 fix_sm "{output.normal_bam}" "{output.normal_bam}.smfix" "$N_SM" mv "{output.normal_bam}.smfix" "{output.normal_bam}" - samtools index -@ {threads} -f "{output.normal_bam}" >> {log} 2>&1 + samtools index -@ {threads} "{output.normal_bam}" >> {log} 2>&1 # Sanity log gatk GetSampleName -I {output.tumor_bam} 2>>{log} | sed "s/^/Tumor SM: /" >> {log} From d072d7044167a32a05397afad04c332e5c5882c5 Mon Sep 17 00:00:00 2001 From: John Major Date: Tue, 9 Sep 2025 00:47:10 -0700 Subject: [PATCH 10/13] x --- workflow/rules/mutect2.smk | 3 --- 1 file changed, 3 deletions(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index b1e2588..85b2b74 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -111,9 +111,6 @@ rule mutect2_bams: mv "{output.normal_bam}.smfix" "{output.normal_bam}" samtools index -@ {threads} "{output.normal_bam}" >> {log} 2>&1 - # Sanity log - gatk GetSampleName -I {output.tumor_bam} 2>>{log} | sed "s/^/Tumor SM: /" >> {log} - gatk GetSampleName -I {output.normal_bam} 2>>{log} | sed "s/^/Normal SM: /" >> {log} """ From cf8acf0ea6d4958cf5e8ec59efd9ed67f70e480e Mon Sep 17 00:00:00 2001 From: John Major Date: Tue, 9 Sep 2025 00:53:03 -0700 Subject: [PATCH 11/13] x --- workflow/rules/mutect2.smk | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index 85b2b74..5012437 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -100,8 +100,11 @@ rule mutect2_bams: samtools index -@ {threads} "$outbam" >/dev/null 2>&1 || true }} - T_SM="{params.cluster_sample}-T" - N_SM="{params.cluster_sample}-N" + sample=$(echo "{log}" | sed -E 's#.*/([^/]+)/align/.*#\1#') + echo "Sample: $sample" >> {log} 2>&1 + T_SM="${{sample}}" + N_SM="${{sample}}" + fix_sm "{output.tumor_bam}" "{output.tumor_bam}.smfix" "$T_SM" mv "{output.tumor_bam}.smfix" "{output.tumor_bam}" From f3c21333acdf7b7582f4cac1c3e9fa7c4e014ac8 Mon Sep 17 00:00:00 2001 From: John Major Date: Tue, 9 Sep 2025 01:02:07 -0700 Subject: [PATCH 12/13] x --- workflow/rules/mutect2.smk | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index 5012437..af1db5d 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -100,11 +100,14 @@ rule mutect2_bams: samtools index -@ {threads} "$outbam" >/dev/null 2>&1 || true }} - sample=$(echo "{log}" | sed -E 's#.*/([^/]+)/align/.*#\1#') - echo "Sample: $sample" >> {log} 2>&1 - T_SM="${{sample}}" - N_SM="${{sample}}" - + tum_sample=$(echo "{input.tumor_cram}" | sed -E 's#.*/([^/]+)/align/.*#\1#') + nrm_sample=$(echo "{input.normal_cram}" | sed -E 's#.*/([^/]+)/align/.*#\1#') + + echo "Tumor Sample: $tum_sample" >> {log} 2>&1 + echo "Normal Sample: $nrm_sample" >> {log} 2>& + T_SM="${{tum_sample}}" + N_SM="${{nrm_sample}}" + fix_sm "{output.tumor_bam}" "{output.tumor_bam}.smfix" "$T_SM" mv "{output.tumor_bam}.smfix" "{output.tumor_bam}" From bbec76e11825b91671cb3dce885873803ea9f3ce Mon Sep 17 00:00:00 2001 From: John Major Date: Tue, 9 Sep 2025 01:11:12 -0700 Subject: [PATCH 13/13] x --- workflow/rules/mutect2.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index af1db5d..44b76fc 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -104,7 +104,7 @@ rule mutect2_bams: nrm_sample=$(echo "{input.normal_cram}" | sed -E 's#.*/([^/]+)/align/.*#\1#') echo "Tumor Sample: $tum_sample" >> {log} 2>&1 - echo "Normal Sample: $nrm_sample" >> {log} 2>& + echo "Normal Sample: $nrm_sample" >> {log} 2>&1 T_SM="${{tum_sample}}" N_SM="${{nrm_sample}}"