diff --git a/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv b/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv index 918882f..94d4dd3 100755 --- a/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv +++ b/.test_data/data/0.01x_3_wgs_HG002_hg38.samplesheet.csv @@ -1,4 +1,4 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer,subsample_pct -RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19,0.95 -RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_2,RIH0_ANA0-HG002_DBC1_2,DBC2,RIH0,ANA0-HG002,2,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19,0.85 -RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_1,RIH0_ANA0-HG002_DBC1_1,DBC1,RIH0,ANA0-HG002,1,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19,0.75 +RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19,0.95 
+RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_2,RIH0_ANA0-HG002_DBC1_2,DBC2,RIH0,ANA0-HG002,2,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19,0.85 +RIH0_ANA0-HG002_DBC1_0,RIH0_ANA0-HG002_DBC1_1,RIH0_ANA0-HG002_DBC1_1,DBC1,RIH0,ANA0-HG002,1,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19,0.75 diff --git a/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv b/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv index 72ba6c3..a65cb87 100755 --- a/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv +++ b/.test_data/data/0.01xwgs_HG002_b37.samplesheet.csv @@ -1,2 +1,2 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 +RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 diff --git a/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv 
b/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv index d96d481..8399c88 100755 --- a/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv +++ b/.test_data/data/0.01xwgs_HG002_hg38.samplesheet.csv @@ -1,2 +1,2 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 +RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,RIH0_ANA0-HG002_DBC0_0,DBC0,RIH0,ANA0-HG002,0,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R1.fastq.gz,.test_data/data/RIH0_ANA0-HG002_DBC0_0.R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG2/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 diff --git a/.test_data/data/giab_30x_b37_analysis_manifest.csv b/.test_data/data/giab_30x_b37_analysis_manifest.csv index 62bb2b2..a5f9ec5 100755 --- a/.test_data/data/giab_30x_b37_analysis_manifest.csv +++ b/.test_data/data/giab_30x_b37_analysis_manifest.csv @@ -1,8 +1,8 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer 
-R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 -R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 -R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 -R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 
-R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 -R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 -R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 +R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,na,HG001,NOVASEQ,PCR-FREE,19 
+R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 +R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,na,HG003,NOVASEQ,PCR-FREE,19 +R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,na,HG004,NOVASEQ,PCR-FREE,19 +R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,na,HG005,NOVASEQ,PCR-FREE,19 
+R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,na,HG006,NOVASEQ,PCR-FREE,19 +R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/b37/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,na,HG007,NOVASEQ,PCR-FREE,19 diff --git a/.test_data/data/giab_30x_hg38_analysis_manifest.csv b/.test_data/data/giab_30x_hg38_analysis_manifest.csv index ddf4b7c..f2a41ce 100755 --- a/.test_data/data/giab_30x_hg38_analysis_manifest.csv +++ b/.test_data/data/giab_30x_hg38_analysis_manifest.csv @@ -1,8 +1,8 @@ samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 
-R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 -R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 -R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 -R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 
-R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 -R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 +R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,na,HG001,NOVASEQ,PCR-FREE,19 +R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,na,HG002,NOVASEQ,PCR-FREE,19 
+R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,na,HG003,NOVASEQ,PCR-FREE,19 +R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,na,HG004,NOVASEQ,PCR-FREE,19 +R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,na,HG005,NOVASEQ,PCR-FREE,19 +R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,na,HG006,NOVASEQ,PCR-FREE,19 
+R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,na,HG007,NOVASEQ,PCR-FREE,19 diff --git a/config/day_profiles/local/templates/rule_config.yaml b/config/day_profiles/local/templates/rule_config.yaml index 1b8979c..4da46f2 100755 --- a/config/day_profiles/local/templates/rule_config.yaml +++ b/config/day_profiles/local/templates/rule_config.yaml @@ -15,7 +15,7 @@ valid_biome: AWSPC # ###### Tool Selection CONFIG ---------------------------------------- -# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2'] sv_callers=['manta','tiddit','dysgu'] +# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2','strelka2'] sv_callers=['manta','tiddit','dysgu'] # and will override these defaults # Or you can set them here @@ -196,6 +196,7 @@ deepsomatic: numa: " OMP_NUM_THREADS=8 OMP_PROC_BIND=close OMP_PLACES=threads OMP_PROC_BIND=TRUE OMP_DYNAMIC=TRUE OMP_MAX_ACTIVE_LEVELS=1 OMP_SCHEDULE=dynamic OMP_WAIT_POLICY=ACTIVE " dvsom_conda: "../envs/vanilla_v0.1.yaml" + mutect2: threads: 7 container: "docker://broadinstitute/gatk:4.5.0.0" @@ -208,6 +209,7 @@ mutect2: numa: "" conda: "../envs/vanilla_v0.1.yaml" + duphold: threads: 7 env_yaml: "../envs/duphold_v0.1.yaml" diff --git a/config/day_profiles/slurm/templates/rule_config.yaml b/config/day_profiles/slurm/templates/rule_config.yaml index 65ce5da..3d0e471 100755 --- a/config/day_profiles/slurm/templates/rule_config.yaml +++ 
b/config/day_profiles/slurm/templates/rule_config.yaml @@ -14,7 +14,7 @@ valid_biome: AWSPC # ###### Tool Selection CONFIG ---------------------------------------- -# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2'] sv_callers=['manta','tiddit','dysgu'] +# these may be set on the command line as --config genome_build=hg38 aligners=['bwa2a','sent','strobe'] dedupers=['dppl'] snv_callers=['oct','clair3','deep','sentd','lfq2','strelka2'] sv_callers=['manta','tiddit','dysgu'] # and will override these defaults @@ -209,6 +209,7 @@ mutect2: + duphold: threads: 32 env_yaml: "../envs/duphold_v0.1.yaml" diff --git a/giab_30x_hg38_analysis_manifest.csv b/giab_30x_hg38_analysis_manifest.csv old mode 100755 new mode 100644 index e8ad19d..62b3ed6 --- a/giab_30x_hg38_analysis_manifest.csv +++ b/giab_30x_hg38_analysis_manifest.csv @@ -1,9 +1,9 @@ -samp,sample,sample_lane,SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer -R0_HG001_D0_0,R0_HG001_D0_0,R0_HG001_D0_0,D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 
-R0_HG002_D0_0,R0_HG002_D0_0,R0_HG002_D0_0,D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 -R0_HG003_D0_0,R0_HG003_D0_0,R0_HG003_D0_0,D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 -R0_HG004_D0_0,R0_HG004_D0_0,R0_HG004_D0_0,D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 -R0_HG005_D0_0,R0_HG005_D0_0,R0_HG005_D0_0,D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 
-R0_HG006_D0_0,R0_HG006_D0_0,R0_HG006_D0_0,D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 -R0_HG007_D0_0,R0_HG007_D0_0,R0_HG007_D0_0,D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 +SQ,RU,EX,LANE,r1_path,r2_path,biological_sex,iddna_uid,concordance_control_path,is_positive_control,is_negative_control,sample_type,merge_single,tum_nrm_sampleid_match,external_sample_id,instrument,lib_prep,bwa_kmer +D0,R0,HG001,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG001_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG001/,true,false,blood,merge,HG001,HG001,NOVASEQ,PCR-FREE,19 +D0,R0,HG002,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG002_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG002/,true,false,blood,merge,HG002,HG002,NOVASEQ,PCR-FREE,19 
+D0,R0,HG003,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG003_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG003/,true,false,blood,merge,HG003,HG003,NOVASEQ,PCR-FREE,19 +D0,R0,HG004,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG004_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG004/,true,false,blood,merge,HG004,HG004,NOVASEQ,PCR-FREE,19 +D0,R0,HG005,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG005_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG005/,true,false,blood,merge,HG005,HG005,NOVASEQ,PCR-FREE,19 +D0,R0,HG006,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG006_30x_R2.fastq.gz,male,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG006/,true,false,blood,merge,HG006,HG006,NOVASEQ,PCR-FREE,19 +D0,R0,HG007,0,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R1.fastq.gz,/fsx/data/genomic_data/organism_reads/H_sapiens/giab/NovaSeqX_WHGS_TruSeqPF_HG002-007/HG007_30x_R2.fastq.gz,female,na,/fsx/data/genomic_data/organism_annotations/H_sapiens/hg38/controls/giab/snv/v4.2.1/HG007/,true,false,blood,merge,HG007,HG007,NOVASEQ,PCR-FREE,19 diff --git a/workflow/envs/vpot_v0.1.yaml 
b/workflow/envs/vpot_v0.1.yaml index e456502..378e504 100644 --- a/workflow/envs/vpot_v0.1.yaml +++ b/workflow/envs/vpot_v0.1.yaml @@ -1,11 +1,7 @@ channels: - conda-forge - bioconda - - anaconda - - r - - defaults dependencies: - - python + - python=3.9 + - numpy - pip - - pip: - - git+https://github.com/VCCRI/VPOT.git diff --git a/workflow/rules/mutect2.smk b/workflow/rules/mutect2.smk index 50736d2..44b76fc 100644 --- a/workflow/rules/mutect2.smk +++ b/workflow/rules/mutect2.smk @@ -38,89 +38,88 @@ rule mutect2_bams: r""" set -euo pipefail ulimit -n 65536 || true - - mkdir -p "$(dirname {output.tumor_bam}" + + mkdir -p "$(dirname {output.tumor_bam})" # Build interval token; map 23→X, 24→Y, 25→{params.mito_code}; strip trailing colon. tchr=$(echo {params.cpre}{params.chrm} \ | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/') - tchr=${{tchr}} + tchr=${{tchr%:}} IFS=':' read -r tcontig tstart tend <<< "$tchr" # Look up contig length early contig_len=$(awk -v c="$tcontig" '$1==c{{print $2; exit}}' {input.ref_fai}) if [ -z "${{contig_len}}" ]; then - echo "ERROR: Contig '$tcontig' not found in {input.ref_fai}" >&2 - exit 1 + echo "ERROR: Contig '$tcontig' not found in {input.ref_fai}" >&2 + exit 1 fi - if [ -z "${{tend}}" ]; then - # Whole contig - region="$tcontig" + if [ -z "${{tend:-}}" ]; then + # Whole contig + region="$tcontig" else - # Normalize to 1-based inclusive and clamp to [1, contig_len] - if [ -z "${{tstart}}" ] || [ "$tstart" -lt 1 ]; then tstart=1; fi - if [ "$tend" -gt "$contig_len" ]; then tend="$contig_len"; fi - if [ "$tstart" -gt "$tend" ]; then + # Normalize to 1-based inclusive and clamp to [1, contig_len] + : "${{tstart:=1}}" + if [ "$tstart" -lt 1 ]; then tstart=1; fi + if [ "$tend" -gt "$contig_len" ]; then tend="$contig_len"; fi + if [ "$tstart" -gt "$tend" ]; then echo "ERROR: Empty/invalid interval after normalization: $tcontig:$tstart-$tend" >&2 exit 1 - fi - region="$tcontig:$tstart-$tend" fi 
- + region="$tcontig:$tstart-$tend" + fi + # Tumor samtools view -@ {threads} -T {input.ref_fa} -b {input.tumor_cram} "$region" \ - | samtools sort -@ {threads} -o {output.tumor_bam} - >> {log} 2>&1 - samtools index -@ {threads} {output.tumor_bam} >> {log} 2>&1 - + | samtools sort -@ {threads} -o {output.tumor_bam} - >> {log} 2>&1 + samtools index -@ {threads} {output.tumor_bam} >> {log} 2>&1 + # Normal samtools view -@ {threads} -T {input.ref_fa} -b {input.normal_cram} "$region" \ - | samtools sort -@ {threads} -o {output.normal_bam} - >> {log} 2>&1 - samtools index -@ {threads} {output.normal_bam} >> {log} 2>&1 - + | samtools sort -@ {threads} -o {output.normal_bam} - >> {log} 2>&1 + samtools index -@ {threads} {output.normal_bam} >> {log} 2>&1 - # ---- Fix SM in headers (preserve all other @RG fields) ---- + # ---- Fix SM in headers (preserve other @RG fields) ---- fix_sm () {{ - inbam=$1 - outbam=$2 - sm=$3 - tmphdr=$(mktemp) - OFS="\t" - samtools view -H "$inbam" \ - | awk -v sm="$sm" 'BEGIN{OFS="\t"} - /^@RG/ { - hasSM=0 - for (i=1;i<=NF;i++){ - if ($i ~ /^SM:/) { $i="SM:" sm; hasSM=1 } - } - if (!hasSM){ $0 = $0 OFS "SM:" sm } - } - { print } - ' > "$tmphdr" - # Reheader and replace atomically - samtools reheader "$tmphdr" "$inbam" > "$outbam" - rm -f "$tmphdr" - samtools index -@ {threads} "$outbam" >/dev/null 2>&1 || true + inbam=$1 + outbam=$2 + sm=$3 + tmphdr=$(mktemp) + samtools view -H "$inbam" \ + | awk -v sm="$sm" 'BEGIN{{FS=OFS="\t"}} + /^@RG/ {{ + found=0 + for (i=1;i<=NF;i++) if ($i ~ /^SM:/) {{ $i="SM:" sm; found=1 }} + if (!found) {{ $0 = $0 OFS "SM:" sm }} + }} + {{ print }} + ' > "$tmphdr" + samtools reheader "$tmphdr" "$inbam" > "$outbam" + rm -f "$tmphdr" + samtools index -@ {threads} "$outbam" >/dev/null 2>&1 || true }} - - # Distinct names for tumor/normal (required by Mutect2) - T_SM="{params.cluster_sample}-T" - N_SM="{params.cluster_sample}-N" - + + tum_sample=$(echo "{input.tumor_cram}" | sed -E 's#.*/([^/]+)/align/.*#\1#') + 
nrm_sample=$(echo "{input.normal_cram}" | sed -E 's#.*/([^/]+)/align/.*#\1#') + + echo "Tumor Sample: $tum_sample" >> {log} 2>&1 + echo "Normal Sample: $nrm_sample" >> {log} 2>&1 + T_SM="${{tum_sample}}" + N_SM="${{nrm_sample}}" + + fix_sm "{output.tumor_bam}" "{output.tumor_bam}.smfix" "$T_SM" mv "{output.tumor_bam}.smfix" "{output.tumor_bam}" - samtools index -@ {threads} -f "{output.tumor_bam}" >> {log} 2>&1 - + samtools index -@ {threads} "{output.tumor_bam}" >> {log} 2>&1 + fix_sm "{output.normal_bam}" "{output.normal_bam}.smfix" "$N_SM" mv "{output.normal_bam}.smfix" "{output.normal_bam}" - samtools index -@ {threads} -f "{output.normal_bam}" >> {log} 2>&1 - - # Optional: log the final names for sanity - gatk GetSampleName -I {output.tumor_bam} 2>>{log} | sed "s/^/Tumor SM: /" >> {log} - gatk GetSampleName -I {output.normal_bam} 2>>{log} | sed "s/^/Normal SM: /" >> {log} + samtools index -@ {threads} "{output.normal_bam}" >> {log} 2>&1 + """ + rule mutect2: wildcard_constraints: sample=TUMORS_REGEX diff --git a/workflow/rules/rule_common.smk b/workflow/rules/rule_common.smk index 4b9ea9b..8540a5b 100755 --- a/workflow/rules/rule_common.smk +++ b/workflow/rules/rule_common.smk @@ -92,6 +92,7 @@ LOFREQ_CHRMS = config["lofreq2"][f"{config['genome_build']}_lofreq_chrms"].split DVSOM_CHRMS = config["deepsomatic"][f"{config['genome_build']}_dvsom_chrms"].split(",") M2_CHRMS = config["mutect2"][f"{config['genome_build']}_mutect2_chrms"].split(",") SENTTN_CHRMS = config["senttn"][f"{config['genome_build']}_senttn_chrms"].split(",") +STRELKA2_CHRMS = config["strelka2"][f"{config['genome_build']}_strelka2_chrms"].split(",") VARN_CHRMS = ( [] @@ -242,9 +243,36 @@ os.system( ) # IMPORTANT: initialize the samples dataframe from the analysis_manifest.csv -samples = pd.read_table(analysis_manifest, ",").set_index( - ["sample", "sample_lane"], drop=False -) +samples = pd.read_table(analysis_manifest, ",") + +# Derive sample_lane and sample identifiers if they are not 
provided. +required_cols = {"RU", "EX", "SQ", "LANE"} +missing = required_cols - set(samples.columns) +if missing: + raise WorkflowError(f"Missing required columns in analysis manifest: {missing}") + +samples["LANE"] = samples["LANE"].astype(int) + +if "sample_lane" not in samples.columns: + samples["sample_lane"] = ( + samples["RU"].astype(str) + + "_" + + samples["EX"].astype(str) + + "_" + + samples["SQ"].astype(str) + + "_" + + samples["LANE"].astype(str) + ) + +if "sample" not in samples.columns: + samples["sample"] = samples.apply( + lambda r: f"{r['RU']}_{r['EX']}_{r['SQ']}_0" + if str(r.get("merge_single", "")).lower() == "merge" + else r["sample_lane"], + axis=1, + ) + +samples = samples.set_index(["sample", "sample_lane"], drop=False) # Ensure tum_nrm_sampleid_match exists and treat missing values as 'na' if "tum_nrm_sampleid_match" in samples.columns: @@ -953,6 +981,7 @@ def get_mutect2_chrm_day(wildcards): if len(sl2) == 2: ret_str = pchr + wildcards.m2chrm + elif len(sl) == 1: ret_str = pchr + sl[0] elif len(sl) == 2: @@ -964,6 +993,7 @@ def get_mutect2_chrm_day(wildcards): else: raise Exception( "mutect2 chunks can only be one contiguous range per chunk : ie: 1-4 with the non numerical chrms assigned 23=X, 24=Y,25=MT" + ) return ret_mod_chrm(ret_str) diff --git a/workflow/rules/strelka2.smk b/workflow/rules/strelka2.smk new file mode 100644 index 0000000..b2cdfc7 --- /dev/null +++ b/workflow/rules/strelka2.smk @@ -0,0 +1,295 @@ +import sys +import os + +##### strelka2 +# --------------------------- + +rule strelka2_germline_chunkdirs: + input: + b=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram", + i=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai", + output: + expand( + MDIR + "{{sample}}/align/{{alnr}}/snv/slk2g/vcfs/{strelkachrm}/{{sample}}.ready", + strelkachrm=STRELKA2_CHRMS, + ), + threads: 1 + log: + MDIR + "{sample}/align/{alnr}/snv/slk2g/log/{sample}.{alnr}.chunkdirs.log", + shell: + """ + ( echo {output}; mkdir -p $(dirname 
{output}); touch {output}; ls {output}; ) > {log} 2>&1;
+        """
+
+# Per-chunk germline calling: configure and run Strelka2 on one sample CRAM, restricted to the region derived from the strelkachrm wildcard.
+rule strelka2_germline:
+    input:
+        cram=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram",
+        crai=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai",
+        ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"],
+        d=MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.ready",
+    output:
+        vcfgz=MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz",
+        vcftbi=MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz.tbi",
+    log:
+        MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.germline.log",
+    threads: config['strelka2']['threads']
+    container:
+        config['strelka2']['container']
+    resources:
+        vcpu=config['strelka2']['threads'],
+        threads=config['strelka2']['threads'],
+        partition=config['strelka2']['partition'],
+        mem_mb=config['strelka2']['mem_mb'],
+    params:
+        run_dir=MDIR + "{sample}/align/{alnr}/snv/slk2g/work/{sample}.germline.{strelkachrm}",
+        schrm=get_strelka_chrm_day,
+        cluster_sample=ret_sample,
+        cpre="" if "b37" == config['genome_build'] else "chr",
+        mito_code="MT" if "b37" == config['genome_build'] else "M",
+    shell:
+        r"""
+        set -euo pipefail
+        rm -rf {params.run_dir} && mkdir -p {params.run_dir}  # start clean: the configure script refuses a run dir that already holds runWorkflow.py
+        # Split the chunk spec on ':' into contig/start/end; with no end field the whole contig is used (length read from the .fai).
+        vchr=$(echo {params.cpre}{params.schrm} | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/' )
+        vchr=${{vchr%:}}
+        IFS=':' read -r vcontig vstart vend <<< "$vchr"
+        if [ -z "${{vend:-}}" ]; then
+            vstart=0
+            vend=$(awk -v c="$vcontig" '$1==c{{print $2; exit}}' {input.ref_fa}.fai)
+        fi
+        echo -e "$vcontig\t$vstart\t$vend" | bgzip -c > {params.run_dir}/region.bed.gz  # --callRegions only accepts a bgzip-compressed, tabix-indexed BED
+        tabix -f -p bed {params.run_dir}/region.bed.gz  # NOTE(review): assumes bgzip and tabix are on PATH inside the strelka2 container - confirm
+        configureStrelkaGermlineWorkflow.py \
+            --bam {input.cram} \
+            --referenceFasta {input.ref_fa} \
+            --callRegions {params.run_dir}/region.bed.gz \
+            --runDir {params.run_dir} >> {log} 2>&1
+        {params.run_dir}/runWorkflow.py -m local -j {threads} >> {log} 2>&1
+        cp {params.run_dir}/results/variants/variants.vcf.gz {output.vcfgz}
+        cp {params.run_dir}/results/variants/variants.vcf.gz.tbi {output.vcftbi}
+        """
+
+# Concatenate the per-chunk germline VCFs, rename the first sample column to params.cluster_sample, and tabix-index the result.
+rule strelka2_germline_concat:
+    input:
+        vcfs=lambda wildcards: expand(
+            MDIR + "{sample}/align/{alnr}/snv/slk2g/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.germline.vcf.gz",
+            sample=wildcards.sample,
+            alnr=wildcards.alnr,
+            strelkachrm=STRELKA2_CHRMS,
+        ),
+    output:
+        vcfgz=MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz",
+        vcfgztbi=MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi",
+    threads: 4
+    resources:
+        vcpu=4,
+        threads=4,
+        partition=config['strelka2']['partition'],
+        mem_mb=config['strelka2']['mem_mb'],
+    conda: "../envs/vanilla_v0.1.yaml"
+    log:
+        MDIR + "{sample}/align/{alnr}/snv/slk2g/log/{sample}.{alnr}.strelka2.germline.merge.log",
+    params:
+        cluster_sample=ret_sample,
+    shell:
+        """
+        bcftools concat -a -d all --threads {threads} -O z -o {output.vcfgz}.tmp {input.vcfs} >> {log} 2>&1;
+        oldname=$(bcftools query -l {output.vcfgz}.tmp | head -n1) >> {log} 2>&1;
+        echo -e "${{oldname}}\t{params.cluster_sample}" > {output.vcfgz}.rename.txt;
+        bcftools reheader -s {output.vcfgz}.rename.txt -o {output.vcfgz} {output.vcfgz}.tmp >> {log} 2>&1;
+        bcftools index -f -t --threads {threads} {output.vcfgz} >> {log} 2>&1;
+        rm -f {output.vcfgz}.tmp {output.vcfgz}.rename.txt;
+        """
+
+# Create the per-chunk somatic output directories and touch one .ready sentinel per entry of STRELKA2_CHRMS.
+rule strelka2_somatic_chunkdirs:
+    wildcard_constraints:
+        sample=TUMORS_REGEX
+    input:
+        b=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram",
+        i=MDIR + "{sample}/align/{alnr}/{sample}.{alnr}.cram.crai",
+    output:
+        expand(
+            MDIR + "{{sample}}/align/{{alnr}}/snv/slk2s/vcfs/{strelkachrm}/{{sample}}.ready",
+            strelkachrm=STRELKA2_CHRMS,
+        ),
+    threads: 1
+    log:
+        MDIR + "{sample}/align/{alnr}/snv/slk2s/log/{sample}.{alnr}.somatic.chunkdirs.log",
+    shell:
+        """
+        ( echo
{output}; mkdir -p $(dirname {output}); touch {output}; ls {output}; ) > {log} 2>&1;
+        """
+
+# Per-chunk somatic calling: configure and run Strelka2 on a tumour/normal CRAM pair, restricted to the region derived from the strelkachrm wildcard.
+rule strelka2_somatic:
+    wildcard_constraints:
+        sample=TUMORS_REGEX
+    input:
+        tumor_cram=get_somcall_tumor_cram,
+        tumor_crai=get_somcall_tumor_crai,
+        normal_cram=get_somcall_normal_cram,
+        normal_crai=get_somcall_normal_crai,
+        ref_fa=lambda wc: config["supporting_files"]["files"]["huref"]["fasta"]["name"],
+        d=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.ready",
+    output:
+        snv=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz",
+        snvtbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz.tbi",
+        indel=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz",
+        indeltbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz.tbi",
+    log:
+        MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/log/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.log",
+    threads: config['strelka2']['threads']
+    container:
+        config['strelka2']['container']
+    resources:
+        vcpu=config['strelka2']['threads'],
+        threads=config['strelka2']['threads'],
+        partition=config['strelka2']['partition'],
+        mem_mb=config['strelka2']['mem_mb'],
+    params:
+        run_dir=MDIR + "{sample}/align/{alnr}/snv/slk2s/work/{sample}.somatic.{strelkachrm}",
+        schrm=get_strelka_chrm_day,
+        cluster_sample=ret_sample,
+        cpre="" if "b37" == config['genome_build'] else "chr",
+        mito_code="MT" if "b37" == config['genome_build'] else "M",
+    shell:
+        r"""
+        set -euo pipefail
+        rm -rf {params.run_dir} && mkdir -p {params.run_dir}  # start clean: the configure script refuses a run dir that already holds runWorkflow.py
+        # Split the chunk spec on ':' into contig/start/end; with no end field the whole contig is used (length read from the .fai).
+        vchr=$(echo {params.cpre}{params.schrm} | sed 's/~/\:/g' | sed 's/23\:/X\:/' | sed 's/24\:/Y\:/' | sed 's/25\:/{params.mito_code}\:/' )
+        vchr=${{vchr%:}}
+        IFS=':' read -r vcontig vstart vend <<< "$vchr"
+        if [ -z "${{vend:-}}" ]; then
+            vstart=0
+            vend=$(awk -v c="$vcontig" '$1==c{{print $2; exit}}' {input.ref_fa}.fai)
+        fi
+        echo -e "$vcontig\t$vstart\t$vend" | bgzip -c > {params.run_dir}/region.bed.gz  # --callRegions only accepts a bgzip-compressed, tabix-indexed BED
+        tabix -f -p bed {params.run_dir}/region.bed.gz  # NOTE(review): assumes bgzip and tabix are on PATH inside the strelka2 container - confirm
+        configureStrelkaSomaticWorkflow.py \
+            --tumorBam {input.tumor_cram} \
+            --normalBam {input.normal_cram} \
+            --referenceFasta {input.ref_fa} \
+            --callRegions {params.run_dir}/region.bed.gz \
+            --runDir {params.run_dir} >> {log} 2>&1
+        {params.run_dir}/runWorkflow.py -m local -j {threads} >> {log} 2>&1
+        cp {params.run_dir}/results/variants/somatic.snvs.vcf.gz {output.snv}
+        cp {params.run_dir}/results/variants/somatic.snvs.vcf.gz.tbi {output.snvtbi}
+        cp {params.run_dir}/results/variants/somatic.indels.vcf.gz {output.indel}
+        cp {params.run_dir}/results/variants/somatic.indels.vcf.gz.tbi {output.indeltbi}
+        """
+
+# Concatenate the per-chunk somatic SNV and indel VCFs, relabel the TUMOR column with params.cluster_sample, and tabix-index both results.
+rule strelka2_somatic_concat:
+    wildcard_constraints:
+        sample=TUMORS_REGEX
+    input:
+        snv_vcfs=lambda wildcards: expand(
+            MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.snvs.vcf.gz",
+            sample=wildcards.sample,
+            alnr=wildcards.alnr,
+            strelkachrm=STRELKA2_CHRMS,
+        ),
+        indel_vcfs=lambda wildcards: expand(
+            MDIR + "{sample}/align/{alnr}/snv/slk2s/vcfs/{strelkachrm}/{sample}.{alnr}.strelka2.{strelkachrm}.somatic.indels.vcf.gz",
+            sample=wildcards.sample,
+            alnr=wildcards.alnr,
+            strelkachrm=STRELKA2_CHRMS,
+        ),
+    output:
+        snv=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz",
+        snvtbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi",
+        indel=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz",
+        indeltbi=MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi",
+    threads: 4
+    resources:
+        vcpu=4,
+        threads=4,
+        partition=config['strelka2']['partition'],
+        mem_mb=config['strelka2']['mem_mb'],
+    conda: "../envs/vanilla_v0.1.yaml"
+    log:
+        MDIR + "{sample}/align/{alnr}/snv/slk2s/log/{sample}.{alnr}.strelka2.somatic.merge.log",
+    params:
+        cluster_sample=ret_sample,
+    shell:
+        """
+        bcftools concat -a -d all --threads {threads} -O z -o {output.snv}.tmp {input.snv_vcfs} >> {log} 2>&1;
+        # Strelka2 somatic VCFs carry two sample columns, NORMAL then TUMOR; renaming the first column would label the normal with the tumour id, so TUMOR is renamed by name (cluster_sample is presumably the tumour sample id - confirm).
+        echo -e "TUMOR\t{params.cluster_sample}" > {output.snv}.rename.txt;
+        bcftools reheader -s {output.snv}.rename.txt -o {output.snv} {output.snv}.tmp >> {log} 2>&1;
+        bcftools index -f -t --threads {threads} {output.snv} >> {log} 2>&1;
+        rm -f {output.snv}.tmp {output.snv}.rename.txt;
+
+        bcftools concat -a -d all --threads {threads} -O z -o {output.indel}.tmp {input.indel_vcfs} >> {log} 2>&1;
+        # Same relabelling for the indel VCF: only the TUMOR column is renamed, NORMAL is left untouched.
+        echo -e "TUMOR\t{params.cluster_sample}" > {output.indel}.rename.txt;
+        bcftools reheader -s {output.indel}.rename.txt -o {output.indel} {output.indel}.tmp >> {log} 2>&1;
+        bcftools index -f -t --threads {threads} {output.indel} >> {log} 2>&1;
+        rm -f {output.indel}.tmp {output.indel}.rename.txt;
+        """
+
+rule produce_strelka2_germline_vcf:
+    input:
+        vcftb=expand(
+            MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz",
+            sample=SSAMPS,
+            alnr=ALIGNERS,
+        ),
+        vcftbi=expand(
+            MDIR + "{sample}/align/{alnr}/snv/slk2g/{sample}.{alnr}.strelka2.germline.vcf.gz.tbi",
+            sample=SSAMPS,
+            alnr=ALIGNERS,
+        ),
+    output:
+        "gatheredall.strelka2.germline",
+    threads: 4
+    log:
+        "gatheredall.strelka2.germline.log",
+    params:
+        cluster_sample=ret_sample,
+    resources:
+        vcpu=4,
+        threads=4,
+        partition=config['strelka2']['partition'],
+        mem_mb=config['strelka2']['mem_mb'],
+    shell: "touch {output}"  # a rule that declares an output needs an action that creates it; this sentinel only marks that all inputs exist
+
+rule produce_strelka2_somatic_vcf:
+    input:
+        snv=expand(
+            MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz",
+            sample=TN_TUMOR_SAMPS,
+            alnr=ALIGNERS,
+        ),
+        snvtbi=expand(
+            MDIR +
"{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.snvs.vcf.gz.tbi", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + indel=expand( + MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + indeltbi=expand( + MDIR + "{sample}/align/{alnr}/snv/slk2s/{sample}.{alnr}.strelka2.somatic.indels.vcf.gz.tbi", + sample=TN_TUMOR_SAMPS, + alnr=ALIGNERS, + ), + output: + "gatheredall.strelka2.somatic", + threads: 4 + log: + "gatheredall.strelka2.somatic.log", + params: + cluster_sample=ret_sample, + resources: + vcpu=4, + threads=4, + partition=config['strelka2']['partition'], + mem_mb=config['strelka2']['mem_mb'], diff --git a/workflow/schemas/analysis_manifest.schema.yaml b/workflow/schemas/analysis_manifest.schema.yaml index 6e2cfd9..b9c234d 100755 --- a/workflow/schemas/analysis_manifest.schema.yaml +++ b/workflow/schemas/analysis_manifest.schema.yaml @@ -5,10 +5,16 @@ description: Analysis Samples Manifest Validator. See README in config section properties: sample: type: string - description: format RU#_EX_SQ#_LANE where LANE leading zeros removed, and if merged lane =0 + description: > + Combination of RU, EX, SQ and lane used internally by the + workflow. If omitted, it will be generated automatically from + other fields. sample_lane: type: string - description: RU#_EX_SQ#_LANE with no lane substitution. Identifes the sub patterns to merge to create sample if merging is enabled. if merge_single=single then sample==sample_lane. if merge, then sample_lane is unique, sample is the aggregation pattern, which may be 1 to 1, but if set to merge should be 1(sample) pattern to >1(sample_lane) patterns.<--- this was implemented, then scaled back, and now is largely only really working for merging. + description: > + Fully qualified sample identifier including lane. When not + provided, it will be derived automatically from RU, EX, SQ and + LANE. 
SQ: type: string description: Sample Type @@ -58,8 +64,6 @@ properties: ont_cram_snv_caller: type: string required: - - sample - - sample_lane - SQ - RU - EX