From b97d6db8c16e2711577567feea2236e32aaa7529 Mon Sep 17 00:00:00 2001
From: mholt <mholt@pacificbiosciences.com>
Date: Thu, 15 Aug 2024 11:21:23 -0700
Subject: [PATCH] updates for v1.4.4

---
 CHANGELOG.md                                  |   8 +-
 Cargo.lock                                    |   2 +-
 Cargo.toml                                    |   2 +-
 LICENSE-THIRDPARTY.json                       |   2 +-
 docs/user_guide.md                            |   6 +
 src/writers/ordered_bam_writer.rs             |  67 +++++++-
 src/writers/ordered_vcf_writer.rs             | 148 +++++++++++++++++-
 test_data/iupac_test/small_variants.vcf.gz    | Bin 0 -> 608 bytes
 .../iupac_test/small_variants.vcf.gz.tbi      | Bin 0 -> 111 bytes
 test_data/iupac_test/test_reference.fa        |   2 +
 test_data/prephased_test/prephased.vcf        |  23 +++
 11 files changed, 252 insertions(+), 8 deletions(-)
 create mode 100644 test_data/iupac_test/small_variants.vcf.gz
 create mode 100644 test_data/iupac_test/small_variants.vcf.gz.tbi
 create mode 100644 test_data/iupac_test/test_reference.fa
 create mode 100644 test_data/prephased_test/prephased.vcf

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18b1e5d..8094f60 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,10 @@
-# 1.4.3
+# v1.4.4
+## Fixed
+* Fixed an error where phasing information that was present in input files would be copied through to output files if it was not overwritten by HiPhase phasing results. HiPhase will now automatically remove this phasing information to prevent accidental mixing of phase results. 
+  * For VCF files, any unphased genotypes will be switched to unphased and sorted by allele index (e.g. 1|0 -> 0/1). The "FORMAT:PS" and "FORMAT:PF" tags will either be removed entirely if the whole record is unphased or set to "." for partially phased records.
+  * For BAM files, the "HP" and "PS" tags will be removed for any unphased records.
+
+# v1.4.3
 ## Fixed
 * Replaced a panic caused by a chromosome appearing in a VCF but not in the BAM file with a more descriptive error message
 * Fixed an error caused by a multi-sample VCF with a mixture of haploid and diploid genotypes
diff --git a/Cargo.lock b/Cargo.lock
index 28deead..01de4e5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -526,7 +526,7 @@ dependencies = [
 
 [[package]]
 name = "hiphase"
-version = "1.4.3"
+version = "1.4.4"
 dependencies = [
  "bio",
  "bit-vec",
diff --git a/Cargo.toml b/Cargo.toml
index 6b5ab01..a20516e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "hiphase"
-version = "1.4.3"
+version = "1.4.4"
 authors = ["J. Matthew Holt <mholt@pacificbiosciences.com>"]
 description = "A tool for jointly phasing small, structural, and tandem repeat variants for PacBio sequencing data"
 edition = "2021"
diff --git a/LICENSE-THIRDPARTY.json b/LICENSE-THIRDPARTY.json
index 37f11a4..cf1a9f9 100644
--- a/LICENSE-THIRDPARTY.json
+++ b/LICENSE-THIRDPARTY.json
@@ -496,7 +496,7 @@
   },
   {
     "name": "hiphase",
-    "version": "1.4.3",
+    "version": "1.4.4",
     "authors": "J. Matthew Holt <mholt@pacificbiosciences.com>",
     "repository": null,
     "license": null,
diff --git a/docs/user_guide.md b/docs/user_guide.md
index b186549..bdc4151 100644
--- a/docs/user_guide.md
+++ b/docs/user_guide.md
@@ -337,3 +337,9 @@ Since TRGT calls often correspond to multiple small variant calls, the total num
 Additionally, this has the benefit of removing false heterozygous variants in the tandem repeat regions.
 This can have the side effect of (correctly) reducing block NG50 when those false variants are the only links between two otherwise unlinked phase blocks.
 Variants that are intentionally excluded from phasing this way will have a FORMAT `PF` tag of `TR_OVERLAP`.
+
+## Can I provide pre-existing phase information to HiPhase?
+Currently no, HiPhase will not use any pre-existing phase information in the provided VCF or BAM files.
+To prevent accidental mis-interpretation of results, HiPhase will remove or overwrite any existing phase information from the input files prior to writing the output phased VCFs and/or BAMs.
+For VCF files, this includes the "FORMAT:GT" (genotype), "FORMAT:PS" (phase set), and "FORMAT:PF" (phase filter) fields.
+For BAM files, this includes the "HP" (haplotype) and "PS" (phase set) tags.
diff --git a/src/writers/ordered_bam_writer.rs b/src/writers/ordered_bam_writer.rs
index 14c3832..dac3ef0 100644
--- a/src/writers/ordered_bam_writer.rs
+++ b/src/writers/ordered_bam_writer.rs
@@ -221,7 +221,8 @@ impl OrderedBamWriter {
                                             bam_writer.write(&record)?;
                                         },
                                         None => {
-                                            // no match, so just copy the read over
+                                            // no match, so just copy the read over after stripping any phase information
+                                            strip_record_phasing(&mut record)?;
                                             bam_writer.write(&record)?;
                                         }
                                     };
@@ -277,7 +278,7 @@ impl OrderedBamWriter {
                 Ok(()) => {
                     for record_result in bam_reader.records() {
                         // set up the record for the new BAM file
-                        let record = record_result?;
+                        let mut record = record_result?;
                         let record_pos = record.pos();
                         if record_pos < start_pos as i64 {
                             // this can happen when you have reads that overlap the location but don't start *after* the position
@@ -286,6 +287,7 @@ impl OrderedBamWriter {
                         }
 
                         // nothing left should be tagged
+                        strip_record_phasing(&mut record)?;
                         bam_writer.write(&record)?;
     
                         // adding this last bit should prevent double writes by accident from a user
@@ -336,9 +338,10 @@ impl OrderedBamWriter {
                     Ok(()) => {
                         for record_result in bam_reader.records() {
                             // set up the record for the new BAM file
-                            let record = record_result?;
+                            let mut record = record_result?;
                             
                             // nothing left should be tagged
+                            strip_record_phasing(&mut record)?;
                             bam_writer.write(&record)?;
                         }
                     },
@@ -353,3 +356,61 @@ impl OrderedBamWriter {
         Ok(())
     }
 }
+
+/// Removes the phase tags that HiPhase will normally write from a record.
+/// This is so we erase anything that did not come from the tool.
+fn strip_record_phasing(record: &mut bam::Record) -> Result<(), rust_htslib::errors::Error> {
+    // much simpler than the VCF version since there is no multi-sample to worry about, just drop these tags
+    match record.remove_aux(b"HP") {
+        // either it was remove or was not there to begin with
+        Ok(()) |
+        Err(rust_htslib::errors::Error::BamAuxTagNotFound) => {},
+        // some other crazy error
+        Err(e) => {
+            return Err(e);
+        }
+    };
+    match record.remove_aux(b"PS") {
+        // either it was remove or was not there to begin with
+        Ok(()) |
+        Err(rust_htslib::errors::Error::BamAuxTagNotFound) => Ok(()),
+        // some other crazy error
+        Err(e) => Err(e)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    
+    /// Simple empty bam record creator; if we just use a "new()" record, htslib is unhappy when we access Aux
+    fn dummy_bam_record() -> bam::Record {
+        let mut record = bam::Record::new();
+        record.set(
+            b"test_qname",
+            None,
+            b"A",
+            &[255]
+        );
+        record
+    }
+
+    #[test]
+    fn test_clear_record() {
+        // make sure running on a record without the tags is fine (this will be *most* cases)
+        let mut record = dummy_bam_record();
+        strip_record_phasing(&mut record).unwrap();
+
+        // create a record with the tags
+        let mut record = dummy_bam_record();
+        record.push_aux(b"HP", bam::record::Aux::U8(1)).unwrap();
+        record.push_aux(b"PS", bam::record::Aux::I32(42)).unwrap();
+        assert!(record.aux(b"HP").is_ok());
+        assert!(record.aux(b"PS").is_ok());
+
+        // strip them and verify they are gone
+        strip_record_phasing(&mut record).unwrap();
+        assert_eq!(record.aux(b"HP").err().unwrap(), rust_htslib::errors::Error::BamAuxTagNotFound);
+        assert_eq!(record.aux(b"PS").err().unwrap(), rust_htslib::errors::Error::BamAuxTagNotFound);
+    }
+}
\ No newline at end of file
diff --git a/src/writers/ordered_vcf_writer.rs b/src/writers/ordered_vcf_writer.rs
index 750c75b..d01ef7b 100644
--- a/src/writers/ordered_vcf_writer.rs
+++ b/src/writers/ordered_vcf_writer.rs
@@ -97,6 +97,12 @@ impl OrderedVcfWriter {
             
             //now setup the outputs, we want to do the header stuff here also
             let mut output_header: bcf::header::Header = bcf::header::Header::from_template(&vcf_header);
+            
+            // we can remove any existing headers without crashing anything
+            output_header.remove_format(b"PS"); 
+            output_header.remove_format(b"PF");
+            
+            // add in the new headers
             let cli_string: String = std::env::args().collect::<Vec<String>>().join(" ");
             let cli_version: &str = &crate::cli::FULL_VERSION;
             output_header.push_record(format!(r#"##HiPhase_version="{cli_version}""#).as_bytes());
@@ -308,12 +314,16 @@ impl OrderedVcfWriter {
                             // we have already written though, so don't write it again
                             continue;
                         }
+
+                        // the record will get written, clear out all phasing information first
+                        strip_record_phasing(&mut record)?;
                         
                         // we now have to iterate over each sample and modify the entries as necessary
                         let vcf_sample_indices = &self.sample_indices[vcf_index];
                         let mut changes_made: bool = false;
 
-                        // initialize the alleles array to be the same, these may change inside the loop
+                        // initialize the alleles array to be the same as what's currently in the record; this is post-phase stripping
+                        // alleles may get overwritten following this loop depending on the success of the phasing
                         let mut alleles = vec![];
                         let record_gt = record.genotypes().unwrap();
                         for si in 0..record.sample_count() {
@@ -422,3 +432,139 @@ impl OrderedVcfWriter {
         Ok(())
     }
 }
+
+/// This will remove all phasing information from a given VCF record.
+/// # Arguments
+/// * `record` - the VCF to strip all phasing from
+/// # Errors
+/// * if it fails to clear the PS tag
+/// * if it fails to parse genotypes
+/// * if it encounters any weird genotypes (we probably would fail before this)
+fn strip_record_phasing(record: &mut bcf::Record) -> Result<(), Box<dyn std::error::Error>> {
+    // first remove the "PS" tag
+    clear_format_tag(record, b"PS")?;
+    clear_format_tag(record, b"PF")?;
+
+    let record_gt = record.genotypes()?;
+    let mut alleles = vec![];
+    for si in 0..record.sample_count() {
+        let genotype = record_gt.get(si as usize);
+        match genotype.len() {
+            0 => bail!("Encountered empty genotype record at position {}", record.pos()),
+            1 => {
+                // TRGT can make single-allele GT calls, just copy it over as normal
+                // it will not be modified below because it is a homozygous allele
+                let g0 = unphase_genotype_allele(genotype[0]);
+                alleles.push(i32::from(g0));
+                alleles.push(i32::MIN+1); // sentinel value for end
+            },
+            2 => {
+                // this is 99.99999999% path
+                let g0 = i32::from(unphase_genotype_allele(genotype[0]));
+                let g1 = i32::from(unphase_genotype_allele(genotype[1]));
+                let min_value = g0.min(g1);
+                let max_value = g0.max(g1);
+                alleles.push(min_value);
+                alleles.push(max_value);
+            },
+            gt_len => {
+                // we do not have 3+ GT entries implemented
+                bail!("Encountered GT of length {} at record {}", gt_len, record.desc())
+            }
+        }
+    }
+    record.push_format_integer(b"GT", &alleles)?;
+
+    Ok(())
+}
+
+/// This will remove a specified format field from the provided record. 
+/// Other approaches seem to set it to "." instead of removing the field entirely.
+/// If the tag is absent (from this record or the entire file), this seemingly returns without error.
+/// # Arguments
+/// * `record` - the record to remove the tag from 
+/// * `tag` - the tag to remove, e.g. b"PS"
+/// # Errors
+/// * if we cannot remove the tag for some reason
+fn clear_format_tag(record: &mut bcf::Record, tag: &[u8]) -> Result<(), rust_htslib::errors::Error> {
+    let tag_c_str = std::ffi::CString::new(tag).unwrap();
+    unsafe {
+        if rust_htslib::htslib::bcf_update_format(
+            record.header().inner,
+            record.inner,
+            tag_c_str.into_raw(),
+            std::ptr::null() as *const ::std::os::raw::c_void,
+            0,
+            rust_htslib::htslib::BCF_HT_INT as i32
+        ) == 0 {
+            Ok(())
+        } else {
+            Err(rust_htslib::errors::Error::BcfSetTag { tag: std::str::from_utf8(tag).unwrap().to_owned() })
+        }
+    }
+}
+
+/// This will convert a genotype allele into an unphased version.
+/// # Arguments
+/// * `gt` - the genotype allele to unphase
+fn unphase_genotype_allele(gt: GenotypeAllele) -> GenotypeAllele {
+    match gt {
+        GenotypeAllele::Unphased(i) |
+        GenotypeAllele::Phased(i) => GenotypeAllele::Unphased(i),
+        GenotypeAllele::UnphasedMissing |
+        GenotypeAllele::PhasedMissing => GenotypeAllele::UnphasedMissing
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    
+    #[test]
+    fn test_clear_record() {
+        let vcf_fn = PathBuf::from("./test_data/prephased_test/prephased.vcf");
+        let mut vcf_reader = bcf::Reader::from_path(&vcf_fn).unwrap();
+
+        // expectations for sample 1
+        let expected_gt1 = vec![
+            vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
+            vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(1)],
+            vec![GenotypeAllele::Unphased(1)],
+            vec![GenotypeAllele::UnphasedMissing, GenotypeAllele::Unphased(1)]
+        ];
+
+        // expectations for sample 2
+        let expected_gt2 = vec![
+            vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(1)],
+            vec![GenotypeAllele::Unphased(0), GenotypeAllele::Unphased(0)],
+            vec![GenotypeAllele::Unphased(1), GenotypeAllele::Unphased(2)],
+            vec![GenotypeAllele::UnphasedMissing, GenotypeAllele::Unphased(0)]
+        ];
+
+        for (record_result, (exp_gt1, exp_gt2)) in vcf_reader.records()
+            .zip(expected_gt1.iter().zip(expected_gt2.iter())) {
+            // load the record and strip out the phasing information
+            let mut record = record_result.unwrap();
+            println!("Handling {}", record.desc());
+            strip_record_phasing(&mut record).unwrap();
+            
+            // verify that PS tag is completely gone
+            let ps_tag_result = record.format(b"PS").integer();
+            assert!(ps_tag_result.is_err());
+            assert_eq!(ps_tag_result.err().unwrap(), rust_htslib::errors::Error::BcfMissingTag { tag: "PS".to_string(), record: record.desc() });
+
+            // verify that PF tag is completely gone
+            let pf_tag_result = record.format(b"PF").string();
+            assert!(pf_tag_result.is_err());
+            assert_eq!(pf_tag_result.err().unwrap(), rust_htslib::errors::Error::BcfMissingTag { tag: "PF".to_string(), record: record.desc() });
+
+            // make sure GT 1 matches our expectation, which will be unphased
+            let genotype1 = record.genotypes().unwrap().get(0);
+            assert_eq!(genotype1.as_slice(), exp_gt1.as_slice());
+            
+            // make sure GT 2 matches our expectation, which will be unphased
+            let genotype2 = record.genotypes().unwrap().get(1);
+            assert_eq!(genotype2.as_slice(), exp_gt2.as_slice());
+        }
+    }
+}
\ No newline at end of file
diff --git a/test_data/iupac_test/small_variants.vcf.gz b/test_data/iupac_test/small_variants.vcf.gz
new file mode 100644
index 0000000000000000000000000000000000000000..8f3ef1ec2de662fc643fedf9f8934dfe0eace8cc
GIT binary patch
literal 608
zcmV-m0-yaKiwFb&00000{{{d;LjnLp0*#bSkDD+MhR@Eg7%i7pN@=p&-pWcSgfvpV
zme5>R4jzKFgH3IdRsMYiWZQ;R-JB2~^UULSY&<WQgt;~)cfoEFAHVqb1J8@+i*%X<
z-{w)U3Afu}gf=s>a#E|{E+m3nl0##m;?|<Ld%UbkFrRRu%d#0N?M{_c2Pm~b0#``&
zVw;NvD;<KgU@%96Dnq}|T<F)Qn)jF!t#T=v??I}LbmT^I<zSCZCxe_C?jA|MLdJ=j
zr>-z)3r*sx%SvC1ZyUFQTM$^eA{c=|Ta#-*Qi0Mfwj<3~@w(+qSJ7})mwPn9c$l6l
z4Ccz=0nH_PssvQp%H|F}<k~=O5niP$z@EyzCYb>zLPGnoQ(Y&^Fl}4R(q4zHGg2M?
z89H7SKi#4?(Mq&#oZxUK!OxnLbf-Qs+T21+kc%$Ie*JR3dcKXZluDL$3DE}h-l938
zXd`QP0K#lHiD6GVd-3}{!rwzlZgPZXBFMAiJcK@P7shYEumi@Vxx2*e&Kt5rKyG-}
zys%exo5dS!_8+@VVNiUs87CBfhC=cMNfufQ3r5#Upa7@aJTlT2tpnN7u5zD7x8{yQ
za`k@feF25s{3Xz>J3_3^$vhv?*rvnSfAB{G&&#xO^3d94g&7YC)xj0P`2KO=O};1V
zCEKjGY#y;>8nbYbvZo*6f_0~uHAj$j>tmLe72&huB@P-A`^tP4u4X9<Q}!@o&A*NL
uY?iXo=kYJVc|Va01pojaiwFb&00000{{{d;LjnLB00RI3000000002&oEjPc

literal 0
HcmV?d00001

diff --git a/test_data/iupac_test/small_variants.vcf.gz.tbi b/test_data/iupac_test/small_variants.vcf.gz.tbi
new file mode 100644
index 0000000000000000000000000000000000000000..7a541194f274b9de19c6b2bd6a8565aff7de4dbc
GIT binary patch
literal 111
zcmb2|=3rp}f&Xj_PR>jWK@8l5pHfm%5)u-U5)v9N@&Li9fLT);8x<`VpAgV_sL2xe
trK^>ZE9Stfy=smoi@J@C7BGeE6<}a6G|={J1ezp|W~4L&GuQ?Y0RRYQ925Wm

literal 0
HcmV?d00001

diff --git a/test_data/iupac_test/test_reference.fa b/test_data/iupac_test/test_reference.fa
new file mode 100644
index 0000000..e86e37d
--- /dev/null
+++ b/test_data/iupac_test/test_reference.fa
@@ -0,0 +1,2 @@
+>chr1
+ACGTARGTACGT
diff --git a/test_data/prephased_test/prephased.vcf b/test_data/prephased_test/prephased.vcf
new file mode 100644
index 0000000..bbf2b6a
--- /dev/null
+++ b/test_data/prephased_test/prephased.vcf
@@ -0,0 +1,23 @@
+##fileformat=VCFv4.2
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=RefCall,Description="Genotyping model thinks this site is reference.">
+##FILTER=<ID=LowQual,Description="Confidence in this variant being real is below calling threshold.">
+##FILTER=<ID=NoCall,Description="Site has depth=0 resulting in no call.">
+##INFO=<ID=END,Number=1,Type=Integer,Description="End position (for use with symbolic alleles)">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
+##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block.">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
+##FORMAT=<ID=VAF,Number=A,Type=Float,Description="Variant allele fractions.">
+##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Phred-scaled genotype likelihoods rounded to the closest integer">
+##FORMAT=<ID=MED_DP,Number=1,Type=Integer,Description="Median DP observed within the GVCF block rounded to the nearest integer.">
+##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set identifier">
+##FORMAT=<ID=PF,Number=1,Type=String,Description="Phasing flag">
+##DeepVariant_version=1.3.0
+##contig=<ID=chr1,length=12>
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample_name	sample_name2
+chr1	5	.	A	T	30	PASS	.	GT:PS:PF	0|1:5:test	1|0:5:.
+chr1	15	.	A	T	30	PASS	.	GT:PS	1|1:5	0|0:5
+chr1	25	.	A	T,C	30	PASS	.	GT:PS	1:5	2|1:5
+chr1	30	.	A	T	30	PASS	.	GT:PS	.|1:5	0|.:5
\ No newline at end of file