diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index c78dd65..c0c0b04 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -265,7 +265,8 @@ def _pysam_open( assert file_type is not None, "Must specify file_type when writing to standard output" path = sys.stdout else: - file_type = file_type or SamFileType.from_path(path) + if file_type is None: + file_type = SamFileType.from_path(path) path = str(path) elif not isinstance(path, _IOClasses): # type: ignore[unreachable] open_type = "reading" if open_for_reading else "writing" @@ -606,6 +607,111 @@ def __getitem__( return self.elements[index] +@enum.unique +class PairOrientation(enum.Enum): + """Enumerations of read pair orientations.""" + + FR = "FR" + """A pair orientation for forward-reverse reads ("innie").""" + + RF = "RF" + """A pair orientation for reverse-forward reads ("outie").""" + + TANDEM = "TANDEM" + """A pair orientation for tandem (forward-forward or reverse-reverse) reads.""" + + @classmethod + def build( + cls, r1: AlignedSegment, r2: Optional[AlignedSegment] = None + ) -> Optional["PairOrientation"]: + """Returns the pair orientation if both reads are mapped to the same reference sequence. + + + Args: + r1: The first read in the template. + r2: The second read in the template. If undefined, mate data on R1 will be used. + + See: + - https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L71-L102 + """ + + if r2 is None: + r2_is_unmapped = r1.mate_is_unmapped + r2_reference_id = r1.next_reference_id + else: + r2_is_unmapped = r2.is_unmapped + r2_reference_id = r2.reference_id + + if r1.is_unmapped or r2_is_unmapped or r1.reference_id != r2_reference_id: + return None + + if r2 is None: + if not r1.has_tag("MC"): + raise ValueError('Cannot determine proper pair status without R2\'s cigar ("MC")!') + r2_cigar = Cigar.from_cigarstring(str(r1.get_tag("MC"))) + r2_is_forward = r1.mate_is_forward + r2_reference_start = r1.next_reference_start + r2_reference_end = r1.next_reference_start + r2_cigar.length_on_target() + else: + r2_is_forward = r2.is_forward + r2_reference_start = r2.reference_start + r2_reference_end = r2.reference_end + + if r1.is_forward is r2_is_forward: + return PairOrientation.TANDEM + elif r1.is_forward and r1.reference_start < r2_reference_end: + return PairOrientation.FR + elif r1.is_reverse and r2_reference_start < r1.reference_end: + return PairOrientation.FR + else: + return PairOrientation.RF + + +DefaultProperlyPairedOrientations = {PairOrientation.FR} +"""The default orientations for properly paired reads.""" + + +def properly_paired( + r1: AlignedSegment, + r2: Optional[AlignedSegment] = None, + max_insert_size: int = 1000, + orientations: set[PairOrientation] = DefaultProperlyPairedOrientations, +) -> bool: + """Determines if a read pair is properly paired or not. + + Criteria for reads in a proper pair are: + - Both reads are aligned + - Both reads are aligned to the same reference sequence + - The pair orientation of the reads is a part of the valid pair orientations (default "FR") + - The inferred insert size is not more than a maximum length (default 1000) + + Args: + r1: The first read in the template. + r2: The second read in the template. If undefined, mate data on R1 will be used. + max_insert_size: The maximum insert size to consider a read pair "proper". + orientations: The valid set of orientations to consider a read pair "proper". + + See: + https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125 + """ + if r2 is None: + r2_is_mapped = r1.mate_is_mapped + r2_reference_id = r1.next_reference_id + else: + r2_is_mapped = r2.is_mapped + r2_reference_id = r2.reference_id + + return ( + r1.is_mapped + and r2_is_mapped + and r1.reference_id == r2_reference_id + and PairOrientation.build(r1, r2) in orientations + # TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size` + # which can only be done if isize() is modified to allow for an optional R2. + and abs(r1.template_length) <= max_insert_size + ) + + @attr.s(frozen=True, auto_attribs=True) class SupplementaryAlignment: """Stores a supplementary alignment record produced by BWA and stored in the SA SAM tag. @@ -706,9 +812,8 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr r1: read 1 r2: read 2 with the same queryname as r1 """ - assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1" - assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2" - assert r1.query_name == r2.query_name, "Attempting to pair reads with different qnames." + if r1.query_name != r2.query_name: + raise ValueError("Cannot set pair info on reads with different query names!") for r in [r1, r2]: r.is_paired = True @@ -723,8 +828,9 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr dest.next_reference_id = src.reference_id dest.next_reference_start = src.reference_start dest.mate_is_reverse = src.is_reverse - dest.mate_is_unmapped = False + dest.mate_is_unmapped = src.mate_is_unmapped dest.set_tag("MC", src.cigarstring) + dest.set_tag("MQ", src.mapping_quality) insert_size = isize(r1, r2) r1.template_length = insert_size diff --git a/tests/fgpyo/sam/test_sam.py b/tests/fgpyo/sam/test_sam.py index b2086b6..497a35b 100755 --- a/tests/fgpyo/sam/test_sam.py +++ b/tests/fgpyo/sam/test_sam.py @@ -17,7 +17,9 @@ from fgpyo.sam import CigarElement from fgpyo.sam import CigarOp from fgpyo.sam import CigarParsingException +from fgpyo.sam import PairOrientation from fgpyo.sam import SamFileType +from fgpyo.sam import properly_paired from fgpyo.sam.builder import SamBuilder @@ -300,6 +302,200 @@ def test_is_clipping() -> None: assert clips == [CigarOp.S, CigarOp.H] +def test_pair_orientation_build_with_r2() -> None: + """Test that we can build all pair orientations with R1 and R2.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + assert PairOrientation.build(r1, r2) is PairOrientation.FR + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1, r2) is PairOrientation.RF + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = True + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1, r2) is PairOrientation.TANDEM + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = False + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1, r2) is PairOrientation.TANDEM + + +def test_pair_orientation_is_fr_if_opposite_directions_and_overlapping() -> None: + """Test the pair orientation is always FR if the reads overlap and are oriented opposite.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M") + assert PairOrientation.build(r1, r2) is PairOrientation.FR + + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M") + r1.is_reverse = True + r2.is_reverse = False + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1, r2) is PairOrientation.FR + + +def test_pair_orientation_build_with_either_unmapped() -> None: + """Test that we can return None with either R1 and R2 unmapped (or both).""" + builder = SamBuilder() + r1, r2 = builder.add_pair() + assert r1.is_unmapped + assert r2.is_unmapped + assert PairOrientation.build(r1, r2) is None + + r1, r2 = builder.add_pair(chrom="chr1", start1=100) + assert r1.is_mapped + assert r2.is_unmapped + assert PairOrientation.build(r1, r2) is None + + r1, r2 = builder.add_pair(chrom="chr1", start2=100) + assert r1.is_unmapped + assert r2.is_mapped + assert PairOrientation.build(r1, r2) is None + + +def test_pair_orientation_build_with_no_r2_but_r2_mapped() -> None: + """Test that we can build all pair orientations with R1 and no R2, but R2 is mapped.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + assert PairOrientation.build(r1) is PairOrientation.FR + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1) is PairOrientation.RF + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = True + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1) is PairOrientation.TANDEM + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = False + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1) is PairOrientation.TANDEM + + +def test_pair_orientation_build_with_either_unmapped_but_no_r2() -> None: + """Test that we can return None with either R1 and R2 unmapped (or both), but no R2.""" + builder = SamBuilder() + r1, r2 = builder.add_pair() + assert r1.is_unmapped + assert r2.is_unmapped + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1) is None + + r1, r2 = builder.add_pair(chrom="chr1", start1=100) + assert r1.is_mapped + assert r2.is_unmapped + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1) is None + + r1, r2 = builder.add_pair(chrom="chr1", start2=100) + assert r1.is_unmapped + assert r2.is_mapped + sam.set_pair_info(r1, r2) + assert PairOrientation.build(r1) is None + + +def test_pair_orientation_build_raises_if_it_cant_find_mate_cigar_tag() -> None: + """Test that an exception is raised if we cannot find the mate cigar tag.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=10, start2=30) + sam.set_pair_info(r1, r2) + r1.set_tag("MC", None) # Clear out the MC tag. + + with pytest.raises(ValueError): + PairOrientation.build(r1) + + +def test_properly_paired_when_actually_proper() -> None: + """Test that properly_paired returns True when reads are properly paired.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + assert properly_paired(r1, r2) + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M") + r1.is_reverse = True + r2.is_reverse = False + sam.set_pair_info(r1, r2) + assert properly_paired(r1, r2) + + +def test_properly_paired_when_actually_proper_and_no_r2() -> None: + """Test that properly_paired returns True when reads are properly paired, but no R2.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + assert properly_paired(r1) + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M") + r1.is_reverse = True + r2.is_reverse = False + sam.set_pair_info(r1, r2) + assert properly_paired(r1) + + +def test_not_properly_paired_if_wrong_orientation() -> None: + """Test that reads are not properly paired if they are not in the right orientation.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert not properly_paired(r1, r2) + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = True + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert not properly_paired(r1, r2) + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = False + sam.set_pair_info(r1, r2) + assert not properly_paired(r1, r2) + + +def test_not_properly_paired_if_wrong_orientation_and_no_r2() -> None: + """Test reads are not properly paired if they are not in the right orientation, but no R2.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert not properly_paired(r1) + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = True + r2.is_forward = True + sam.set_pair_info(r1, r2) + assert not properly_paired(r1) + + r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1.is_forward = False + r2.is_forward = False + sam.set_pair_info(r1, r2) + assert not properly_paired(r1) + + +def test_not_properly_paired_if_too_far_apart() -> None: + """Test that reads are not properly paired if they are too far apart.""" + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=100 + 1000) + sam.set_pair_info(r1, r2) + assert not properly_paired(r1, r2) + + def test_isize() -> None: builder = SamBuilder() r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")