Skip to content

Commit

Permalink
Add a function for determining read pair orientation
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval committed Dec 24, 2024
1 parent 3680b4f commit 441551f
Show file tree
Hide file tree
Showing 2 changed files with 307 additions and 5 deletions.
116 changes: 111 additions & 5 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,8 @@ def _pysam_open(
assert file_type is not None, "Must specify file_type when writing to standard output"
path = sys.stdout
else:
file_type = file_type or SamFileType.from_path(path)
if file_type is None:
file_type = SamFileType.from_path(path)
path = str(path)
elif not isinstance(path, _IOClasses): # type: ignore[unreachable]
open_type = "reading" if open_for_reading else "writing"
Expand Down Expand Up @@ -606,6 +607,111 @@ def __getitem__(
return self.elements[index]


@enum.unique
class PairOrientation(enum.Enum):
"""Enumerations of read pair orientations."""

FR = "FR"
"""A pair orientation for forward-reverse reads ("innie")."""

RF = "RF"
"""A pair orientation for reverse-forward reads ("outie")."""

TANDEM = "TANDEM"
"""A pair orientation for tandem (forward-forward or reverse-reverse) reads."""

@classmethod
def build(
cls, r1: AlignedSegment, r2: Optional[AlignedSegment] = None
) -> Optional["PairOrientation"]:
"""Returns the pair orientation if both reads are mapped to the same reference sequence.
Args:
r1: The first read in the template.
r2: The second read in the template. If undefined, mate data on R1 will be used.
See:
- https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L71-L102
"""

if r2 is None:
r2_is_unmapped = r1.mate_is_unmapped
r2_reference_id = r1.next_reference_id
else:
r2_is_unmapped = r2.is_unmapped
r2_reference_id = r2.reference_id

if r1.is_unmapped or r2_is_unmapped or r1.reference_id != r2_reference_id:
return None

if r2 is None:
if not r1.has_tag("MC"):
raise ValueError('Cannot determine proper pair status without R2\'s cigar ("MC")!')
r2_cigar = Cigar.from_cigarstring(str(r1.get_tag("MC")))
r2_is_forward = r1.mate_is_forward
r2_reference_start = r1.next_reference_start
r2_reference_end = r1.next_reference_start + r2_cigar.length_on_target()
else:
r2_is_forward = r2.is_forward
r2_reference_start = r2.reference_start
r2_reference_end = r2.reference_end

if r1.is_forward is r2_is_forward:
return PairOrientation.TANDEM
elif r1.is_forward and r1.reference_start < r2_reference_end:
return PairOrientation.FR
elif r1.is_reverse and r2_reference_start < r1.reference_end:
return PairOrientation.FR
else:
return PairOrientation.RF


DefaultProperlyPairedOrientations = {PairOrientation.FR}
"""The default orientations for properly paired reads."""


def properly_paired(
r1: AlignedSegment,
r2: Optional[AlignedSegment] = None,
max_insert_size: int = 1000,
orientations: set[PairOrientation] = DefaultProperlyPairedOrientations,
) -> bool:
"""Determines if a read pair is properly paired or not.
Criteria for reads in a proper pair are:
- Both reads are aligned
- Both reads are aligned to the same reference sequence
- The pair orientation of the reads is a part of the valid pair orientations (default "FR")
- The inferred insert size is not more than a maximum length (default 1000)
Args:
r1: The first read in the template.
r2: The second read in the template. If undefined, mate data on R1 will be used.
max_insert_size: The maximum insert size to consider a read pair "proper".
orientations: The valid set of orientations to consider a read pair "proper".
See:
https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125
"""
if r2 is None:
r2_is_mapped = r1.mate_is_mapped
r2_reference_id = r1.next_reference_id
else:
r2_is_mapped = r2.is_mapped
r2_reference_id = r2.reference_id

return (
r1.is_mapped
and r2_is_mapped
and r1.reference_id == r2_reference_id
and PairOrientation.build(r1, r2) in orientations
# TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size`
# which can only be done if isize() is modified to allow for an optional R2.
and abs(r1.template_length) <= max_insert_size
)


@attr.s(frozen=True, auto_attribs=True)
class SupplementaryAlignment:
"""Stores a supplementary alignment record produced by BWA and stored in the SA SAM tag.
Expand Down Expand Up @@ -706,9 +812,8 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr
r1: read 1
r2: read 2 with the same queryname as r1
"""
assert not r1.is_unmapped, f"Cannot process unmapped mate {r1.query_name}/1"
assert not r2.is_unmapped, f"Cannot process unmapped mate {r2.query_name}/2"
assert r1.query_name == r2.query_name, "Attempting to pair reads with different qnames."
if r1.query_name != r2.query_name:
raise ValueError("Cannot set pair info on reads with different query names!")

for r in [r1, r2]:
r.is_paired = True
Expand All @@ -723,8 +828,9 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr
dest.next_reference_id = src.reference_id
dest.next_reference_start = src.reference_start
dest.mate_is_reverse = src.is_reverse
dest.mate_is_unmapped = False
dest.mate_is_unmapped = src.mate_is_unmapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

insert_size = isize(r1, r2)
r1.template_length = insert_size
Expand Down
196 changes: 196 additions & 0 deletions tests/fgpyo/sam/test_sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
from fgpyo.sam import CigarElement
from fgpyo.sam import CigarOp
from fgpyo.sam import CigarParsingException
from fgpyo.sam import PairOrientation
from fgpyo.sam import SamFileType
from fgpyo.sam import properly_paired
from fgpyo.sam.builder import SamBuilder


Expand Down Expand Up @@ -300,6 +302,200 @@ def test_is_clipping() -> None:
assert clips == [CigarOp.S, CigarOp.H]


def test_pair_orientation_build_with_r2() -> None:
"""Test that we can build all pair orientations with R1 and R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert PairOrientation.build(r1, r2) is PairOrientation.FR

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1, r2) is PairOrientation.RF

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1, r2) is PairOrientation.TANDEM

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1, r2) is PairOrientation.TANDEM


def test_pair_orientation_is_fr_if_opposite_directions_and_overlapping() -> None:
"""Test the pair orientation is always FR if the reads overlap and are oriented opposite."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
assert PairOrientation.build(r1, r2) is PairOrientation.FR

builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1, r2) is PairOrientation.FR


def test_pair_orientation_build_with_either_unmapped() -> None:
"""Test that we can return None with either R1 and R2 unmapped (or both)."""
builder = SamBuilder()
r1, r2 = builder.add_pair()
assert r1.is_unmapped
assert r2.is_unmapped
assert PairOrientation.build(r1, r2) is None

r1, r2 = builder.add_pair(chrom="chr1", start1=100)
assert r1.is_mapped
assert r2.is_unmapped
assert PairOrientation.build(r1, r2) is None

r1, r2 = builder.add_pair(chrom="chr1", start2=100)
assert r1.is_unmapped
assert r2.is_mapped
assert PairOrientation.build(r1, r2) is None


def test_pair_orientation_build_with_no_r2_but_r2_mapped() -> None:
"""Test that we can build all pair orientations with R1 and no R2, but R2 is mapped."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert PairOrientation.build(r1) is PairOrientation.FR

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1) is PairOrientation.RF

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1) is PairOrientation.TANDEM

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1) is PairOrientation.TANDEM


def test_pair_orientation_build_with_either_unmapped_but_no_r2() -> None:
"""Test that we can return None with either R1 and R2 unmapped (or both), but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair()
assert r1.is_unmapped
assert r2.is_unmapped
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1) is None

r1, r2 = builder.add_pair(chrom="chr1", start1=100)
assert r1.is_mapped
assert r2.is_unmapped
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1) is None

r1, r2 = builder.add_pair(chrom="chr1", start2=100)
assert r1.is_unmapped
assert r2.is_mapped
sam.set_pair_info(r1, r2)
assert PairOrientation.build(r1) is None


def test_pair_orientation_build_raises_if_it_cant_find_mate_cigar_tag() -> None:
"""Test that an exception is raised if we cannot find the mate cigar tag."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=10, start2=30)
sam.set_pair_info(r1, r2)
r1.set_tag("MC", None) # Clear out the MC tag.

with pytest.raises(ValueError):
PairOrientation.build(r1)


def test_properly_paired_when_actually_proper() -> None:
"""Test that properly_paired returns True when reads are properly paired."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert properly_paired(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert properly_paired(r1, r2)


def test_properly_paired_when_actually_proper_and_no_r2() -> None:
"""Test that properly_paired returns True when reads are properly paired, but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert properly_paired(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="10M", start2=100, cigar2="10M")
r1.is_reverse = True
r2.is_reverse = False
sam.set_pair_info(r1, r2)
assert properly_paired(r1)


def test_not_properly_paired_if_wrong_orientation() -> None:
"""Test that reads are not properly paired if they are not in the right orientation."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not properly_paired(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not properly_paired(r1, r2)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert not properly_paired(r1, r2)


def test_not_properly_paired_if_wrong_orientation_and_no_r2() -> None:
"""Test reads are not properly paired if they are not in the right orientation, but no R2."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not properly_paired(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = True
r2.is_forward = True
sam.set_pair_info(r1, r2)
assert not properly_paired(r1)

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
r1.is_forward = False
r2.is_forward = False
sam.set_pair_info(r1, r2)
assert not properly_paired(r1)


def test_not_properly_paired_if_too_far_apart() -> None:
"""Test that reads are not properly paired if they are too far apart."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=100 + 1000)
sam.set_pair_info(r1, r2)
assert not properly_paired(r1, r2)


def test_isize() -> None:
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
Expand Down

0 comments on commit 441551f

Please sign in to comment.