Skip to content

Commit

Permalink
Add sort_order to SamBuilder specifing output SAM order (#25)
Browse files Browse the repository at this point in the history
* Add SamOrder enum consistent with SAM spec
* Add _sort_order as private member of SamBuilder to avoid changes
   inconsistent with header
* Update poetry environment for newer python
    
Co-authored-by: Nathan Roach <nathan@fulcrumgenomics.com>
Co-authored-by: Ted Brookings <ted@fulcrumgenomics.com>
  • Loading branch information
NatPRoach authored Nov 14, 2023
1 parent 273c81c commit ca4a572
Show file tree
Hide file tree
Showing 5 changed files with 131 additions and 9 deletions.
12 changes: 12 additions & 0 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
- :class:`~fgpyo.sam.SupplementaryAlignment` -- Stores a supplementary alignment record
produced by BWA and stored in the SA SAM tag.
- :class:`~fgpyo.sam.SamFileType` -- Enumeration of valid SAM/BAM/CRAM file types.
- :class:`~fgpyo.sam.SamOrder` -- Enumeration of possible SAM/BAM/CRAM sort orders.
- :class:`~fgpyo.sam.CigarOp` -- Enumeration of operators that can appear in a Cigar string.
- :class:`~fgpyo.sam.CigarElement` -- Class representing an element in a Cigar string.
- :class:`~fgpyo.sam.CigarParsingException` -- The exception raised specific to parsing a
Expand Down Expand Up @@ -899,3 +900,14 @@ def __next__(self) -> Template:
name = self._iter.peek().query_name
recs = self._iter.takewhile(lambda r: r.query_name == name)
return Template.build(recs, validate=False)


class SamOrder(enum.Enum):
"""
Enumerations of possible sort orders for a SAM file.
"""

Unsorted = "unsorted" #: the SAM / BAM / CRAM is unsorted
Coordinate = "coordinate" #: coordinate sorted
QueryName = "queryname" #: queryname sorted
Unknown = "unknown" # Unknown SAM / BAM / CRAM sort order
35 changes: 29 additions & 6 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pathlib import Path
from random import Random
from tempfile import NamedTemporaryFile
from typing import IO
from typing import Any
from typing import Callable
from typing import Dict
Expand All @@ -26,6 +27,7 @@
from pysam import AlignmentHeader

from fgpyo import sam
from fgpyo.sam import SamOrder


class SamBuilder:
Expand Down Expand Up @@ -104,6 +106,7 @@ def __init__(
rg: Optional[Dict[str, str]] = None,
extra_header: Optional[Dict[str, Any]] = None,
seed: int = 42,
sort_order: SamOrder = SamOrder.Coordinate,
) -> None:
"""Initializes a new SamBuilder for generating alignment records and SAM/BAM files.
Expand All @@ -116,14 +119,20 @@ def __init__(
extra_header: a dictionary of extra values to add to the header, None otherwise. See
`::class::~pysam.AlignmentHeader` for more details.
seed: a seed value for random number/string generation
sort_order: Order to sort records when writing to file, or output of to_sorted_list()
"""

self.r1_len: int = r1_len if r1_len is not None else self.DEFAULT_R1_LENGTH
self.r2_len: int = r2_len if r2_len is not None else self.DEFAULT_R2_LENGTH
self.base_quality: int = base_quality
self.mapping_quality: int = mapping_quality

if not isinstance(sort_order, SamOrder):
raise ValueError(f"sort_order must be a SamOrder, got {type(sort_order)}")
self._sort_order = sort_order

self._header: Dict[str, Any] = {
"HD": {"VN": "1.5", "SO": "coordinate"},
"HD": {"VN": "1.5", "SO": sort_order.value},
"SQ": (sd if sd is not None else SamBuilder.default_sd()),
"RG": [(rg if rg is not None else SamBuilder.default_rg())],
}
Expand Down Expand Up @@ -554,7 +563,7 @@ def to_path(
Args:
path: a path at which to write the file, otherwise a temp file is used.
index: if True an index is generated, otherwise not.
index: if True and `sort_order` is `Coordinate` index is generated, otherwise not.
pred: optional predicate to specify which reads should be output
Returns:
Expand All @@ -566,16 +575,30 @@ def to_path(
path = Path(fp.name)

with NamedTemporaryFile(suffix=".bam", delete=True) as fp:
file_handle: IO
if self._sort_order in {SamOrder.Unsorted, SamOrder.Unknown}:
file_handle = path.open("w")
else:
file_handle = fp.file

with sam.writer(
fp.file, header=self._samheader, file_type=sam.SamFileType.BAM # type: ignore
file_handle, header=self._samheader, file_type=sam.SamFileType.BAM # type: ignore
) as writer:
for rec in self._records:
if pred(rec):
writer.write(rec)

pysam.sort("-o", str(path), fp.name) # type: ignore
if index:
pysam.index(str(path)) # type: ignore
samtools_sort_args = ["-o", str(path), fp.name]

file_handle.close()
if self._sort_order == SamOrder.QueryName:
# Ignore type hints for now until we have wrappers to use here.
pysam.sort("-n", *samtools_sort_args) # type: ignore
elif self._sort_order == SamOrder.Coordinate:
# Ignore type hints for now until we have wrappers to use here.
if index:
samtools_sort_args.insert(0, "--write-index")
pysam.sort(*samtools_sort_args) # type: ignore

return path

Expand Down
45 changes: 45 additions & 0 deletions fgpyo/sam/tests/test_builder.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
"""Basic tests of the builder module."""

from pathlib import Path
from typing import List
from typing import Optional

import pytest

from fgpyo import sam
from fgpyo.sam import SamOrder
from fgpyo.sam.builder import SamBuilder


Expand Down Expand Up @@ -247,6 +252,46 @@ def test_sorting() -> None:
last_start = start


def make_sort_order_builder(tmp_path: Path, sort_order: SamOrder) -> Path:
builder = SamBuilder(sort_order=sort_order)
builder.add_pair(
name="test3", chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+"
)
builder.add_pair(name="test2", chrom="chr1", start1=4000, start2=4300)
builder.add_pair(name="test1", chrom="chr5", start1=4000, start2=4300)
builder.add_pair(name="test4", chrom="chr2", start1=4000, start2=4300)

pos_path = tmp_path / "test.bam"
builder.to_path(pos_path)
return pos_path


@pytest.mark.parametrize(
argnames=["sort_order", "expected_name_order"],
argvalues=[
(SamOrder.Coordinate, ["test2", "test3", "test4", "test1"]),
(SamOrder.QueryName, ["test1", "test2", "test3", "test4"]),
(SamOrder.Unsorted, ["test3", "test2", "test1", "test4"]),
(SamOrder.Unknown, ["test3", "test2", "test1", "test4"]),
],
ids=["Coordinate sorting", "Query name sorting", "Unsorted output", "Unknown sorting"],
)
def test_sort_types(
tmp_path: Path, sort_order: Optional[SamOrder], expected_name_order: List[str]
) -> None:
bam_path = make_sort_order_builder(tmp_path=tmp_path, sort_order=sort_order)
with sam.reader(bam_path) as in_bam:
for name in expected_name_order:
read1 = next(in_bam)
assert (
name == read1.query_name
), "Position based read sort order did not match expectation"
read2 = next(in_bam)
assert (
name == read2.query_name
), "Position based read sort order did not match expectation"


def test_custom_sd() -> None:
builder1 = SamBuilder()
builder2 = SamBuilder(sd=[{"SN": "hi", "LN": 999}, {"SN": "bye", "LN": 888}])
Expand Down
42 changes: 40 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@ pysam = ">=0.22.0"
docs = ["sphinx", "sphinx_rtd_theme"]

[tool.poetry.dev-dependencies]
setuptools = ">=68.0.0"
pytest = ">=5.4.2"
mypy = ">=0.770"
flake8 = ">=3.8.1"
flake8 = [
{ version = ">=3.8.1", python = "<3.12.0" },
{ version = ">=6.1.0", python = ">=3.12.0" },
]
black = ">=19.10b0"
pytest-cov = ">=2.8.1"
isort = ">=5.10.1"
Expand Down

0 comments on commit ca4a572

Please sign in to comment.