From ca4a572524d2e5f79838250603e2fcf4d5d63155 Mon Sep 17 00:00:00 2001 From: Nathan Roach Date: Tue, 14 Nov 2023 14:35:21 -0700 Subject: [PATCH] Add sort_order to SamBuilder specifing output SAM order (#25) * Add SamOrder enum consistent with SAM spec * Add _sort_order as private member of SamBuilder to avoid changes inconsistent with header * Update poetry environment for newer python Co-authored-by: Nathan Roach Co-authored-by: Ted Brookings --- fgpyo/sam/__init__.py | 12 +++++++++ fgpyo/sam/builder.py | 35 ++++++++++++++++++++----- fgpyo/sam/tests/test_builder.py | 45 +++++++++++++++++++++++++++++++++ poetry.lock | 42 ++++++++++++++++++++++++++++-- pyproject.toml | 6 ++++- 5 files changed, 131 insertions(+), 9 deletions(-) diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index e054350a..caceadac 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -136,6 +136,7 @@ - :class:`~fgpyo.sam.SupplementaryAlignment` -- Stores a supplementary alignment record produced by BWA and stored in the SA SAM tag. - :class:`~fgpyo.sam.SamFileType` -- Enumeration of valid SAM/BAM/CRAM file types. + - :class:`~fgpyo.sam.SamOrder` -- Enumeration of possible SAM/BAM/CRAM sort orders. - :class:`~fgpyo.sam.CigarOp` -- Enumeration of operators that can appear in a Cigar string. - :class:`~fgpyo.sam.CigarElement` -- Class representing an element in a Cigar string. - :class:`~fgpyo.sam.CigarParsingException` -- The exception raised specific to parsing a @@ -899,3 +900,14 @@ def __next__(self) -> Template: name = self._iter.peek().query_name recs = self._iter.takewhile(lambda r: r.query_name == name) return Template.build(recs, validate=False) + + +class SamOrder(enum.Enum): + """ + Enumerations of possible sort orders for a SAM file. + """ + + Unsorted = "unsorted" #: the SAM / BAM / CRAM is unsorted + Coordinate = "coordinate" #: coordinate sorted + QueryName = "queryname" #: queryname sorted + Unknown = "unknown" # Unknown SAM / BAM / CRAM sort order diff --git a/fgpyo/sam/builder.py b/fgpyo/sam/builder.py index 8690473a..69f0db1e 100755 --- a/fgpyo/sam/builder.py +++ b/fgpyo/sam/builder.py @@ -14,6 +14,7 @@ from pathlib import Path from random import Random from tempfile import NamedTemporaryFile +from typing import IO from typing import Any from typing import Callable from typing import Dict @@ -26,6 +27,7 @@ from pysam import AlignmentHeader from fgpyo import sam +from fgpyo.sam import SamOrder class SamBuilder: @@ -104,6 +106,7 @@ def __init__( rg: Optional[Dict[str, str]] = None, extra_header: Optional[Dict[str, Any]] = None, seed: int = 42, + sort_order: SamOrder = SamOrder.Coordinate, ) -> None: """Initializes a new SamBuilder for generating alignment records and SAM/BAM files. @@ -116,14 +119,20 @@ def __init__( extra_header: a dictionary of extra values to add to the header, None otherwise. See `::class::~pysam.AlignmentHeader` for more details. seed: a seed value for random number/string generation + sort_order: Order to sort records when writing to file, or output of to_sorted_list() """ self.r1_len: int = r1_len if r1_len is not None else self.DEFAULT_R1_LENGTH self.r2_len: int = r2_len if r2_len is not None else self.DEFAULT_R2_LENGTH self.base_quality: int = base_quality self.mapping_quality: int = mapping_quality + + if not isinstance(sort_order, SamOrder): + raise ValueError(f"sort_order must be a SamOrder, got {type(sort_order)}") + self._sort_order = sort_order + self._header: Dict[str, Any] = { - "HD": {"VN": "1.5", "SO": "coordinate"}, + "HD": {"VN": "1.5", "SO": sort_order.value}, "SQ": (sd if sd is not None else SamBuilder.default_sd()), "RG": [(rg if rg is not None else SamBuilder.default_rg())], } @@ -554,7 +563,7 @@ def to_path( Args: path: a path at which to write the file, otherwise a temp file is used. - index: if True an index is generated, otherwise not. + index: if True and `sort_order` is `Coordinate` index is generated, otherwise not. pred: optional predicate to specify which reads should be output Returns: @@ -566,16 +575,30 @@ def to_path( path = Path(fp.name) with NamedTemporaryFile(suffix=".bam", delete=True) as fp: + file_handle: IO + if self._sort_order in {SamOrder.Unsorted, SamOrder.Unknown}: + file_handle = path.open("w") + else: + file_handle = fp.file + with sam.writer( - fp.file, header=self._samheader, file_type=sam.SamFileType.BAM # type: ignore + file_handle, header=self._samheader, file_type=sam.SamFileType.BAM # type: ignore ) as writer: for rec in self._records: if pred(rec): writer.write(rec) - pysam.sort("-o", str(path), fp.name) # type: ignore - if index: - pysam.index(str(path)) # type: ignore + samtools_sort_args = ["-o", str(path), fp.name] + + file_handle.close() + if self._sort_order == SamOrder.QueryName: + # Ignore type hints for now until we have wrappers to use here. + pysam.sort("-n", *samtools_sort_args) # type: ignore + elif self._sort_order == SamOrder.Coordinate: + # Ignore type hints for now until we have wrappers to use here. + if index: + samtools_sort_args.insert(0, "--write-index") + pysam.sort(*samtools_sort_args) # type: ignore return path diff --git a/fgpyo/sam/tests/test_builder.py b/fgpyo/sam/tests/test_builder.py index a2a94a09..b457fc90 100755 --- a/fgpyo/sam/tests/test_builder.py +++ b/fgpyo/sam/tests/test_builder.py @@ -1,8 +1,13 @@ """Basic tests of the builder module.""" +from pathlib import Path +from typing import List +from typing import Optional + import pytest from fgpyo import sam +from fgpyo.sam import SamOrder from fgpyo.sam.builder import SamBuilder @@ -247,6 +252,46 @@ def test_sorting() -> None: last_start = start +def make_sort_order_builder(tmp_path: Path, sort_order: SamOrder) -> Path: + builder = SamBuilder(sort_order=sort_order) + builder.add_pair( + name="test3", chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+" + ) + builder.add_pair(name="test2", chrom="chr1", start1=4000, start2=4300) + builder.add_pair(name="test1", chrom="chr5", start1=4000, start2=4300) + builder.add_pair(name="test4", chrom="chr2", start1=4000, start2=4300) + + pos_path = tmp_path / "test.bam" + builder.to_path(pos_path) + return pos_path + + +@pytest.mark.parametrize( + argnames=["sort_order", "expected_name_order"], + argvalues=[ + (SamOrder.Coordinate, ["test2", "test3", "test4", "test1"]), + (SamOrder.QueryName, ["test1", "test2", "test3", "test4"]), + (SamOrder.Unsorted, ["test3", "test2", "test1", "test4"]), + (SamOrder.Unknown, ["test3", "test2", "test1", "test4"]), + ], + ids=["Coordinate sorting", "Query name sorting", "Unsorted output", "Unknown sorting"], +) +def test_sort_types( + tmp_path: Path, sort_order: Optional[SamOrder], expected_name_order: List[str] +) -> None: + bam_path = make_sort_order_builder(tmp_path=tmp_path, sort_order=sort_order) + with sam.reader(bam_path) as in_bam: + for name in expected_name_order: + read1 = next(in_bam) + assert ( + name == read1.query_name + ), "Position based read sort order did not match expectation" + read2 = next(in_bam) + assert ( + name == read2.query_name + ), "Position based read sort order did not match expectation" + + def test_custom_sd() -> None: builder1 = SamBuilder() builder2 = SamBuilder(sd=[{"SN": "hi", "LN": 999}, {"SN": "bye", "LN": 888}]) diff --git a/poetry.lock b/poetry.lock index da0b6ac1..f49531f5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -228,6 +228,22 @@ mccabe = ">=0.7.0,<0.8.0" pycodestyle = ">=2.9.0,<2.10.0" pyflakes = ">=2.5.0,<2.6.0" +[[package]] +name = "flake8" +version = "6.1.0" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.8.1" +files = [ + {file = "flake8-6.1.0-py2.py3-none-any.whl", hash = "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5"}, + {file = "flake8-6.1.0.tar.gz", hash = "sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.11.0,<2.12.0" +pyflakes = ">=3.1.0,<3.2.0" + [[package]] name = "idna" version = "3.4" @@ -522,6 +538,17 @@ files = [ {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, ] +[[package]] +name = "pycodestyle" +version = "2.11.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycodestyle-2.11.1-py2.py3-none-any.whl", hash = "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67"}, + {file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"}, +] + [[package]] name = "pyflakes" version = "2.5.0" @@ -533,6 +560,17 @@ files = [ {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] +[[package]] +name = "pyflakes" +version = "3.1.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyflakes-3.1.0-py2.py3-none-any.whl", hash = "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774"}, + {file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"}, +] + [[package]] name = "pygments" version = "2.13.0" @@ -773,7 +811,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"] name = "setuptools" version = "68.0.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "setuptools-68.0.0-py3-none-any.whl", hash = "sha256:11e52c67415a381d10d6b462ced9cfb97066179f0e871399e006c4ab101fc85f"}, @@ -1071,4 +1109,4 @@ docs = ["sphinx", "sphinx_rtd_theme"] [metadata] lock-version = "2.0" python-versions = ">=3.7.0,<4.0" -content-hash = "5c35d3ca5036eff3f0862d568297def94f2c1062bd57cf87d9d9e9a78905bfe8" +content-hash = "5e1d5ff1445474f8dfe2e508fe67f20feb7dd02642d188f1e657a348bc333fd0" diff --git a/pyproject.toml b/pyproject.toml index 09e47a56..783be7c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,13 @@ pysam = ">=0.22.0" docs = ["sphinx", "sphinx_rtd_theme"] [tool.poetry.dev-dependencies] +setuptools = ">=68.0.0" pytest = ">=5.4.2" mypy = ">=0.770" -flake8 = ">=3.8.1" +flake8 = [ + { version = ">=3.8.1", python = "<3.12.0" }, + { version = ">=6.1.0", python = ">=3.12.0" }, +] black = ">=19.10b0" pytest-cov = ">=2.8.1" isort = ">=5.10.1"