diff --git a/fgpyo/fasta/sequence_dictionary.py b/fgpyo/fasta/sequence_dictionary.py index 56e068bf..2ed9e542 100644 --- a/fgpyo/fasta/sequence_dictionary.py +++ b/fgpyo/fasta/sequence_dictionary.py @@ -126,6 +126,7 @@ from dataclasses import field from dataclasses import replace from enum import unique +from pathlib import Path from typing import Any from typing import Dict from typing import Iterator @@ -137,6 +138,8 @@ from typing import Union from typing import overload +from fgpyo import sam + if sys.version_info[0] == 3 and sys.version_info[1] < 11: from strenum import StrEnum else: @@ -214,7 +217,7 @@ def parse(value: str) -> "AlternateLocus": class SequenceMetadata(MutableMapping[Union[Keys, str], str]): """Stores information about a single Sequence (ex. chromosome, contig). - Implements the mutable mapping interface, which provide access to the attributes of this + Implements the mutable mapping interface, which provides access to the attributes of this sequence, including name, length, but not index. When using the mapping interface, for example getting, setting, deleting, as well as iterating over keys, values, and items, the _values_ will always be strings (`str` type). For example, the length will be an `str` when accessing via @@ -446,28 +449,56 @@ def to_sam_header( @staticmethod @overload - def from_sam(header: pysam.AlignmentHeader) -> "SequenceDictionary": ... + def from_sam(data: Path) -> "SequenceDictionary": ... @staticmethod @overload - def from_sam(header: List[Dict[str, Any]]) -> "SequenceDictionary": ... + def from_sam(data: pysam.AlignmentFile) -> "SequenceDictionary": ... 
@staticmethod - def from_sam( - header: Union[pysam.AlignmentHeader, List[Dict[str, Any]]], - ) -> "SequenceDictionary": - """Creates a `SequenceDictionary` from either a `pysam.AlignmentHeader` or from - the list of sequences returned by `pysam.AlignmentHeader#to_dict()["SQ"]`.""" - if isinstance(header, pysam.AlignmentHeader): - return SequenceDictionary.from_sam(header=header.to_dict()["SQ"]) + @overload + def from_sam(data: pysam.AlignmentHeader) -> "SequenceDictionary": ... - infos: List[SequenceMetadata] = [ - SequenceMetadata.from_sam(meta=meta, index=index) for index, meta in enumerate(header) - ] + @staticmethod + @overload + def from_sam(data: List[Dict[str, Any]]) -> "SequenceDictionary": ... - return SequenceDictionary(infos=infos) + @staticmethod + def from_sam( + data: Union[Path, pysam.AlignmentFile, pysam.AlignmentHeader, List[Dict[str, Any]]], + ) -> "SequenceDictionary": + """Creates a `SequenceDictionary` from a SAM file or its header. - # TODO: mypyp doesn't like these + Args: + data: The input may be any of: + - a path to a SAM file + - an open `pysam.AlignmentFile` + - the `pysam.AlignmentHeader` associated with a `pysam.AlignmentFile` + - the contents of a header's `SQ` fields, as returned by `AlignmentHeader.to_dict()` + Returns: + A `SequenceDictionary` mapping reference names to their metadata. 
+ """ + seq_dict: SequenceDictionary + if isinstance(data, pysam.AlignmentHeader): + seq_dict = SequenceDictionary.from_sam(data.to_dict()["SQ"]) + elif isinstance(data, pysam.AlignmentFile): + seq_dict = SequenceDictionary.from_sam(data.header.to_dict()["SQ"]) + elif isinstance(data, Path): + with sam.reader(data) as fh: + seq_dict = SequenceDictionary.from_sam(fh.header) + else: # assuming `data` is a `list[dict[str, Any]]` + try: + infos: List[SequenceMetadata] = [ + SequenceMetadata.from_sam(meta=meta, index=index) + for index, meta in enumerate(data) + ] + seq_dict = SequenceDictionary(infos=infos) + except Exception as e: + raise ValueError(f"Could not parse sequence information from data: {data}") from e + + return seq_dict + + # TODO: mypy doesn't like these # @overload # def __getitem__(self, key: str) -> SequenceMetadata: ... # diff --git a/pyproject.toml b/pyproject.toml index 801f94b7..f1e7c30c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -88,3 +88,9 @@ warn_unused_configs = true warn_unused_ignores = true enable_error_code = "ignore-without-code" exclude = ["site/", "docs/"] + +[tool.coverage.report] +exclude_lines = [ + "pragma: not covered", + "@overload" +] diff --git a/tests/fgpyo/fasta/test_sequence_dictionary.py b/tests/fgpyo/fasta/test_sequence_dictionary.py index 493f1d14..22fb1b61 100644 --- a/tests/fgpyo/fasta/test_sequence_dictionary.py +++ b/tests/fgpyo/fasta/test_sequence_dictionary.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Any from typing import Dict from typing import List @@ -11,6 +12,8 @@ from fgpyo.fasta.sequence_dictionary import SequenceDictionary from fgpyo.fasta.sequence_dictionary import SequenceMetadata from fgpyo.fasta.sequence_dictionary import Topology +from fgpyo.sam import builder +from fgpyo.sam import reader def test_alternate_locus_raises_start_gt_end() -> None: @@ -315,10 +318,6 @@ def test_sequence_dictionary_same_as() -> None: assert not this.same_as(that) -# to_sam -# from_sam - - def 
test_sequence_dictionary_to_and_from_sam() -> None: sd = SequenceDictionary( infos=[ @@ -333,10 +332,15 @@ def test_sequence_dictionary_to_and_from_sam() -> None: header = pysam.AlignmentHeader.from_dict( header_dict={"HD": {"VN": "1.5"}, "SQ": mapping, "RG": [{"ID": "foo"}]} ) - + samfile: Path = builder.SamBuilder(sd=mapping).to_path() + alignment: pysam.AlignmentFile = reader(samfile) + assert SequenceDictionary.from_sam(samfile) == sd + assert SequenceDictionary.from_sam(alignment) == sd assert SequenceDictionary.from_sam(mapping) == sd assert SequenceDictionary.from_sam(header) == sd assert sd.to_sam_header(extra_header={"RG": [{"ID": "foo"}]}) + with pytest.raises(ValueError): + SequenceDictionary.from_sam([{}]) def test_sequence_dictionary_mapping() -> None: