Skip to content

Commit

Permalink
Improvements to testing and parser
Browse files Browse the repository at this point in the history
- Added `__eq__` functionality to `parser.Residue` and `parser.Chain`
- Fixed issue with reading/writing files with new `_entity_poly.type` chain
  type determination
- Added several tests for the parser, including reading/writing tests

# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# On branch main
# Your branch is up to date with 'origin/main'.
#
# Changes to be committed:
#	modified:   rna3db/parser.py
#	new file:   tests/test_data/1ehz.cif
#	new file:   tests/test_data/3cgs.cif
#	new file:   tests/test_data/old_1ehz_A.cif
#	new file:   tests/test_parser.py
#
# Untracked files:
#	notebooks/4x4n.ipynb
#	notebooks/8opl.ipynb
#	notebooks/_entity_poly_investigate.ipynb
#	notebooks/_entity_poly_investigate.txt
#	notebooks/clan_investigate.ipynb
#	notebooks/dev.ipynb
#	notebooks/diff.ipynb
#	notebooks/families_with_hits.ipynb
#	notebooks/find_length_range.ipynb
#	notebooks/improved_split_strategy.ipynb
#	notebooks/make_atom_table.ipynb
#	notebooks/mini-overfit.ipynb
#	notebooks/multichain_small.ipynb
#	notebooks/o.cif
#	notebooks/parse_coords.ipynb
#	notebooks/remake_split.ipynb
#	notebooks/resolution.ipynb
#	notebooks/rna3db-json.ipynb
#	notebooks/rna3db-update.json
#	notebooks/rna3db.json
#	notebooks/split2_dev.ipynb
#	notebooks/split_dev.ipynb
#	notebooks/supplementary_table.ipynb
#	notebooks/table.ipynb
#	notebooks/tmp.ipynb
#
  • Loading branch information
marcellszi committed Oct 8, 2024
1 parent 5567f82 commit 1b041a9
Show file tree
Hide file tree
Showing 5 changed files with 7,787 additions and 24 deletions.
94 changes: 70 additions & 24 deletions rna3db/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,15 @@ def __init__(
three_letter_code: str,
one_letter_code: str,
index: int,
atoms: dict = None,
):
self.three_letter_code = three_letter_code
self.one_letter_code = one_letter_code
self.index = index
self.atoms = {}
# NOTE: we need to handle dict like this, cannot use `atoms: dict = {}` in method definition
# See important warning: https://docs.python.org/3/tutorial/controlflow.html#default-argument-values
# (the default value is evaluated only once, which causes issues with mutable dictionaries)
self.atoms = atoms if atoms else {}

@property
def code(self) -> str:
Expand All @@ -108,6 +112,15 @@ def code(self) -> str:
def is_missing(self) -> bool:
    """Return True when this residue has no atoms recorded.

    A residue counts as missing when `self.atoms` is empty.
    """
    # An empty container is falsy, so no explicit length comparison is needed.
    return not self.atoms

def __eq__(self, other) -> bool:
    """Compare two residues by one letter code, index and atoms.

    Returns:
        True/False for residue-like objects, or `NotImplemented` when
        `other` lacks the compared attributes, so that Python falls back
        to its default comparison instead of raising `AttributeError`
        (e.g. for `residue == None`).
    """
    # NOTE: we don't care about three letter codes, only one letter
    # this means modifications are still equal
    # NOTE(review): unless the class also defines `__hash__`, defining
    # `__eq__` leaves instances unhashable -- confirm this is intended.
    try:
        other_code = other.one_letter_code
        other_index = other.index
        other_atoms = other.atoms
    except AttributeError:
        # `other` is not residue-like; defer to the other operand
        return NotImplemented

    return (
        self.one_letter_code == other_code
        and self.index == other_index
        and self.atoms == other_atoms
    )

def __repr__(self):
return (
f"Residue(code={self.code}, three_letter_code={self.three_letter_code}, "
Expand Down Expand Up @@ -135,6 +148,17 @@ def __getitem__(self, idx):
def __len__(self):
    """Return the number of entries in `self.residues`."""
    residue_count = len(self.residues)
    return residue_count

def __eq__(self, other):
    """Compare two chains residue by residue.

    Chains are equal when they have the same length and every pair of
    residues at the same position compares equal.

    Returns:
        True/False for sized operands, or `NotImplemented` when `other`
        has no length (e.g. `None`), so that Python falls back to its
        default comparison instead of raising `TypeError`.
    """
    # NOTE: we ignore the author_id for equality checks
    try:
        same_length = len(self) == len(other)
    except TypeError:
        # `other` is not a sized object, so it cannot be chain-like
        return NotImplemented

    if not same_length:
        return False

    # stops at the first mismatching residue pair
    return not any(res_self != res_other for res_self, res_other in zip(self, other))

@property
def has_atoms(self):
    """Whether at least one residue yielded by iterating `self` is not missing."""
    # A generator lets `any` stop at the first residue that has atoms,
    # instead of building a full list first.
    return any(not res.is_missing for res in self)
Expand Down Expand Up @@ -314,9 +338,7 @@ def __init__(
)
file_parser = PDBParser
else:
raise ValueError(
f"The extension {self.path.suffix.lower()} is not supported."
)
raise ValueError(f"The extension `{path.suffix.lower()}` is not supported.")

# make the parser
parser = file_parser(
Expand Down Expand Up @@ -475,6 +497,15 @@ def write_mmcif_chain(self, output_path, author_id):
("N", "'RNA linking'", "y", '"N"', "?", "''", 0),
],
)
entity_poly = StructureFile._gen_mmcif_loop_str(
"entity_poly",
[
"entity_id",
"type",
],
[(1, "polyribonucleotide")],
)

entity_poly_seq_str = StructureFile._gen_mmcif_loop_str(
"entity_poly_seq",
[
Expand Down Expand Up @@ -518,6 +549,7 @@ def write_mmcif_chain(self, output_path, author_id):
f.write(header_str)
f.write(struct_asym_str)
f.write(chem_comp_str)
f.write(entity_poly)
f.write(entity_poly_seq_str)
f.write(atom_site_str)

Expand Down Expand Up @@ -648,14 +680,6 @@ def chains(self):
k = mmcif_chain_to_entity_id[mmcif_chain_id]
id_map[k].add(author_chain_id)

# get the chem_comp type for each mon_id
chem_comp_type = {
mon_id: comp_type
for mon_id, comp_type in zip(
self.parsed_info["_chem_comp.id"], self.parsed_info["_chem_comp.type"]
)
}

# parse full chains from "seqres"
chains_full = defaultdict(Chain)
for entity_id, mon_id, idx in zip(
Expand All @@ -673,14 +697,42 @@ def chains(self):
)
)

# Get chain/polymer types
chain_type = {}
for entity_id, poly_type in zip(
self.parsed_info["_entity_poly.entity_id"],
self.parsed_info["_entity_poly.type"],
# we check if we have _entity_poly
if (
"_entity_poly.entity_id" in self.parsed_info
and "_entity_poly.type" in self.parsed_info
):
for author_id in id_map[entity_id]:
chain_type[author_id] = poly_type
# get chain/polymer types
for entity_id, poly_type in zip(
self.parsed_info["_entity_poly.entity_id"],
self.parsed_info["_entity_poly.type"],
):
for author_id in id_map[entity_id]:
chain_type[author_id] = poly_type
else:
# if we don't have _entity_poly, we fall back to chem_comp type for each mon_id
# this is for backwards compatibility with mmCIFs from older RNA3DB releases
chem_comp_type = {
mon_id: comp_type
for mon_id, comp_type in zip(
self.parsed_info["_chem_comp.id"],
self.parsed_info["_chem_comp.type"],
)
}
for author_id, chain_data in chains_full.items():
# "keep" only chains that contain at least one `self.molecule_type`
if any(
[
self.molecule_type in chem_comp_type[i.three_letter_code]
for i in chain_data.residues
]
):
# if RNA we set to self.polymer_type (i.e. "polyribonucleotide")
chain_type[author_id] = self.polymer_type
else:
# we just set to "other" if not an RNA
chain_type[author_id] = "other"

# keep only chains of the appropriate polymer type
chains = {}
Expand Down Expand Up @@ -722,12 +774,6 @@ def chains(self):
)

# make sure that the sites actually match, should never be a mismatch
"""
assert (
site.three_letter_code
== chains[site.author_chain_id][seq_idx].three_letter_code
), f"residue mismatch at chain {site.author_chain_id} pos {seq_idx} (expected {site.three_letter_code}, got {chains[site.author_chain_id][seq_idx].three_letter_code})"
"""
if (
site.three_letter_code
!= chains[site.author_chain_id][seq_idx].three_letter_code
Expand Down
Loading

0 comments on commit 1b041a9

Please sign in to comment.