Improvements to testing and parser

- Added `__eq__` functionality to `parser.Residue` and `parser.Chain`
- Fixed issue with reading/writing files with new `_entity_poly.type` chain type determination
- Added several tests for the parser, including reading/writing tests
# Please enter the commit message for your changes. Lines starting
# with '#' will be ignored, and an empty message aborts the commit.
#
# On branch main
# Your branch is up to date with 'origin/main'.
#
# Changes to be committed:
#   modified: rna3db/parser.py
#   new file: tests/test_data/1ehz.cif
#   new file: tests/test_data/3cgs.cif
#   new file: tests/test_data/old_1ehz_A.cif
#   new file: tests/test_parser.py
#
# Untracked files:
#   notebooks/4x4n.ipynb
#   notebooks/8opl.ipynb
#   notebooks/_entity_poly_investigate.ipynb
#   notebooks/_entity_poly_investigate.txt
#   notebooks/clan_investigate.ipynb
#   notebooks/dev.ipynb
#   notebooks/diff.ipynb
#   notebooks/families_with_hits.ipynb
#   notebooks/find_length_range.ipynb
#   notebooks/improved_split_strategy.ipynb
#   notebooks/make_atom_table.ipynb
#   notebooks/mini-overfit.ipynb
#   notebooks/multichain_small.ipynb
#   notebooks/o.cif
#   notebooks/parse_coords.ipynb
#   notebooks/remake_split.ipynb
#   notebooks/resolution.ipynb
#   notebooks/rna3db-json.ipynb
#   notebooks/rna3db-update.json
#   notebooks/rna3db.json
#   notebooks/split2_dev.ipynb
#   notebooks/split_dev.ipynb
#   notebooks/supplementary_table.ipynb
#   notebooks/table.ipynb
#   notebooks/tmp.ipynb
#
marcellszi · Oct 8, 2024 · 1b041a9 · 1b041a9
1 parent 5567f82
commit 1b041a9
Show file tree

Hide file tree

Showing 5 changed files with 7,787 additions and 24 deletions.
diff --git a/rna3db/parser.py b/rna3db/parser.py
@@ -94,11 +94,15 @@ def __init__(
         three_letter_code: str,
         one_letter_code: str,
         index: int,
+        atoms: dict = None,
     ):
         self.three_letter_code = three_letter_code
         self.one_letter_code = one_letter_code
         self.index = index
-        self.atoms = {}
+        # NOTE: we need to handle dict like this, cannot use `atoms: dict = {}` in method definition
+        # See important warning: https://docs.python.org/3/tutorial/controlflow.html#default-argument-values
+        # (the default value is evaluated only once, which causes issues with mutable dictionaries)
+        self.atoms = atoms if atoms else {}
 
     @property
     def code(self) -> str:
@@ -108,6 +112,15 @@ def code(self) -> str:
     def is_missing(self) -> bool:
         return not len(self.atoms) > 0
 
+    def __eq__(self, other) -> bool:
+        # NOTE: equality compares `one_letter_code`, `index` and `atoms` only.
+        #       `three_letter_code` is deliberately ignored, so modified residues still compare equal.
+        return (
+            self.one_letter_code == other.one_letter_code  # `other` is not type-checked; it must expose these same attributes
+            and self.index == other.index
+            and self.atoms == other.atoms
+        )
+
     def __repr__(self):
         return (
             f"Residue(code={self.code}, three_letter_code={self.three_letter_code}, "
@@ -135,6 +148,17 @@ def __getitem__(self, idx):
     def __len__(self):
         return len(self.residues)
 
+    def __eq__(self, other):
+        # NOTE: chains are equal when they have the same length and pairwise-equal residues; the author_id is ignored
+        if len(self) != len(other):
+            return False
+
+        for res_self, res_other in zip(self, other):  # lengths are equal at this point, so zip drops no residues
+            if res_self != res_other:
+                return False
+
+        return True
+
     @property
     def has_atoms(self):
         return any([not res.is_missing for res in self])
@@ -314,9 +338,7 @@ def __init__(
             )
             file_parser = PDBParser
         else:
-            raise ValueError(
-                f"The extension {self.path.suffix.lower()} is not supported."
-            )
+            raise ValueError(f"The extension `{path.suffix.lower()}` is not supported.")
 
         # make the parser
         parser = file_parser(
@@ -475,6 +497,15 @@ def write_mmcif_chain(self, output_path, author_id):
                 ("N", "'RNA linking'", "y", '"N"', "?", "''", 0),
             ],
         )
+        entity_poly = StructureFile._gen_mmcif_loop_str(
+            "entity_poly",
+            [
+                "entity_id",
+                "type",
+            ],
+            [(1, "polyribonucleotide")],
+        )
+
         entity_poly_seq_str = StructureFile._gen_mmcif_loop_str(
             "entity_poly_seq",
             [
@@ -518,6 +549,7 @@ def write_mmcif_chain(self, output_path, author_id):
             f.write(header_str)
             f.write(struct_asym_str)
             f.write(chem_comp_str)
+            f.write(entity_poly)
             f.write(entity_poly_seq_str)
             f.write(atom_site_str)
 
@@ -648,14 +680,6 @@ def chains(self):
             k = mmcif_chain_to_entity_id[mmcif_chain_id]
             id_map[k].add(author_chain_id)
 
-        # get the chem_comp type for each mon_id
-        chem_comp_type = {
-            mon_id: comp_type
-            for mon_id, comp_type in zip(
-                self.parsed_info["_chem_comp.id"], self.parsed_info["_chem_comp.type"]
-            )
-        }
-
         # parse full chains from "seqres"
         chains_full = defaultdict(Chain)
         for entity_id, mon_id, idx in zip(
@@ -673,14 +697,42 @@ def chains(self):
                     )
                 )
 
-        # Get chain/polymer types
         chain_type = {}
-        for entity_id, poly_type in zip(
-            self.parsed_info["_entity_poly.entity_id"],
-            self.parsed_info["_entity_poly.type"],
+        # we check if we have _entity_poly
+        if (
+            "_entity_poly.entity_id" in self.parsed_info
+            and "_entity_poly.type" in self.parsed_info
         ):
-            for author_id in id_map[entity_id]:
-                chain_type[author_id] = poly_type
+            # get chain/polymer types
+            for entity_id, poly_type in zip(
+                self.parsed_info["_entity_poly.entity_id"],
+                self.parsed_info["_entity_poly.type"],
+            ):
+                for author_id in id_map[entity_id]:
+                    chain_type[author_id] = poly_type
+        else:
+            # if we don't have _entity_poly, we fall back to chem_comp type for each mon_id
+            # this is for backwards compatibility with mmCIFs written by older RNA3DB releases
+            chem_comp_type = {
+                mon_id: comp_type
+                for mon_id, comp_type in zip(
+                    self.parsed_info["_chem_comp.id"],
+                    self.parsed_info["_chem_comp.type"],
+                )
+            }
+            for author_id, chain_data in chains_full.items():
+                # "keep" only chains that contain at least one `self.molecule_type`
+                if any(
+                    [
+                        self.molecule_type in chem_comp_type[i.three_letter_code]
+                        for i in chain_data.residues
+                    ]
+                ):
+                    # if RNA we set to self.polymer_type (i.e. "polyribonucleotide")
+                    chain_type[author_id] = self.polymer_type
+                else:
+                    # we just set to "other" if not an RNA
+                    chain_type[author_id] = "other"
 
         # keep only chains of the appropriate polymer type
         chains = {}
@@ -722,12 +774,6 @@ def chains(self):
                 )
 
                 # make sure that the sites actually match, should never be a mismatch
-                """
-                assert (
-                    site.three_letter_code
-                    == chains[site.author_chain_id][seq_idx].three_letter_code
-                ), f"residue mismatch at chain {site.author_chain_id} pos {seq_idx} (expected {site.three_letter_code}, got {chains[site.author_chain_id][seq_idx].three_letter_code})"
-                """
                 if (
                     site.three_letter_code
                     != chains[site.author_chain_id][seq_idx].three_letter_code