'MDAnalysis.analysis.nucleicacids' parallelization (#4727)

talagayev · web-flow · commit 7f686ca0daa8 · 2024-12-14T15:06:04.000-08:00
- Fixes #4670 - Adds parallel backend support to the class `NucPairDist` in nucleicacids.py - Adds parallelization tests in test_nucleicacids.py and fixtures in conftest.py - Updates the Changelog
diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -14,7 +14,7 @@ The rules for this file:
 
 
 -------------------------------------------------------------------------------
-??/??/?? IAlibay, ChiahsinChu, RMeli, tanishy7777
+??/??/?? IAlibay, ChiahsinChu, RMeli, tanishy7777, talagayev
 
  * 2.9.0
 
@@ -25,6 +25,7 @@ Fixes
    the function to prevent shared state. (Issue #4655)
 
 Enhancements
+ * Enable parallelization for analysis.nucleicacids.NucPairDist (Issue #4670)
  * Add check and warning for empty (all zero) coordinates in RDKit converter (PR #4824)
  * Added `precision` for XYZWriter (Issue #4775, PR #4771)
 
@@ -98,11 +99,11 @@ Enhancements
  * Introduce parallelization API to `AnalysisBase` and to `analysis.rms.RMSD` class
    (Issue #4158, PR #4304)
  * Enables parallelization for analysis.gnm.GNMAnalysis (Issue #4672)
- * explicitly mark `analysis.pca.PCA` as not parallelizable (Issue #4680)
- * enables parallelization for analysis.bat.BAT (Issue #4663)
- * enable parallelization for analysis.dihedrals.{Dihedral,Ramachandran,Janin}
+ * Explicitly mark `analysis.pca.PCA` as not parallelizable (Issue #4680)
+ * Enables parallelization for analysis.bat.BAT (Issue #4663)
+ * Enable parallelization for analysis.dihedrals.{Dihedral,Ramachandran,Janin}
    (Issue #4673) 
- * enables parallelization for analysis.dssp.dssp.DSSP (Issue #4674)
+ * Enables parallelization for analysis.dssp.dssp.DSSP (Issue #4674)
  * Enables parallelization for analysis.hydrogenbonds.hbond_analysis.HydrogenBondAnalysis (Issue #4664)
  * Improve error message for `AtomGroup.unwrap()` when bonds are not present.(Issue #4436, PR #4642)
  * Add `analysis.DSSP` module for protein secondary structure assignment, based on [pydssp](https://github.com/ShintaroMinami/PyDSSP)
diff --git a/package/MDAnalysis/analysis/nucleicacids.py b/package/MDAnalysis/analysis/nucleicacids.py
@@ -70,7 +70,7 @@
 
 import MDAnalysis as mda
 from .distances import calc_bonds
-from .base import AnalysisBase, Results
+from .base import AnalysisBase, ResultsGroup
 from MDAnalysis.core.groups import Residue, ResidueGroup
 
 
@@ -159,13 +159,23 @@ class NucPairDist(AnalysisBase):
     .. versionchanged:: 2.7.0
         Added static method :attr:`select_strand_atoms` as a
         helper for selecting atom pairs for distance analysis.
+
+    .. versionchanged:: 2.9.0
+       Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
+       backends; use the new method :meth:`get_supported_backends` to see all
+       supported backends.
     """
 
+    _analysis_algorithm_is_parallelizable = True
+
+    @classmethod
+    def get_supported_backends(cls):
+        return ('serial', 'multiprocessing', 'dask')
+
     _s1: mda.AtomGroup
     _s2: mda.AtomGroup
     _n_sel: int
-    _res_dict: Dict[int, List[float]]
-
+    
     def __init__(self, selection1: List[mda.AtomGroup],
                  selection2: List[mda.AtomGroup],
                  **kwargs) -> None:
@@ -276,7 +286,7 @@ def select_strand_atoms(
         return (sel1, sel2)
 
     def _prepare(self) -> None:
-        self._res_array: np.ndarray = np.zeros(
+        self.results.distances: np.ndarray = np.zeros(
             [self.n_frames, self._n_sel]
         )
 
@@ -285,13 +295,17 @@ def _single_frame(self) -> None:
             self._s1.positions, self._s2.positions
         )
 
-        self._res_array[self._frame_index, :] = dist
+        self.results.distances[self._frame_index, :] = dist
 
     def _conclude(self) -> None:
-        self.results['distances'] = self._res_array
         self.results['pair_distances'] = self.results['distances']
         # TODO: remove pair_distances in 3.0.0
 
+    def _get_aggregator(self):
+        return ResultsGroup(lookup={
+            'distances': ResultsGroup.ndarray_vstack,
+        }
+        )
 
 class WatsonCrickDist(NucPairDist):
     r"""
diff --git a/testsuite/MDAnalysisTests/analysis/conftest.py b/testsuite/MDAnalysisTests/analysis/conftest.py
@@ -14,6 +14,7 @@
 from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import (
     HydrogenBondAnalysis,
 )
+from MDAnalysis.analysis.nucleicacids import NucPairDist
 from MDAnalysis.lib.util import is_installed
 
 
@@ -141,3 +142,10 @@ def client_DSSP(request):
 @pytest.fixture(scope='module', params=params_for_cls(HydrogenBondAnalysis))
 def client_HydrogenBondAnalysis(request):
     return request.param
+
+
+# MDAnalysis.analysis.nucleicacids
+
+@pytest.fixture(scope="module", params=params_for_cls(NucPairDist))
+def client_NucPairDist(request):
+    return request.param
diff --git a/testsuite/MDAnalysisTests/analysis/test_nucleicacids.py b/testsuite/MDAnalysisTests/analysis/test_nucleicacids.py
@@ -55,12 +55,12 @@ def test_empty_ag_error(strand):
 
 
 @pytest.fixture(scope='module')
-def wc_rna(strand):
+def wc_rna(strand, client_NucPairDist):
     strand1 = ResidueGroup([strand.residues[0], strand.residues[21]])
     strand2 = ResidueGroup([strand.residues[1], strand.residues[22]])
 
     WC = WatsonCrickDist(strand1, strand2)
-    WC.run()
+    WC.run(**client_NucPairDist)
     return WC
 
 
@@ -114,23 +114,23 @@ def test_wc_dis_results_keyerrs(wc_rna, key):
         wc_rna.results[key]
 
 
-def test_minor_dist(strand):
+def test_minor_dist(strand, client_NucPairDist):
     strand1 = ResidueGroup([strand.residues[2], strand.residues[19]])
     strand2 = ResidueGroup([strand.residues[16], strand.residues[4]])
 
     MI = MinorPairDist(strand1, strand2)
-    MI.run()
+    MI.run(**client_NucPairDist)
 
     assert MI.results.distances[0, 0] == approx(15.06506, rel=1e-3)
     assert MI.results.distances[0, 1] == approx(3.219116, rel=1e-3)
 
 
-def test_major_dist(strand):
+def test_major_dist(strand, client_NucPairDist):
     strand1 = ResidueGroup([strand.residues[1], strand.residues[4]])
     strand2 = ResidueGroup([strand.residues[11], strand.residues[8]])
 
     MA = MajorPairDist(strand1, strand2)
-    MA.run()
+    MA.run(**client_NucPairDist)
 
     assert MA.results.distances[0, 0] == approx(26.884272, rel=1e-3)
     assert MA.results.distances[0, 1] == approx(13.578535, rel=1e-3)