Merge pull request #65 from JecaTosovic/fix_water_clustering_defaults

Make WaterClustering defaults consistent with hydrogen clustering
JecaTosovic · Apr 22, 2024 · b3ef0bc · b3ef0bc
2 parents 2170536 + 595269f
commit b3ef0bc
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 48 deletions.
diff --git a/ConservedWaterSearch/hydrogen_orientation.py b/ConservedWaterSearch/hydrogen_orientation.py
@@ -25,14 +25,14 @@ def hydrogen_orientation_analysis(
     HCW_angstd_cutoff: float = 17,
     WCW_angstd_cutoff: float = 20,
     weakly_explained: float = 0.7,
-    xiFCW: tuple[float]|list[float] = (0.03,),
-    xiHCW: tuple[float]|list[float] = (0.05, 0.01),
-    xiWCW: tuple[float]|list[float] = (0.05, 0.001),
+    xiFCW: tuple[float] | list[float] = (0.03,),
+    xiHCW: tuple[float] | list[float] = (0.05, 0.01),
+    xiWCW: tuple[float] | list[float] = (0.05, 0.001),
     njobs: int = 1,
     verbose: int = 0,
     debugH: int = 0,
     plotreach: bool = False,
-    which: tuple[str]|list[str] = ("FCW", "HCW", "WCW"),
+    which: tuple[str] | list[str] = ("FCW", "HCW", "WCW"),
     normalize_orientations: bool = True,
 ) -> list:
     """Determines if the water cluster is conserved and of what type.
@@ -66,13 +66,13 @@ def hydrogen_orientation_analysis(
             kmeans inertia (measure of spread of data in a cluster).
             Defaults to 0.4.
         FCW_angdiff_cutoff (float, optional): Maximum value of angle (in
-            deg) allowed for FCW in OPTICS/HDBSCAN clustering to be
+            deg) allowed for FCW in OPTICS clustering to be
             considered correct water angle. Defaults to 5.
         FCW_angstd_cutoff (float, optional): Maximal standard deviation
             of angle distribution of orientations of two hydrogens
             allowed for water to be considered FCW. Defaults to 17.
         min_samp_data_size_pct (float, optional): Minimum samples to
-            choose for OPTICS or HDBSCAN clustering as percentage of
+            choose for OPTICS clustering as percentage of
             number of water molecules considered for HCW and WCW.
             Defaults to 0.15.
         nonFCW_angdiff_cutoff (float, optional): Maximum standard
@@ -229,7 +229,7 @@ def find_fully_conserved_orientations(
             kmeans inertia (measure of spread of data in a cluster).
             Defaults to 0.4.
         angdiff_cutoff (float, optional): Maximum value of angle (in
-            deg) allowed for FCW in OPTICS/HDBSCAN clustering to be
+            deg) allowed for FCW in OPTICS clustering to be
             considered correct water angle. Defaults to 5.
         angstd_cutoff (float, optional): Maximal standard deviation
             of angle distribution of orientations of two hydrogens
@@ -431,7 +431,7 @@ def find_half_conserved_orientations(
         pct_size_buffer (float, optional): Minimum allowed size of the
             hydrogen orientation cluster. Defaults to 0.85.
         min_samp_data_size_pct (float, optional): Minimum samples to
-            choose for OPTICS or HDBSCAN clustering as percentage of
+            choose for OPTICS clustering as percentage of
             number of water molecules considered for HCW and WCW.
             Defaults to 0.15.
         angdiff_cutoff (float, optional): Maximum standard
@@ -585,7 +585,7 @@ def find_weakly_conserved_orientations(
         lower_bound_pct_buffer (float, optional): Minimum allowed size of the
             hydrogen orientation cluster. Defaults to 0.35.
         min_samp_data_size_pct (float, optional): Minimum samples to
-            choose for OPTICS or HDBSCAN clustering as percentage of
+            choose for OPTICS clustering as percentage of
             number of water molecules considered for HCW and WCW.
             Defaults to 0.15.
         pct_explained (float, optional): percentage of explained

diff --git a/ConservedWaterSearch/water_clustering.py b/ConservedWaterSearch/water_clustering.py
@@ -65,10 +65,19 @@ def __init__(
         self,
         nsnaps: int,
         clustering_algorithm: str = "OPTICS",
-        water_types_to_find: list[str] | None = None,
+        water_types_to_find: tuple[str] | list[str] = ("FCW", "HCW", "WCW"),
         restart_after_found: bool = False,
         min_samples: list[int] | None = None,
-        xis: list[float] | None = None,
+        xis: tuple[float] | list[float] = (
+            0.1,
+            0.05,
+            0.01,
+            0.005,
+            0.001,
+            0.0005,
+            0.0001,
+            1e-05,
+        ),
         numbpct_oxygen: float = 0.8,
         normalize_orientations: bool = True,
         numbpct_hyd_orient_analysis: float = 0.85,
@@ -81,9 +90,9 @@ def __init__(
         HCW_angstd_cutoff: float = 17,
         WCW_angstd_cutoff: float = 20,
         weakly_explained: float = 0.7,
-        xiFCW: list[float] | None = None,
-        xiHCW: list[float] | None = None,
-        xiWCW: list[float] | None = None,
+        xiFCW: tuple[float] | list[float] = (0.03,),
+        xiHCW: tuple[float] | list[float] = (0.05, 0.01),
+        xiWCW: tuple[float] | list[float] = (0.05, 0.001),
         njobs: int = 1,
         verbose: int = 0,
         debugO: int = 0,
@@ -104,24 +113,24 @@ def __init__(
             clustering_algorithm (str, optional): Options are "OPTICS"
                 or "HDBSCAN". OPTICS provides slightly better results,
                 but is also slightly slower. Defaults to "OPTICS".
-            water_types_to_find (list[str], optional): Defines which
+            water_types_to_find (tuple[str], optional): Defines which
                 water types to search for. Any combination of "FCW",
                 "HCW" and "WCW" is allowed, or "onlyO" for oxygen
-                clustering only. Defaults to ["FCW", "HCW", "WCW"].
+                clustering only. Defaults to ("FCW", "HCW", "WCW").
             restart_after_found (bool, optional): If ``True`` restarts
                 clustering after each water is found. ``False`` will
-                give the quick version of multi-stage reculstering
+                give the quick version of multi-stage reclustering
                 approach. Defaults to False.
             min_samples (list[int], optional): List of minimum samples
                 for OPTICS or HDBSCAN. If ``None``, the range
                 ``[int(0.25 * nsnaps), nsnaps]`` is used. For single
                 clustering users should provide a single integer between
                 0 and ``nsnaps`` in a list. Defaults to None.
-            xis (list[float], optional): List of xis for OPTICS
+            xis (tuple[float], optional): List or tuple of xis for OPTICS
                 clustering. This is ignored for HDBSCAN. Defaults to
-                [ 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
-                0.00001]. For single clustering users should provide a
-                single float between 0 and 1 in a list.
+                (0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001,
+                0.00001). For single clustering, users should provide a
+                single float between 0 and 1 in a list/tuple.
             numbpct_oxygen (float, optional): Percentage of
                 ``nsnaps`` required for oxygen cluster to be considered
                 valid and water conserved. The check is enforced on
@@ -161,15 +170,15 @@ def __init__(
             weakly_explained (float, optional): percentage of explained
                 hydrogen orientations for water to be considered WCW.
                 Defaults to 0.7.
-            xiFCW (list, optional): Xi value for OPTICS clustering for
-                FCW. Don't touch this unless you know what you are
-                doing. Defaults to [0.03].
-            xiHCW (list, optional): Xi value for OPTICS clustering for
-                HCW. Don't touch this unless you know what you are doing.
-                Defaults to [0.05, 0.01].
-            xiWCW (list, optional): Xi value for OPTICS clustering for
-                WCW. Don't touch this unless you know what you are doing.
-                Defaults to [0.05, 0.001].
+            xiFCW (tuple[float], optional): Xi value for hydrogen clustering of
+                FCWs for OPTICS algorithm. Avoid changing the defaults if
+                possible. Defaults to (0.03,).
+            xiHCW (tuple[float], optional): Xi value for OPTICS clustering for
+                HCW. Avoid changing the defaults if possible.
+                Defaults to (0.05, 0.01).
+            xiWCW (tuple[float], optional): Xi value for OPTICS clustering for
+                WCW. Avoid changing the defaults if possible.
+                Defaults to (0.05, 0.001).
             njobs (int, optional): how many cpu cores to use for clustering.
                 Defaults to 1.
             verbose (int, optional): verbosity of output. Defaults to 0.
@@ -191,16 +200,21 @@ def __init__(
                 ``output_file`` have to be provided for clustering
                 restarting. Defaults to None.
         """
-        if xiWCW is None:
-            xiWCW = [0.05, 0.001]
-        if xiHCW is None:
-            xiHCW = [0.05, 0.01]
-        if xiFCW is None:
-            xiFCW = [0.03]
-        if water_types_to_find is None:
-            water_types_to_find = ["FCW", "HCW", "WCW"]
-        if xis is None:
-            xis = [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 1e-05]
+        if not isinstance(water_types_to_find, (tuple, list)):
+            if isinstance(water_types_to_find, str):
+                water_types_to_find = (water_types_to_find,)
+        if not isinstance(xis, (tuple, list)):
+            if isinstance(xis, float):
+                xis = tuple(xis)
+        if not isinstance(xiFCW, (tuple, list)):
+            if isinstance(xiFCW, float):
+                xiFCW = tuple(xiFCW)
+        if not isinstance(xiHCW, (tuple, list)):
+            if isinstance(xiHCW, float):
+                xiHCW = tuple(xiHCW)
+        if not isinstance(xiWCW, (tuple, list)):
+            if isinstance(xiWCW, float):
+                xiWCW = tuple(xiWCW)
         if nsnaps <= 0:
             msg = f"nsnaps must be positive {nsnaps}"
             raise Exception(msg)
@@ -514,10 +528,10 @@ def read_and_set_water_clust_options(self, file_name: str) -> None:
                 lines: list[str] = f.read().splitlines()
                 self.nsnaps = int(lines[0].strip())
                 self.clustering_algorithm = lines[1].strip(" ")
-                self.water_types_to_find = [i for i in lines[2].split(" ")]
+                self.water_types_to_find = tuple([i for i in lines[2].split(" ")])
                 self.restart_after_find = lines[3] == "True"
                 self.min_samples = [int(i) for i in lines[4].split(" ")]
-                self.xis = [float(i) for i in lines[5].split(" ")]
+                self.xis = tuple([float(i) for i in lines[5].split(" ")])
                 self.numbpct_oxygen = float(lines[6])
                 self.normalize_orientations = lines[7] == "True"
                 self.numbpct_hyd_orient_analysis = float(lines[8])
@@ -530,9 +544,9 @@ def read_and_set_water_clust_options(self, file_name: str) -> None:
                 self.halfcon_angstd_cutoff = float(lines[15])
                 self.weakly_angstd_cutoff = float(lines[16])
                 self.weakly_explained = float(lines[17])
-                self.xiFCW = [float(i) for i in lines[18].split(" ")]
-                self.xiHCW = [float(i) for i in lines[19].split(" ")]
-                self.xiWCW = [float(i) for i in lines[20].split(" ")]
+                self.xiFCW = tuple([float(i) for i in lines[18].split(" ")])
+                self.xiHCW = tuple([float(i) for i in lines[19].split(" ")])
+                self.xiWCW = tuple([float(i) for i in lines[20].split(" ")])
                 self.njobs = int(lines[21])
                 self.verbose = int(lines[22])
                 self.debugO = int(lines[23])
@@ -1056,7 +1070,10 @@ def _check_cls_alg_and_whichH(self):
             raise Exception(msg)
         for i in self.water_types_to_find:
             if i not in ["FCW", "HCW", "WCW", "onlyO"]:
-                msg = "whichH supports onlyO or any combination of FCW, HCW and WCW"
+                msg = (
+                    "whichH supports onlyO or any combination of FCW, HCW and WCW"
+                    f" given option is invalid {i}"
+                )
                 raise Exception(msg)
         if "onlyO" in self.water_types_to_find and len(self.water_types_to_find) > 1:
             msg = "onlyO cannot be used with other water types"

diff --git a/docs/source/citing.rst b/docs/source/citing.rst
@@ -23,4 +23,3 @@ For citations, the following BibTeX entry can be used:
     doi = {10.1021/acs.jcim.2c00801},
     URL = {https://doi.org/10.1021/acs.jcim.2c00801},
     }
-
diff --git a/tests/test_water_clustering.py b/tests/test_water_clustering.py
@@ -128,7 +128,7 @@ def test_save_clustering_options():
 
 def test_create_from_file():
     ca = "OPTICS"
-    whichH = ["onlyO"]
+    whichH = ("onlyO")
     wc = WaterClustering(10, clustering_algorithm=ca, water_types_to_find=whichH)
     with tempfile.NamedTemporaryFile(mode="w+", delete=True) as f:
         wc._save_clustering_options(f.name)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -23,4 +23,3 @@ For citations, the following BibTeX entry can be used:
		doi = {10.1021/acs.jcim.2c00801},
		URL = {https://doi.org/10.1021/acs.jcim.2c00801},
		}