Merge pull request #97 from OpenDrugDiscovery/watercluster_fix

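Fixed read_raw_entries for waterclusters: each W{i}_geoms_all.zip archive is now extracted into its parent folder before the corresponding .xyz file is read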
valence-labs · Jun 6, 2024 · 25e2a2f · 25e2a2f
2 parents 7b9fcbf + 31c5f4b
commit 25e2a2f
Showing 1 changed file with 9 additions and 7 deletions.
diff --git a/openqdc/datasets/potential/waterclusters3_30.py b/openqdc/datasets/potential/waterclusters3_30.py
@@ -1,3 +1,4 @@
+import zipfile
 from io import StringIO
 from os.path import join as p_join
 
@@ -8,8 +9,6 @@
 from openqdc.methods import PotentialMethod
 from openqdc.utils.constants import ATOM_TABLE, MAX_ATOMIC_NUMBER
 
-# we could use ase.io.read to read extxyz files
-
 
 def content_to_xyz(content, n_waters):
     content = content.strip()
@@ -80,12 +79,15 @@ class WaterClusters(BaseDataset):
 
     def read_raw_entries(self):
         samples = []
+        parent_folder = p_join(self.root, "W3-W30_all_geoms_TTM2.1-F/")
         for i in range(3, 31):
-            raw_path = p_join(self.root, f"W3-W30_all_geoms_TTM2.1-F/W{i}_geoms_all.xyz")
-            data = read_xyz(
-                raw_path,
-                i,
-            )
+            name = f"W{i}_geoms_all"
+            zip_path = p_join(parent_folder, f"{name}.zip")
+            xyz_path = p_join(parent_folder, f"{name}.xyz")
+            with zipfile.ZipFile(zip_path, "r") as zip_ref:
+                zip_ref.extractall(parent_folder)
+
+            data = read_xyz(xyz_path, i)
             samples += data
 
         return samples