diff --git a/openqdc/datasets/potential/waterclusters3_30.py b/openqdc/datasets/potential/waterclusters3_30.py index e473353..f4c7d88 100644 --- a/openqdc/datasets/potential/waterclusters3_30.py +++ b/openqdc/datasets/potential/waterclusters3_30.py @@ -1,3 +1,4 @@ +import zipfile from io import StringIO from os.path import join as p_join @@ -8,8 +9,6 @@ from openqdc.methods import PotentialMethod from openqdc.utils.constants import ATOM_TABLE, MAX_ATOMIC_NUMBER -# we could use ase.io.read to read extxyz files - def content_to_xyz(content, n_waters): content = content.strip() @@ -80,12 +79,15 @@ class WaterClusters(BaseDataset): def read_raw_entries(self): samples = [] + parent_folder = p_join(self.root, "W3-W30_all_geoms_TTM2.1-F/") for i in range(3, 31): - raw_path = p_join(self.root, f"W3-W30_all_geoms_TTM2.1-F/W{i}_geoms_all.xyz") - data = read_xyz( - raw_path, - i, - ) + name = f"W{i}_geoms_all" + zip_path = p_join(parent_folder, f"{name}.zip") + xyz_path = p_join(parent_folder, f"{name}.xyz") + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(parent_folder) + + data = read_xyz(xyz_path, i) samples += data return samples