From ba1430a96dea47a134cd89231a43406f273f400c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Semih=20Cant=C3=BCrk?=
Date: Fri, 24 Jan 2025 08:47:35 -0500
Subject: [PATCH] add dipeptides dataset

---
 openqdc/datasets/potential/__init__.py   |   2 +
 openqdc/datasets/potential/dipeptides.py | 117 +++++++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 openqdc/datasets/potential/dipeptides.py

diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
index 7beba8c..a4ead5a 100644
--- a/openqdc/datasets/potential/__init__.py
+++ b/openqdc/datasets/potential/__init__.py
@@ -2,6 +2,7 @@
 from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
 from .bpa import BPA
 from .comp6 import COMP6
+from .dipeptides import Dipeptides
 from .dummy import Dummy, PredefinedDataset
 from .gdml import GDML
 from .geom import GEOM
@@ -37,6 +38,7 @@
     "ANI2X": ANI2X,
     "BPA": BPA,
     "COMP6": COMP6,
+    "Dipeptides": Dipeptides,
     "GDML": GDML,
     "GEOM": GEOM,
     "ISO17": ISO17,
diff --git a/openqdc/datasets/potential/dipeptides.py b/openqdc/datasets/potential/dipeptides.py
new file mode 100644
index 0000000..2d8d9eb
--- /dev/null
+++ b/openqdc/datasets/potential/dipeptides.py
@@ -0,0 +1,117 @@
+"""Dipeptides potential dataset read from per-molecule ``.npz`` trajectory files."""
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+
+# NOTE(review): machine-specific absolute path on a personal scratch volume; the
+# dataset cannot be built elsewhere until this is replaced by a portable location.
+folder = "/network/scratch/s/semih.canturk/cache/openqdc/dipeptides/npz_files/mol_73.npz"
+
+# Maps a trajectory label to the ``.npz`` file that holds it.
+trajectories = {
+    "mol_73": folder,
+}
+
+
+def shape_atom_inputs(coords, atom_species):
+    """Flatten per-frame coordinates into one row per atom.
+
+    Args:
+        coords: Array of shape ``(frames, atoms, 3)``.
+        atom_species: Per-atom species values of a single frame; the sequence
+            is repeated once per frame.
+
+    Returns:
+        A ``float32`` array with one row per atom and frame, laid out as
+        ``[species, 0, x, y, z]``.
+    """
+    # The unpacking doubles as a check that ``coords`` is three-dimensional.
+    n_frames, _, _ = coords.shape
+    species = np.tile(atom_species, n_frames)
+    # NOTE(review): the zero-filled second column is presumably the per-atom
+    # charge slot expected by BaseDataset -- confirm.
+    species_and_zeros = np.stack((species, np.zeros_like(species)), axis=-1)
+    return np.concatenate((species_and_zeros, coords.reshape(-1, 3)), axis=-1, dtype=np.float32)
+
+
+def create_path(folder):
+    """Return ``folder`` unchanged together with its last ``/``-separated component."""
+    name = folder.split("/")[-1]
+    return folder, name
+
+
+def read_npz_entry(folder):
+    """Load one ``.npz`` trajectory and convert it into dataset entry arrays.
+
+    Args:
+        folder: Path of the ``.npz`` archive to read (a file path, despite the
+            parameter name). The archive must provide the arrays
+            ``nuclear_charges``, ``coords``, ``energies`` and ``forces``.
+
+    Returns:
+        A dict with keys ``name``, ``subset``, ``energies``, ``forces``,
+        ``atomic_inputs`` and ``n_atoms``; ``name``, ``subset`` and ``n_atoms``
+        hold one value per frame.
+    """
+    path, name = create_path(folder)
+    # ``np.load`` keeps the archive open; the context manager closes it as soon
+    # as the arrays have been read into memory.
+    with np.load(path) as data:
+        nuclear_charges = data["nuclear_charges"]
+        coords = data["coords"]
+        energies = data["energies"]
+        forces = data["forces"]
+
+    frames = coords.shape[0]
+    return dict(
+        name=np.array([name] * frames),
+        subset=np.array(["dipeptides"] * frames),
+        energies=energies[:, None].astype(np.float32),
+        forces=forces.reshape(-1, 3, 1).astype(np.float32),
+        atomic_inputs=shape_atom_inputs(coords, nuclear_charges),
+        n_atoms=np.array([len(nuclear_charges)] * frames, dtype=np.int32),
+    )
+
+
+class Dipeptides(BaseDataset):
+    """Dipeptide trajectories loaded from the ``.npz`` files listed in ``trajectories``.
+
+    Declares a single energy method, ``PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD``,
+    with energies in kj/mol, distances in ang and forces in kj/mol/ang.
+    """
+
+    __name__ = "dipeptides"
+
+    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
+
+    # NOTE(review): the single target name is an empty string -- confirm the intended label.
+    energy_target_names = [
+        "",
+    ]
+
+    __energy_unit__ = "kj/mol"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "kj/mol/ang"
+
+    # NOTE(review): forces are read in ``read_npz_entry`` but the mask is False -- confirm.
+    __force_mask__ = [False]
+
+    @property
+    def data_types(self):
+        """Map each stored array name to its NumPy dtype."""
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
+    def read_raw_entries(self):
+        """Return one entry dict per file listed in ``trajectories``."""
+        return [read_npz_entry(path_to_npz) for path_to_npz in trajectories.values()]
+
+
+# To store the dataset in the cache and load it back (after registering it in __init__):
+# Dipeptides.no_init().preprocess(upload=False, overwrite=True)