diff --git a/openqdc/__init__.py b/openqdc/__init__.py index c6be72d..56ba46c 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -21,6 +21,7 @@ def get_project_root(): "ANI1CCX_V2": "openqdc.datasets.potential.ani", "ANI1X": "openqdc.datasets.potential.ani", "ANI2X": "openqdc.datasets.potential.ani", + "BPA": "openqdc.datasets.potential.bpa", "Spice": "openqdc.datasets.potential.spice", "SpiceV2": "openqdc.datasets.potential.spice", "SpiceVL2": "openqdc.datasets.potential.spice", @@ -31,6 +32,7 @@ def get_project_root(): "COMP6": "openqdc.datasets.potential.comp6", "GDML": "openqdc.datasets.potential.gdml", "Molecule3D": "openqdc.datasets.potential.molecule3d", + "MACEOFF": "openqdc.datasets.potential.maceoff", "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali", "SN2RXN": "openqdc.datasets.potential.sn2_rxn", "QM7X": "openqdc.datasets.potential.qm7x", @@ -117,11 +119,13 @@ def __dir__(): # POTENTIAL from .datasets.potential.alchemy import Alchemy from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X + from .datasets.potential.bpa import BPA from .datasets.potential.comp6 import COMP6 from .datasets.potential.dummy import Dummy, PredefinedDataset from .datasets.potential.gdml import GDML from .datasets.potential.geom import GEOM from .datasets.potential.iso_17 import ISO17 + from .datasets.potential.maceoff import MACEOFF from .datasets.potential.md22 import MD22 from .datasets.potential.molecule3d import Molecule3D from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2 diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index 35721dd..7beba8c 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -1,10 +1,12 @@ from .alchemy import Alchemy from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X +from .bpa import BPA from .comp6 import COMP6 from .dummy import Dummy, PredefinedDataset from .gdml import GDML from .geom import 
from typing import Any, Dict, List

import numpy as np
from ase.atoms import Atoms

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod


def read_bpa_record(subset: np.ndarray, atoms: Atoms) -> Dict[str, Any]:
    """Convert one frame of a 3BPA extended-XYZ file into a raw record.

    Args:
        subset: One-element array holding the stem of the source file name.
            The text before its first underscore is stored as the split label.
        atoms: Frame to convert. Its potential energy and forces are read
            through ``get_potential_energy()`` and ``get_forces()``.

    Returns:
        A dictionary with the keys ``name`` (string form of ``atoms.symbols``),
        ``subset`` (the array passed in, unchanged), ``energies`` (one float64
        value), ``forces`` (float32, reshaped to ``(-1, 3, 1)``),
        ``atomic_inputs`` (float32 columns: atomic number, initial charge and
        the three position components), ``n_atoms`` (int32) and ``split``.
    """
    return dict(
        name=np.array([str(atoms.symbols)]),
        subset=subset,
        energies=np.array([atoms.get_potential_energy()], dtype=np.float64),
        forces=atoms.get_forces().reshape(-1, 3, 1).astype(np.float32),
        atomic_inputs=np.column_stack((atoms.numbers, atoms.get_initial_charges(), atoms.positions)).astype(
            np.float32
        ),
        n_atoms=np.array([len(atoms)], dtype=np.int32),
        # File stems look like "<split>_<condition>"; keep only the leading part.
        split=np.array([subset.item().split("_")[0]]),
    )


class BPA(BaseDataset):
    """
    The BPA (or 3BPA) dataset consists of configurations of a flexible druglike
    molecule, 3-(benzyloxy)pyridin-2-amine. The dataset features a complex
    dihedral potential energy surface with many local minima, which can be
    challenging to approximate using classical or ML force fields.
    The configurations were sampled from short (0.5 ps) MD simulations using the
    ANI-1x force field to perturb the geometries toward lower potential energies.
    Furthermore, long 25 ps MD simulations were performed at three different
    temperatures (300, 600, and 1200 K) using the Langevin thermostat and a
    1 fs time step. The final configurations were re-evaluated using ORCA at the
    DFT level of theory using the ωB97X exchange correlation functional and the
    6-31G(d) basis set.

    Usage:
    ```python
    from openqdc.datasets import BPA
    dataset = BPA()
    ```


    References:
        https://pubs.acs.org/doi/10.1021/acs.jctc.1c00647
    """

    __name__ = "BPA"
    __energy_unit__ = "ev"
    __forces_unit__ = "ev/ang"
    __distance_unit__ = "ang"
    __force_mask__ = [True]
    __energy_methods__ = [PotentialMethod.WB97X_6_31G_D]
    __links__ = {"BPA.zip": "https://figshare.com/ndownloader/files/31325990"}

    def read_raw_entries(self) -> List[Dict]:
        """Read every frame of the ``*.xyz`` files under ``<root>/dataset_3BPA``.

        Paths containing ``iso_atoms.xyz`` are skipped. Files are visited in
        sorted order so that the order of the returned records is reproducible.

        Returns:
            One record per frame, as built by ``read_bpa_record``.
        """
        import os.path as osp
        from glob import glob

        from ase.io import iread

        # glob() yields paths in arbitrary, filesystem-dependent order.
        files = sorted(glob(osp.join(self.root, "dataset_3BPA", "*.xyz")))
        files = [f for f in files if "iso_atoms.xyz" not in f]
        all_records = []

        for file in files:
            # The file stem doubles as the subset label of all its frames.
            subset = np.array([osp.basename(file).split(".")[0]])
            all_records.extend(read_bpa_record(subset, atoms) for atoms in iread(file, format="extxyz"))

        return all_records

    def __getitem__(self, idx):
        """Return the base-class item for ``idx`` with its ``split`` label attached."""
        data = super().__getitem__(idx)
        data.split = self._convert_array(self.data["split"][idx])
        return data
import re
from functools import partial
from os.path import join as p_join

import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.methods import PotentialMethod
from openqdc.utils.constants import ATOMIC_NUMBERS
from openqdc.utils.molecule import get_atomic_number_and_charge


def parse_mace_xyz(xyzpath):
    """Lazily parse the frames of a MACE-OFF extended-XYZ file.

    Each frame is an atom-count line, a header line carrying the ``energy=``,
    ``config_type=`` and (optionally) ``smiles=`` keys, and then one line per
    atom with ten whitespace-separated columns: element symbol, position
    (x, y, z), force (fx, fy, fz) and three trailing columns that are ignored.

    Args:
        xyzpath: Path of the file to read.

    Yields:
        ``(energy, numbers, positions, forces, smiles, subset)`` for every
        frame, where ``energy`` is a float, ``numbers`` is a list of atomic
        numbers, ``positions`` and ``forces`` are lists of ``[x, y, z]`` floats
        and ``smiles`` / ``subset`` are strings with double quotes removed
        (``smiles`` is ``""`` when the header has no ``smiles=`` key).

    Raises:
        AttributeError: If a header lacks an energy or a ``config_type`` entry.
        ValueError: If an atom-count line or an atom line is malformed.
    """
    # The header also carries a "MACE_energy=" key (see subset_re below), which
    # the loose pattern matches as well. Prefer a key that starts its own token
    # and only fall back to the loose pattern when no such key exists.
    # NOTE(review): the relative order of "energy=" and "MACE_energy=" in the
    # raw files is not visible from here - confirm against the data.
    standalone_energy_re = re.compile(r"(?<!\S)energy=(\S+)")
    energy_re = re.compile(r"energy=(\S+)")
    smiles_re = re.compile(r"smiles=(\S+)")
    subset_re = re.compile(r"config_type=([^;]+)\ MACE_energy")
    with open(xyzpath, "r") as f:
        n_atoms = None  # None means the next non-blank line is an atom count
        header_seen = False
        for line in f:
            if n_atoms is None:
                if not line.strip():
                    # Tolerate blank lines between frames / at the end of file.
                    continue
                n_atoms = int(line)
                positions = []
                numbers = []
                forces = []
                header_seen = False
                continue
            if not header_seen:
                energy_match = standalone_energy_re.search(line) or energy_re.search(line)
                energy = float(energy_match.group(1))
                subset = subset_re.search(line).group(1).replace('"', "")
                smiles_match = smiles_re.search(line)
                # water and qmugs subsets do not have smiles
                smiles = "" if smiles_match is None else smiles_match.group(1).replace('"', "")
                header_seen = True
                continue
            el, x, y, z, fx, fy, fz, _, _, _ = line.split()
            numbers.append(ATOMIC_NUMBERS[el])
            positions.append([float(x), float(y), float(z)])
            forces.append([float(fx), float(fy), float(fz)])
            if len(numbers) == n_atoms:
                n_atoms = None
                yield energy, numbers, positions, forces, smiles, subset


def build_data_object(data, split):
    """Turn one parsed frame into the raw record dictionary of the dataset.

    Args:
        data: ``(energy, numbers, positions, forces, smiles, subset)`` tuple as
            yielded by ``parse_mace_xyz``.
        split: Split label stored with the record.

    Returns:
        A dictionary with the keys ``name``, ``subset``, ``energies``,
        ``forces``, ``atomic_inputs``, ``n_atoms`` and ``split``.
    """
    energy, numbers, positions, forces, smiles, subset = data
    if smiles == "":
        # No SMILES available: use the parsed atomic numbers and zero charges.
        x = np.concatenate((np.array(numbers)[:, None], np.zeros((len(numbers), 1))), axis=-1)
    else:
        # NOTE(review): presumably returns one (atomic number, charge) row per
        # atom in the same order as the XYZ frame - confirm against the helper.
        x = get_atomic_number_and_charge(dm.to_mol(smiles, remove_hs=False, ordered=True))
    res = dict(
        name=np.array([smiles]),
        subset=np.array([subset]),
        energies=np.array([[energy]], dtype=np.float64),
        forces=np.array(forces, dtype=np.float32).reshape(-1, 3, 1),
        atomic_inputs=np.concatenate((x, np.array(positions)), axis=-1, dtype=np.float32).reshape(-1, 5),
        n_atoms=np.array([x.shape[0]], dtype=np.int32),
        split=np.array([split]),
    )
    return res


class MACEOFF(BaseDataset):
    """
    The core of the MACEOFF dataset consists of the Spice V1 dataset.
    95% of the data are used for training and validation under the "train" split,
    and 5% for testing. The dataset uses the Spice level of theory
    ωB97M-D3(BJ)/def2-TZVPPD as implemented in the PSI4 software.
    MACEOFF uses a subset of SPICE that contains the ten chemical elements
    H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge.
    MACEOFF doesn't contain ion pairs. To facilitate the learning of intramolecular
    non-bonded interactions, the MACEOFF dataset contains larger 50–90 atom molecules
    randomly selected from the QMugs dataset.
    MACEOFF also contains a number of water clusters carved out of molecular dynamics
    simulations of liquid water, with sizes of up to 50 water molecules, and part of the
    COMP6 tripeptide geometry dataset.

    Usage:
    ```python
    from openqdc.datasets import MACEOFF
    dataset = MACEOFF()
    ```

    Species:
        [H, C, N, O, F, P, S, Cl, Br, I]

    References:
        https://arxiv.org/pdf/2312.15211\n
        https://doi.org/10.17863/CAM.107498
    """

    __name__ = "maceoff"

    __energy_methods__ = [PotentialMethod.WB97M_D3BJ_DEF2_TZVPPD]
    __force_mask__ = [True]
    __energy_unit__ = "ev"
    __distance_unit__ = "ang"
    __forces_unit__ = "ev/ang"

    energy_target_names = ["dft_total_energy"]
    force_target_names = ["dft_total_gradient"]

    __links__ = {
        "train_large_neut_no_bad_clean.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",  # noqa: E501
        "test_large_neut_all.tar.gz": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/cb8351dd-f09c-413f-921c-67a702a7f0c5/content",  # noqa: E501
    }

    def read_raw_entries(self):
        """Parse the ``.xyz`` file of every archive in ``__links__`` into raw records.

        The archive name up to its first dot is the stem of the ``.xyz`` file
        looked up under ``self.root``; the part of that stem before the first
        underscore is used as the split label of all its frames.

        Returns:
            The records of all files, in the order of ``__links__``.
        """
        entries = []
        for archive_name in self.__links__:
            stem = archive_name.split(".")[0]
            xyzpath = p_join(self.root, f"{stem}.xyz")
            split = stem.split("_")[0]
            structure_iterator = parse_mace_xyz(xyzpath)
            func = partial(build_data_object, split=split)
            entries.extend(dm.utils.parallelized(func, structure_iterator))
        return entries

    def __getitem__(self, idx):
        """Return the base-class item for ``idx`` with its ``split`` label attached."""
        data = super().__getitem__(idx)
        data.split = self._convert_array(self.data["split"][idx])
        return data