From 4bfb1c1d0ed990b5163b31732d78d449a9697cfe Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Fri, 9 Feb 2024 01:29:12 +0000 Subject: [PATCH 01/28] Updated dummy dataset --- src/openqdc/datasets/dummy.py | 67 ++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index c87e03d..80fcb4f 100644 --- a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -1,15 +1,12 @@ -import numpy as np # noqa -from numpy import array -from sklearn.utils import Bunch +import numpy as np from openqdc.datasets.base import BaseDataset -from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory from openqdc.utils.constants import NOT_DEFINED class Dummy(BaseDataset): """ - Dummy dataset + Dummy dataset for testing. """ __name__ = "dummy" @@ -30,21 +27,26 @@ def _stats(self): return { "formation": { "energy": { - "mean": array([[-12.94348027, -9.83037297]]), - "std": array([[4.39971409, 3.3574188]]), + "mean": np.array([[-12.94348027, -9.83037297]]), + "std": np.array([[4.39971409, 3.3574188]]), }, "forces": NOT_DEFINED, }, "total": { "energy": { - "mean": array([[-89.44242, -1740.5336]]), - "std": array([[29.599571, 791.48663]]), + "mean": np.array([[-89.44242, -1740.5336]]), + "std": np.array([[29.599571, 791.48663]]), }, "forces": NOT_DEFINED, }, } - def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None: + def __init__( + self, + energy_unit=None, + distance_unit=None, + cache_dir=None, + ) -> None: try: super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) @@ -52,10 +54,33 @@ def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None pass self._set_isolated_atom_energies() self.setup_dummy() - + def setup_dummy(self): - self._n_atoms = np.array([np.random.randint(1, 100) for _ in range(self.__len__())]) - self.__average_nb_atoms__ = self._n_atoms.mean() + n_atoms 
= np.array([np.random.randint(1, 100) for _ in range(len(self))]) + position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2) + atomic_inputs = np.concatenate([np.concatenate([ + # z, c, x, y, z + np.random.randint(1, 100, size=(size, 1)), + np.random.randint(-1, 2, size=(size, 1)), + np.random.randn(size, 3) + ], axis=1) for size in n_atoms], axis=0) # (sum(n_atoms), 5) + name=[f'dummy_{i}' for i in range(len(self))] + subset=["dummy" for i in range(len(self))] + energies = np.random.rand(len(self), len(self.__energy_methods__)) + forces = np.concatenate([ + np.random.randn(size, 3, len(self.__force_methods__)) * 100 + for size in n_atoms + ]) + self.data = dict( + n_atoms=n_atoms, + position_idx_range=position_idx_range, + name=name, + atomic_inputs=atomic_inputs, + subset=subset, + energies=energies, + forces=forces, + ) + self.__average_nb_atoms__ = self.data["n_atoms"].mean() def is_preprocessed(self): return True @@ -65,19 +90,3 @@ def read_raw_entries(self): def __len__(self): return 9999 - - def __getitem__(self, idx: int): - shift = IsolatedAtomEnergyFactory.max_charge - size = self._n_atoms[idx] - z = np.random.randint(1, 100, size) - c = np.random.randint(-1, 2, size) - return Bunch( - positions=np.random.rand(size, 3) * 10, - atomic_numbers=z, - charges=c, - e0=self.__isolated_atom_energies__[..., z, c + shift].T, - energies=np.random.randn(len(self.__energy_methods__)), - name="dummy_{}".format(idx), - subset="dummy", - forces=(np.random.randn(size, 3, len(self.__force_methods__)) * 100), - ) From 862dc608657fca3c82beba0aa9b1254562c5aed9 Mon Sep 17 00:00:00 2001 From: Nikhil Shenoy Date: Fri, 9 Feb 2024 01:29:53 +0000 Subject: [PATCH 02/28] ran pre-commit on dummy --- src/openqdc/datasets/dummy.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index 80fcb4f..2f421a8 100644 --- 
a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -54,23 +54,29 @@ def __init__( pass self._set_isolated_atom_energies() self.setup_dummy() - + def setup_dummy(self): n_atoms = np.array([np.random.randint(1, 100) for _ in range(len(self))]) position_idx_range = np.concatenate([[0], np.cumsum(n_atoms)]).repeat(2)[1:-1].reshape(-1, 2) - atomic_inputs = np.concatenate([np.concatenate([ - # z, c, x, y, z - np.random.randint(1, 100, size=(size, 1)), - np.random.randint(-1, 2, size=(size, 1)), - np.random.randn(size, 3) - ], axis=1) for size in n_atoms], axis=0) # (sum(n_atoms), 5) - name=[f'dummy_{i}' for i in range(len(self))] - subset=["dummy" for i in range(len(self))] + atomic_inputs = np.concatenate( + [ + np.concatenate( + [ + # z, c, x, y, z + np.random.randint(1, 100, size=(size, 1)), + np.random.randint(-1, 2, size=(size, 1)), + np.random.randn(size, 3), + ], + axis=1, + ) + for size in n_atoms + ], + axis=0, + ) # (sum(n_atoms), 5) + name = [f"dummy_{i}" for i in range(len(self))] + subset = ["dummy" for i in range(len(self))] energies = np.random.rand(len(self), len(self.__energy_methods__)) - forces = np.concatenate([ - np.random.randn(size, 3, len(self.__force_methods__)) * 100 - for size in n_atoms - ]) + forces = np.concatenate([np.random.randn(size, 3, len(self.__force_methods__)) * 100 for size in n_atoms]) self.data = dict( n_atoms=n_atoms, position_idx_range=position_idx_range, From f8a6f0fc9833e2ed38375231323a453db08c1519 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 07:15:55 -0700 Subject: [PATCH 03/28] Download and upload progress. 
No init method --- src/openqdc/datasets/base.py | 114 +++++++++++++--------- src/openqdc/utils/atomization_energies.py | 1 - src/openqdc/utils/io.py | 46 +++++++-- src/openqdc/utils/package_utils.py | 2 - 4 files changed, 107 insertions(+), 56 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index adf9ae6..f0c1ab5 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -82,6 +82,10 @@ def read_qc_archive_h5( class BaseDataset: + """ + Base class for datasets in the openQDC package. + """ + __energy_methods__ = [] __force_methods__ = [] energy_target_names = [] @@ -124,6 +128,14 @@ def _post_init( self._set_units(energy_unit, distance_unit) self._convert_data() self._set_isolated_atom_energies() + + @classmethod + def no_init(cls): + """ + Class method to avoid the __init__ method to be called when the class is instanciated. + Useful for debugging purposes or preprocessing data. + """ + return cls.__new__(cls) def _convert_data(self): logger.info( @@ -366,9 +378,9 @@ def _convert_on_loading(self, x, key): return x def read_preprocess(self, overwrite_local_cache=False): - logger.info("Reading preprocessed data") + logger.info("Reading preprocessed data.") logger.info( - f"{self.__name__} data with the following units:\n\ + f"Dataset {self.__name__} with the following units:\n\ Energy: {self.energy_unit},\n\ Distance: {self.distance_unit},\n\ Forces: {self.force_unit if self.__force_methods__ else 'None'}" @@ -380,7 +392,6 @@ def read_preprocess(self, overwrite_local_cache=False): self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) filename = p_join(self.preprocess_path, "props.pkl") - pull_locally(filename, overwrite=overwrite_local_cache) with open(filename, "rb") as f: tmp = pkl.load(f) for key in ["name", "subset", "n_atoms"]: @@ -519,49 +530,19 @@ def wrapper(idx): if return_idxs: datum["idxs"] = idxs return datum - - def __len__(self): - return 
self.data["energies"].shape[0] - - def __smiles_converter__(self, x): - """util function to convert string to smiles: useful if the smiles is - encoded in a different format than its display format + + def as_iter(self, atoms: bool = False): """ - return x - - def __getitem__(self, idx: int): - shift = IsolatedAtomEnergyFactory.max_charge - p_start, p_end = self.data["position_idx_range"][idx] - input = self.data["atomic_inputs"][p_start:p_end] - z, c, positions, energies = ( - np.array(input[:, 0], dtype=np.int32), - np.array(input[:, 1], dtype=np.int32), - np.array(input[:, -3:], dtype=np.float32), - np.array(self.data["energies"][idx], dtype=np.float32), - ) - name = self.__smiles_converter__(self.data["name"][idx]) - subset = self.data["subset"][idx] - - if "forces" in self.data: - forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) - else: - forces = None - return Bunch( - positions=positions, - atomic_numbers=z, - charges=c, - e0=self.__isolated_atom_energies__[..., z, c + shift].T, - energies=energies, - name=name, - subset=subset, - forces=forces, - ) - - def __str__(self): - return f"{self.__name__}" - - def __repr__(self): - return f"{self.__name__}" + Return the dataset as an iterator. 
+ + Parameters + ---------- + atoms : bool, optional + Whether to return the items as ASE atoms object, by default False + """ + func = self.get_ase_atoms if atoms else self.__getitem__ + for i in range(len(self)): + yield func(i) @property def _stats(self): @@ -620,3 +601,46 @@ def get_statistics(self, normalization: str = "formation", return_none: bool = T for key2 in selected_stats[key]: selected_stats[key][key2] = self.convert_energy(selected_stats[key][key2]) return selected_stats + + def __str__(self): + return f"{self.__name__}" + + def __repr__(self): + return f"{self.__name__}" + + def __len__(self): + return self.data["energies"].shape[0] + + def __smiles_converter__(self, x): + """util function to convert string to smiles: useful if the smiles is + encoded in a different format than its display format + """ + return x + + def __getitem__(self, idx: int): + shift = IsolatedAtomEnergyFactory.max_charge + p_start, p_end = self.data["position_idx_range"][idx] + input = self.data["atomic_inputs"][p_start:p_end] + z, c, positions, energies = ( + np.array(input[:, 0], dtype=np.int32), + np.array(input[:, 1], dtype=np.int32), + np.array(input[:, -3:], dtype=np.float32), + np.array(self.data["energies"][idx], dtype=np.float32), + ) + name = self.__smiles_converter__(self.data["name"][idx]) + subset = self.data["subset"][idx] + + if "forces" in self.data: + forces = np.array(self.data["forces"][p_start:p_end], dtype=np.float32) + else: + forces = None + return Bunch( + positions=positions, + atomic_numbers=z, + charges=c, + e0=self.__isolated_atom_energies__[..., z, c + shift].T, + energies=energies, + name=name, + subset=subset, + forces=forces, + ) \ No newline at end of file diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 746ff66..7325de3 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -12,7 +12,6 @@ EF_KEY: TypeAlias = Tuple[str, int] -# didn t 
calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS atomic_numbers = {} chemical_symbols = np.array( [ diff --git a/src/openqdc/utils/io.py b/src/openqdc/utils/io.py index 2503031..c6e3280 100644 --- a/src/openqdc/utils/io.py +++ b/src/openqdc/utils/io.py @@ -9,9 +9,13 @@ from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem from rdkit.Chem import MolFromXYZFile - -gcp_filesys = fsspec.filesystem("gs") -gcp_filesys_public = fsspec.filesystem("https") +import fsspec +from fsspec.callbacks import TqdmCallback +from tqdm import tqdm +from loguru import logger + +gcp_filesys = fsspec.filesystem("gs") # entry point for google bucket (need gsutil permission) +gcp_filesys_public = fsspec.filesystem("https") # public API for download local_filesys = LocalFileSystem() _OPENQDC_CACHE_DIR = ( @@ -45,28 +49,51 @@ def get_local_cache() -> str: def get_remote_cache(write_access=False) -> str: + """ + Returns the entry point based on the write access. + """ if write_access: remote_cache = "gs://qmdata-public/openqdc" else: remote_cache = "https://storage.googleapis.com/qmdata-public/openqdc" return remote_cache - def push_remote(local_path, overwrite=True): + """ + Attempt to push file to remote gs path + """ remote_path = local_path.replace(get_local_cache(), get_remote_cache(write_access=overwrite)) gcp_filesys.mkdirs(os.path.dirname(remote_path), exist_ok=False) - print(f"Pushing {local_path} file to {remote_path}, ({gcp_filesys.exists(os.path.dirname(remote_path))})") + #print(f"Pushing {local_path} file to {remote_path}, ({gcp_filesys.exists(os.path.dirname(remote_path))})") if not gcp_filesys.exists(remote_path) or overwrite: - gcp_filesys.put_file(local_path, remote_path) + gcp_filesys.put_file(local_path, remote_path, + callback=TqdmCallback( + tqdm_kwargs={ + "ascii": " ▖▘▝▗▚▞-", + "desc" : f"Uploading {os.path.basename(remote_path)}", + "unit" : "B", + }, + ) + ) return remote_path def pull_locally(local_path, overwrite=False): + """ + 
Retrieve file from remote gs path or local cache + """ remote_path = local_path.replace(get_local_cache(), get_remote_cache()) os.makedirs(os.path.dirname(local_path), exist_ok=True) if not os.path.exists(local_path) or overwrite: - # print(f"Pulling {remote_path} file to {local_path}") - gcp_filesys_public.get_file(remote_path, local_path) + gcp_filesys_public.get_file(remote_path, local_path, + callback=TqdmCallback( + tqdm_kwargs={ + "ascii": " ▖▘▝▗▚▞-", + "desc" : f"Downloading {os.path.basename(remote_path)}", + "unit" : "B", + }, + ) + ) return local_path @@ -161,6 +188,9 @@ def load_json(path): def load_xyz(path): + """ + Load XYZ file using RDKit + """ return MolFromXYZFile(path) diff --git a/src/openqdc/utils/package_utils.py b/src/openqdc/utils/package_utils.py index c7b8aac..c64653f 100644 --- a/src/openqdc/utils/package_utils.py +++ b/src/openqdc/utils/package_utils.py @@ -116,7 +116,6 @@ def get_dir(): if _hub_dir is not None: return _hub_dir - # return os.path.join(_get_torch_home(), 'hub') def set_dir(d): @@ -127,4 +126,3 @@ def set_dir(d): d (str): path to a local folder to save downloaded models & weights. 
""" global _hub_dir - # _hub_dir = os.path.expanduser(d) From c15960392cee6c4953408e335ca90afcbbb55936 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 07:41:55 -0700 Subject: [PATCH 04/28] Basic versioning, refactor src->openqdc, updated .toml --- {src/openqdc => openqdc}/__init__.py | 3 ++ openqdc/_version.py | 13 ++++++++ {src/openqdc => openqdc}/datasets/__init__.py | 0 {src/openqdc => openqdc}/datasets/ani.py | 0 {src/openqdc => openqdc}/datasets/base.py | 0 {src/openqdc => openqdc}/datasets/comp6.py | 0 {src/openqdc => openqdc}/datasets/dess.py | 0 {src/openqdc => openqdc}/datasets/dummy.py | 0 {src/openqdc => openqdc}/datasets/gdml.py | 0 {src/openqdc => openqdc}/datasets/geom.py | 0 {src/openqdc => openqdc}/datasets/iso_17.py | 0 .../datasets/molecule3d.py | 0 {src/openqdc => openqdc}/datasets/nabladft.py | 0 .../datasets/orbnet_denali.py | 0 {src/openqdc => openqdc}/datasets/pcqm.py | 0 {src/openqdc => openqdc}/datasets/qm7x.py | 0 {src/openqdc => openqdc}/datasets/qmugs.py | 0 {src/openqdc => openqdc}/datasets/sn2_rxn.py | 0 .../datasets/solvated_peptides.py | 0 {src/openqdc => openqdc}/datasets/spice.py | 0 {src/openqdc => openqdc}/datasets/tmqm.py | 0 .../datasets/transition1x.py | 0 .../datasets/waterclusters3_30.py | 0 .../raws/config_factory.py | 0 {src/openqdc => openqdc}/raws/fetch.py | 0 {src/openqdc => openqdc}/raws/pubchemqc.py | 0 {src/openqdc => openqdc}/utils/__init__.py | 0 .../utils/atomization_energies.py | 0 {src/openqdc => openqdc}/utils/constants.py | 0 {src/openqdc => openqdc}/utils/exceptions.py | 0 {src/openqdc => openqdc}/utils/io.py | 0 {src/openqdc => openqdc}/utils/molecule.py | 0 .../utils/package_utils.py | 0 {src/openqdc => openqdc}/utils/preprocess.py | 0 {src/openqdc => openqdc}/utils/units.py | 0 pyproject.toml | 31 +++++++++++++++++-- 36 files changed, 45 insertions(+), 2 deletions(-) rename {src/openqdc => openqdc}/__init__.py (95%) create mode 100644 openqdc/_version.py rename {src/openqdc => 
openqdc}/datasets/__init__.py (100%) rename {src/openqdc => openqdc}/datasets/ani.py (100%) rename {src/openqdc => openqdc}/datasets/base.py (100%) rename {src/openqdc => openqdc}/datasets/comp6.py (100%) rename {src/openqdc => openqdc}/datasets/dess.py (100%) rename {src/openqdc => openqdc}/datasets/dummy.py (100%) rename {src/openqdc => openqdc}/datasets/gdml.py (100%) rename {src/openqdc => openqdc}/datasets/geom.py (100%) rename {src/openqdc => openqdc}/datasets/iso_17.py (100%) rename {src/openqdc => openqdc}/datasets/molecule3d.py (100%) rename {src/openqdc => openqdc}/datasets/nabladft.py (100%) rename {src/openqdc => openqdc}/datasets/orbnet_denali.py (100%) rename {src/openqdc => openqdc}/datasets/pcqm.py (100%) rename {src/openqdc => openqdc}/datasets/qm7x.py (100%) rename {src/openqdc => openqdc}/datasets/qmugs.py (100%) rename {src/openqdc => openqdc}/datasets/sn2_rxn.py (100%) rename {src/openqdc => openqdc}/datasets/solvated_peptides.py (100%) rename {src/openqdc => openqdc}/datasets/spice.py (100%) rename {src/openqdc => openqdc}/datasets/tmqm.py (100%) rename {src/openqdc => openqdc}/datasets/transition1x.py (100%) rename {src/openqdc => openqdc}/datasets/waterclusters3_30.py (100%) rename {src/openqdc => openqdc}/raws/config_factory.py (100%) rename {src/openqdc => openqdc}/raws/fetch.py (100%) rename {src/openqdc => openqdc}/raws/pubchemqc.py (100%) rename {src/openqdc => openqdc}/utils/__init__.py (100%) rename {src/openqdc => openqdc}/utils/atomization_energies.py (100%) rename {src/openqdc => openqdc}/utils/constants.py (100%) rename {src/openqdc => openqdc}/utils/exceptions.py (100%) rename {src/openqdc => openqdc}/utils/io.py (100%) rename {src/openqdc => openqdc}/utils/molecule.py (100%) rename {src/openqdc => openqdc}/utils/package_utils.py (100%) rename {src/openqdc => openqdc}/utils/preprocess.py (100%) rename {src/openqdc => openqdc}/utils/units.py (100%) diff --git a/src/openqdc/__init__.py b/openqdc/__init__.py similarity index 95% 
rename from src/openqdc/__init__.py rename to openqdc/__init__.py index 87fc457..040fa06 100644 --- a/src/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -8,6 +8,7 @@ # Dictionary of objects to lazily import; maps the object's name to its module path _lazy_imports_obj = { + "__version__": "openqdc._version", "ANI1": "openqdc.datasets.ani", "ANI1CCX": "openqdc.datasets.ani", "ANI1X": "openqdc.datasets.ani", @@ -62,3 +63,5 @@ def __dir__(): # These types are imported lazily at runtime, but we need to tell type # checkers what they are. from .datasets import * + from ._version import __version__ + from .utils import * diff --git a/openqdc/_version.py b/openqdc/_version.py new file mode 100644 index 0000000..0faccd4 --- /dev/null +++ b/openqdc/_version.py @@ -0,0 +1,13 @@ +try: + from importlib.metadata import version + from importlib.metadata import PackageNotFoundError +except ModuleNotFoundError: + # Try backported to PY<38 `importlib_metadata`. + from importlib_metadata import version + from importlib_metadata import PackageNotFoundError + +try: + __version__ = version("openqdc") +except PackageNotFoundError: + # package is not installed + __version__ = "dev" \ No newline at end of file diff --git a/src/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py similarity index 100% rename from src/openqdc/datasets/__init__.py rename to openqdc/datasets/__init__.py diff --git a/src/openqdc/datasets/ani.py b/openqdc/datasets/ani.py similarity index 100% rename from src/openqdc/datasets/ani.py rename to openqdc/datasets/ani.py diff --git a/src/openqdc/datasets/base.py b/openqdc/datasets/base.py similarity index 100% rename from src/openqdc/datasets/base.py rename to openqdc/datasets/base.py diff --git a/src/openqdc/datasets/comp6.py b/openqdc/datasets/comp6.py similarity index 100% rename from src/openqdc/datasets/comp6.py rename to openqdc/datasets/comp6.py diff --git a/src/openqdc/datasets/dess.py b/openqdc/datasets/dess.py similarity index 100% rename from 
src/openqdc/datasets/dess.py rename to openqdc/datasets/dess.py diff --git a/src/openqdc/datasets/dummy.py b/openqdc/datasets/dummy.py similarity index 100% rename from src/openqdc/datasets/dummy.py rename to openqdc/datasets/dummy.py diff --git a/src/openqdc/datasets/gdml.py b/openqdc/datasets/gdml.py similarity index 100% rename from src/openqdc/datasets/gdml.py rename to openqdc/datasets/gdml.py diff --git a/src/openqdc/datasets/geom.py b/openqdc/datasets/geom.py similarity index 100% rename from src/openqdc/datasets/geom.py rename to openqdc/datasets/geom.py diff --git a/src/openqdc/datasets/iso_17.py b/openqdc/datasets/iso_17.py similarity index 100% rename from src/openqdc/datasets/iso_17.py rename to openqdc/datasets/iso_17.py diff --git a/src/openqdc/datasets/molecule3d.py b/openqdc/datasets/molecule3d.py similarity index 100% rename from src/openqdc/datasets/molecule3d.py rename to openqdc/datasets/molecule3d.py diff --git a/src/openqdc/datasets/nabladft.py b/openqdc/datasets/nabladft.py similarity index 100% rename from src/openqdc/datasets/nabladft.py rename to openqdc/datasets/nabladft.py diff --git a/src/openqdc/datasets/orbnet_denali.py b/openqdc/datasets/orbnet_denali.py similarity index 100% rename from src/openqdc/datasets/orbnet_denali.py rename to openqdc/datasets/orbnet_denali.py diff --git a/src/openqdc/datasets/pcqm.py b/openqdc/datasets/pcqm.py similarity index 100% rename from src/openqdc/datasets/pcqm.py rename to openqdc/datasets/pcqm.py diff --git a/src/openqdc/datasets/qm7x.py b/openqdc/datasets/qm7x.py similarity index 100% rename from src/openqdc/datasets/qm7x.py rename to openqdc/datasets/qm7x.py diff --git a/src/openqdc/datasets/qmugs.py b/openqdc/datasets/qmugs.py similarity index 100% rename from src/openqdc/datasets/qmugs.py rename to openqdc/datasets/qmugs.py diff --git a/src/openqdc/datasets/sn2_rxn.py b/openqdc/datasets/sn2_rxn.py similarity index 100% rename from src/openqdc/datasets/sn2_rxn.py rename to 
openqdc/datasets/sn2_rxn.py diff --git a/src/openqdc/datasets/solvated_peptides.py b/openqdc/datasets/solvated_peptides.py similarity index 100% rename from src/openqdc/datasets/solvated_peptides.py rename to openqdc/datasets/solvated_peptides.py diff --git a/src/openqdc/datasets/spice.py b/openqdc/datasets/spice.py similarity index 100% rename from src/openqdc/datasets/spice.py rename to openqdc/datasets/spice.py diff --git a/src/openqdc/datasets/tmqm.py b/openqdc/datasets/tmqm.py similarity index 100% rename from src/openqdc/datasets/tmqm.py rename to openqdc/datasets/tmqm.py diff --git a/src/openqdc/datasets/transition1x.py b/openqdc/datasets/transition1x.py similarity index 100% rename from src/openqdc/datasets/transition1x.py rename to openqdc/datasets/transition1x.py diff --git a/src/openqdc/datasets/waterclusters3_30.py b/openqdc/datasets/waterclusters3_30.py similarity index 100% rename from src/openqdc/datasets/waterclusters3_30.py rename to openqdc/datasets/waterclusters3_30.py diff --git a/src/openqdc/raws/config_factory.py b/openqdc/raws/config_factory.py similarity index 100% rename from src/openqdc/raws/config_factory.py rename to openqdc/raws/config_factory.py diff --git a/src/openqdc/raws/fetch.py b/openqdc/raws/fetch.py similarity index 100% rename from src/openqdc/raws/fetch.py rename to openqdc/raws/fetch.py diff --git a/src/openqdc/raws/pubchemqc.py b/openqdc/raws/pubchemqc.py similarity index 100% rename from src/openqdc/raws/pubchemqc.py rename to openqdc/raws/pubchemqc.py diff --git a/src/openqdc/utils/__init__.py b/openqdc/utils/__init__.py similarity index 100% rename from src/openqdc/utils/__init__.py rename to openqdc/utils/__init__.py diff --git a/src/openqdc/utils/atomization_energies.py b/openqdc/utils/atomization_energies.py similarity index 100% rename from src/openqdc/utils/atomization_energies.py rename to openqdc/utils/atomization_energies.py diff --git a/src/openqdc/utils/constants.py b/openqdc/utils/constants.py similarity index 
100% rename from src/openqdc/utils/constants.py rename to openqdc/utils/constants.py diff --git a/src/openqdc/utils/exceptions.py b/openqdc/utils/exceptions.py similarity index 100% rename from src/openqdc/utils/exceptions.py rename to openqdc/utils/exceptions.py diff --git a/src/openqdc/utils/io.py b/openqdc/utils/io.py similarity index 100% rename from src/openqdc/utils/io.py rename to openqdc/utils/io.py diff --git a/src/openqdc/utils/molecule.py b/openqdc/utils/molecule.py similarity index 100% rename from src/openqdc/utils/molecule.py rename to openqdc/utils/molecule.py diff --git a/src/openqdc/utils/package_utils.py b/openqdc/utils/package_utils.py similarity index 100% rename from src/openqdc/utils/package_utils.py rename to openqdc/utils/package_utils.py diff --git a/src/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py similarity index 100% rename from src/openqdc/utils/preprocess.py rename to openqdc/utils/preprocess.py diff --git a/src/openqdc/utils/units.py b/openqdc/utils/units.py similarity index 100% rename from src/openqdc/utils/units.py rename to openqdc/utils/units.py diff --git a/pyproject.toml b/pyproject.toml index e589cf3..d8c2135 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,34 @@ name = "openqdc" dynamic = ["version"] description = "ML ready Quantum Mechanical datasets" authors = [{ name = "Nikhil Shenoy", email = "nikhilshenoy98@gmail.com" }, - { name = "Prudencio Tossou", email = "tossouprudencio@gmail.com" }] + { name = "Prudencio Tossou", email = "tossouprudencio@gmail.com" }, + { name = "Cristian Gabellini", email = "cris.gabellini@gmail.com" }] +readme = "README.md" +requires-python = ">=3.8" +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Healthcare Industry", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Topic :: 
Scientific/Engineering :: Information Analysis", + "Topic :: Scientific/Engineering :: Medical Science Apps.", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", +] + +[project.urls] +Website = "https://github.com/OpenDrugDiscovery" +"Source Code" = "https://github.com/OpenDrugDiscovery/openQDC" +"Bug Tracker" = "https://github.com/OpenDrugDiscovery/openQDC/issues" +Documentation = "https://github.com/OpenDrugDiscovery/openQDC" [tool.setuptools] include-package-data = true @@ -19,7 +46,7 @@ fallback_version = "dev" profile = "black" [tool.setuptools.packages.find] -where = ["src"] +where = ["."] include = ["openqdc", "openqdc.*"] exclude = [] namespaces = true From 77acba2e8ea532b12b3af044dc7303e4a8a6fc52 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 07:53:05 -0700 Subject: [PATCH 05/28] __init__.py clean --- openqdc/__init__.py | 70 ++++++++++++------- openqdc/datasets/__init__.py | 127 ++++++----------------------------- 2 files changed, 65 insertions(+), 132 deletions(-) diff --git a/openqdc/__init__.py b/openqdc/__init__.py index 040fa06..7fd3450 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -9,31 +9,33 @@ _lazy_imports_obj = { "__version__": "openqdc._version", - "ANI1": "openqdc.datasets.ani", - "ANI1CCX": "openqdc.datasets.ani", - "ANI1X": "openqdc.datasets.ani", - "Spice": "openqdc.datasets.spice", - "GEOM": "openqdc.datasets.geom", - "QMugs": "openqdc.datasets.qmugs", - "ISO17": "openqdc.datasets.iso_17", - "COMP6": "openqdc.datasets.comp6", - "GDML": "openqdc.datasets.gdml", - "Molecule3D": "openqdc.datasets.molecule3d", - "OrbnetDenali": "openqdc.datasets.orbnet_denali", - "SN2RXN": "openqdc.datasets.sn2_rxn", - "QM7X": "openqdc.datasets.qm7x", - "DESS": 
"openqdc.datasets.dess", - "NablaDFT": "openqdc.datasets.nabladft", - "SolvatedPeptides": "openqdc.datasets.solvated_peptides", - "WaterClusters": "openqdc.datasets.waterclusters3_30", - "TMQM": "openqdc.datasets.tmqm", - "Dummy": "openqdc.datasets.dummy", - "PCQM_B3LYP": "openqdc.datasets.pcqm", - "PCQM_PM6": "openqdc.datasets.pcqm", - "Transition1X": "openqdc.datasets.transition1x", + "BaseDataset" : "openqdc.datasets", + "ANI1": "openqdc.datasets", + "ANI1CCX": "openqdc.datasets", + "ANI1X": "openqdc.datasets", + "Spice": "openqdc.datasets", + "GEOM": "openqdc.datasets", + "QMugs": "openqdc.datasets", + "ISO17": "openqdc.datasets", + "COMP6": "openqdc.datasets", + "GDML": "openqdc.datasets", + "Molecule3D": "openqdc.datasets", + "OrbnetDenali": "openqdc.datasets", + "SN2RXN": "openqdc.datasets", + "QM7X": "openqdc.datasets", + "DESS": "openqdc.datasets", + "NablaDFT": "openqdc.datasets", + "SolvatedPeptides": "openqdc.datasets", + "WaterClusters": "openqdc.datasets", + "TMQM": "openqdc.datasets", + "Dummy": "openqdc.datasets", + "PCQM_B3LYP": "openqdc.datasets", + "PCQM_PM6": "openqdc.datasets", + "Transition1X": "openqdc.datasets", } -_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"} +_lazy_imports_mod = {"datasets": "openqdc.datasets", + "utils": "openqdc.utils"} def __getattr__(name): @@ -62,6 +64,24 @@ def __dir__(): if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": # These types are imported lazily at runtime, but we need to tell type # checkers what they are. 
- from .datasets import * from ._version import __version__ - from .utils import * + from .datasets.ani import ANI1, ANI1CCX, ANI1X # noqa + from .datasets.comp6 import COMP6 # noqa + from .datasets.dess import DESS # noqa + from .datasets.dummy import Dummy # noqa + from .datasets.gdml import GDML # noqa + from .datasets.geom import GEOM # noqa + from .datasets.iso_17 import ISO17 # noqa + from .datasets.molecule3d import Molecule3D # noqa + from .datasets.nabladft import NablaDFT # noqa + from .datasets.orbnet_denali import OrbnetDenali # noqa + from .datasets.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa + from .datasets.qm7x import QM7X # noqa + from .datasets.qmugs import QMugs # noqa + from .datasets.sn2_rxn import SN2RXN # noqa + from .datasets.solvated_peptides import SolvatedPeptides # noqa + from .datasets.spice import Spice # noqa + from .datasets.tmqm import TMQM # noqa + from .datasets.transition1x import Transition1X # noqa + from .datasets.waterclusters3_30 import WaterClusters # noqa + from .datasets.base import BaseDataset # noqa diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py index d989935..36e5cd0 100644 --- a/openqdc/datasets/__init__.py +++ b/openqdc/datasets/__init__.py @@ -1,107 +1,20 @@ -import importlib -import os -from typing import TYPE_CHECKING # noqa F401 - -# The below lazy import logic is coming from openff-toolkit: -# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 - -# Dictionary of objects to lazily import; maps the object's name to its module path - -_lazy_imports_obj = { - "ANI1": "openqdc.datasets.ani", - "ANI1CCX": "openqdc.datasets.ani", - "ANI1X": "openqdc.datasets.ani", - "Spice": "openqdc.datasets.spice", - "GEOM": "openqdc.datasets.geom", - "QMugs": "openqdc.datasets.qmugs", - "ISO17": "openqdc.datasets.iso_17", - "COMP6": "openqdc.datasets.comp6", - "GDML": "openqdc.datasets.gdml", - "Molecule3D": 
"openqdc.datasets.molecule3d", - "OrbnetDenali": "openqdc.datasets.orbnet_denali", - "SN2RXN": "openqdc.datasets.sn2_rxn", - "QM7X": "openqdc.datasets.qm7x", - "DESS": "openqdc.datasets.dess", - "NablaDFT": "openqdc.datasets.nabladft", - "SolvatedPeptides": "openqdc.datasets.solvated_peptides", - "WaterClusters": "openqdc.datasets.waterclusters3_30", - "TMQM": "openqdc.datasets.tmqm", - "Dummy": "openqdc.datasets.dummy", - "PCQM_B3LYP": "openqdc.datasets.pcqm", - "PCQM_PM6": "openqdc.datasets.pcqm", - "Transition1X": "openqdc.datasets.transition1x", -} - -_lazy_imports_mod = {} - - -def __getattr__(name): - """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod - - Note that this method is only called by Python if the name cannot be found - in the current module.""" - obj_mod = _lazy_imports_obj.get(name) - if obj_mod is not None: - mod = importlib.import_module(obj_mod) - return mod.__dict__[name] - - lazy_mod = _lazy_imports_mod.get(name) - if lazy_mod is not None: - return importlib.import_module(lazy_mod) - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - -def __dir__(): - """Add _lazy_imports_obj and _lazy_imports_mod to dir()""" - keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys()) - return sorted(keys) - - -if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": - # These types are imported lazily at runtime, but we need to tell type - # checkers what they are. 
- from .ani import ANI1, ANI1CCX, ANI1X # noqa - from .comp6 import COMP6 # noqa - from .dess import DESS # noqa - from .dummy import Dummy # noqa - from .gdml import GDML # noqa - from .geom import GEOM # noqa - from .iso_17 import ISO17 # noqa - from .molecule3d import Molecule3D # noqa - from .nabladft import NablaDFT # noqa - from .orbnet_denali import OrbnetDenali # noqa - from .pcqm import PCQM_B3LYP, PCQM_PM6 # noqa - from .qm7x import QM7X # noqa - from .qmugs import QMugs # noqa - from .sn2_rxn import SN2RXN # noqa - from .solvated_peptides import SolvatedPeptides # noqa - from .spice import Spice # noqa - from .tmqm import TMQM # noqa - from .transition1x import Transition1X # noqa - from .waterclusters3_30 import WaterClusters # noqa - - __all__ = [ - "ANI1", - "ANI1X", - "ANI1CCX", - "Spice", - "GEOM", - "QMugs", - "ISO17", - "COMP6", - "GDML", - "Molecule3D", - "OrbnetDenali", - "SN2RXN", - "QM7X", - "DESS", - "NablaDFT", - "SolvatedPeptides", - "WaterClusters", - "TMQM", - "PCQM_B3LYP", - "PCQM_PM6", - "Transition1X", - "Dummy", - ] +from .ani import ANI1, ANI1CCX, ANI1X # noqa +from .comp6 import COMP6 # noqa +from .dess import DESS # noqa +from .dummy import Dummy # noqa +from .gdml import GDML # noqa +from .geom import GEOM # noqa +from .iso_17 import ISO17 # noqa +from .molecule3d import Molecule3D # noqa +from .nabladft import NablaDFT # noqa +from .orbnet_denali import OrbnetDenali # noqa +from .pcqm import PCQM_B3LYP, PCQM_PM6 # noqa +from .qm7x import QM7X # noqa +from .qmugs import QMugs # noqa +from .sn2_rxn import SN2RXN # noqa +from .solvated_peptides import SolvatedPeptides # noqa +from .spice import Spice # noqa +from .tmqm import TMQM # noqa +from .transition1x import Transition1X # noqa +from .waterclusters3_30 import WaterClusters # noqa +from .base import BaseDataset # noqa \ No newline at end of file From 8b54153d138aa86cd168c60f440317d6588014d5 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 08:33:48 -0700 Subject: 
[PATCH 06/28] black + isort --- openqdc/__init__.py | 9 ++++----- openqdc/_version.py | 8 +++----- openqdc/datasets/__init__.py | 2 +- openqdc/datasets/base.py | 12 ++++++------ openqdc/utils/io.py | 38 +++++++++++++++++++----------------- 5 files changed, 34 insertions(+), 35 deletions(-) diff --git a/openqdc/__init__.py b/openqdc/__init__.py index 7fd3450..e4d9ac3 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -9,7 +9,7 @@ _lazy_imports_obj = { "__version__": "openqdc._version", - "BaseDataset" : "openqdc.datasets", + "BaseDataset": "openqdc.datasets", "ANI1": "openqdc.datasets", "ANI1CCX": "openqdc.datasets", "ANI1X": "openqdc.datasets", @@ -34,8 +34,7 @@ "Transition1X": "openqdc.datasets", } -_lazy_imports_mod = {"datasets": "openqdc.datasets", - "utils": "openqdc.utils"} +_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"} def __getattr__(name): @@ -64,8 +63,9 @@ def __dir__(): if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": # These types are imported lazily at runtime, but we need to tell type # checkers what they are. 
- from ._version import __version__ + from ._version import __version__ # noqa from .datasets.ani import ANI1, ANI1CCX, ANI1X # noqa + from .datasets.base import BaseDataset # noqa from .datasets.comp6 import COMP6 # noqa from .datasets.dess import DESS # noqa from .datasets.dummy import Dummy # noqa @@ -84,4 +84,3 @@ def __dir__(): from .datasets.tmqm import TMQM # noqa from .datasets.transition1x import Transition1X # noqa from .datasets.waterclusters3_30 import WaterClusters # noqa - from .datasets.base import BaseDataset # noqa diff --git a/openqdc/_version.py b/openqdc/_version.py index 0faccd4..63dc510 100644 --- a/openqdc/_version.py +++ b/openqdc/_version.py @@ -1,13 +1,11 @@ try: - from importlib.metadata import version - from importlib.metadata import PackageNotFoundError + from importlib.metadata import PackageNotFoundError, version except ModuleNotFoundError: # Try backported to PY<38 `importlib_metadata`. - from importlib_metadata import version - from importlib_metadata import PackageNotFoundError + from importlib_metadata import PackageNotFoundError, version try: __version__ = version("openqdc") except PackageNotFoundError: # package is not installed - __version__ = "dev" \ No newline at end of file + __version__ = "dev" diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py index 36e5cd0..3a6cef7 100644 --- a/openqdc/datasets/__init__.py +++ b/openqdc/datasets/__init__.py @@ -1,4 +1,5 @@ from .ani import ANI1, ANI1CCX, ANI1X # noqa +from .base import BaseDataset # noqa from .comp6 import COMP6 # noqa from .dess import DESS # noqa from .dummy import Dummy # noqa @@ -17,4 +18,3 @@ from .tmqm import TMQM # noqa from .transition1x import Transition1X # noqa from .waterclusters3_30 import WaterClusters # noqa -from .base import BaseDataset # noqa \ No newline at end of file diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index f0c1ab5..66cd42e 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ 
-128,8 +128,8 @@ def _post_init( self._set_units(energy_unit, distance_unit) self._convert_data() self._set_isolated_atom_energies() - - @classmethod + + @classmethod def no_init(cls): """ Class method to avoid the __init__ method to be called when the class is instanciated. @@ -530,11 +530,11 @@ def wrapper(idx): if return_idxs: datum["idxs"] = idxs return datum - + def as_iter(self, atoms: bool = False): """ Return the dataset as an iterator. - + Parameters ---------- atoms : bool, optional @@ -607,7 +607,7 @@ def __str__(self): def __repr__(self): return f"{self.__name__}" - + def __len__(self): return self.data["energies"].shape[0] @@ -643,4 +643,4 @@ def __getitem__(self, idx: int): name=name, subset=subset, forces=forces, - ) \ No newline at end of file + ) diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py index c6e3280..fa24480 100644 --- a/openqdc/utils/io.py +++ b/openqdc/utils/io.py @@ -6,16 +6,13 @@ import fsspec import h5py from ase.atoms import Atoms +from fsspec.callbacks import TqdmCallback from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem from rdkit.Chem import MolFromXYZFile -import fsspec -from fsspec.callbacks import TqdmCallback -from tqdm import tqdm -from loguru import logger - -gcp_filesys = fsspec.filesystem("gs") # entry point for google bucket (need gsutil permission) -gcp_filesys_public = fsspec.filesystem("https") # public API for download + +gcp_filesys = fsspec.filesystem("gs") # entry point for google bucket (need gsutil permission) +gcp_filesys_public = fsspec.filesystem("https") # public API for download local_filesys = LocalFileSystem() _OPENQDC_CACHE_DIR = ( @@ -58,22 +55,25 @@ def get_remote_cache(write_access=False) -> str: remote_cache = "https://storage.googleapis.com/qmdata-public/openqdc" return remote_cache + def push_remote(local_path, overwrite=True): """ Attempt to push file to remote gs path """ remote_path = local_path.replace(get_local_cache(), 
get_remote_cache(write_access=overwrite)) gcp_filesys.mkdirs(os.path.dirname(remote_path), exist_ok=False) - #print(f"Pushing {local_path} file to {remote_path}, ({gcp_filesys.exists(os.path.dirname(remote_path))})") + # print(f"Pushing {local_path} file to {remote_path}, ({gcp_filesys.exists(os.path.dirname(remote_path))})") if not gcp_filesys.exists(remote_path) or overwrite: - gcp_filesys.put_file(local_path, remote_path, + gcp_filesys.put_file( + local_path, + remote_path, callback=TqdmCallback( tqdm_kwargs={ "ascii": " ▖▘▝▗▚▞-", - "desc" : f"Uploading {os.path.basename(remote_path)}", - "unit" : "B", - }, - ) + "desc": f"Uploading {os.path.basename(remote_path)}", + "unit": "B", + }, + ), ) return remote_path @@ -85,14 +85,16 @@ def pull_locally(local_path, overwrite=False): remote_path = local_path.replace(get_local_cache(), get_remote_cache()) os.makedirs(os.path.dirname(local_path), exist_ok=True) if not os.path.exists(local_path) or overwrite: - gcp_filesys_public.get_file(remote_path, local_path, + gcp_filesys_public.get_file( + remote_path, + local_path, callback=TqdmCallback( tqdm_kwargs={ "ascii": " ▖▘▝▗▚▞-", - "desc" : f"Downloading {os.path.basename(remote_path)}", - "unit" : "B", - }, - ) + "desc": f"Downloading {os.path.basename(remote_path)}", + "unit": "B", + }, + ), ) return local_path From e6995a7da8c6d3d88268964d676953d35b2b75d7 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 11:10:54 -0700 Subject: [PATCH 07/28] DES fix, base utilies, CLI --- env.yml | 1 + openqdc/__init__.py | 6 +- openqdc/cli.py | 85 ++++++++++++++++++++++++++++ openqdc/datasets/__init__.py | 26 ++++++++- openqdc/datasets/base.py | 9 +++ openqdc/datasets/{dess.py => des.py} | 4 +- openqdc/utils/constants.py | 12 ++-- openqdc/utils/exceptions.py | 2 +- openqdc/utils/preprocess.py | 4 +- pyproject.toml | 3 + 10 files changed, 139 insertions(+), 13 deletions(-) create mode 100644 openqdc/cli.py rename openqdc/datasets/{dess.py => des.py} (98%) diff --git a/env.yml 
b/env.yml index a88b919..76d2c22 100644 --- a/env.yml +++ b/env.yml @@ -9,6 +9,7 @@ dependencies: - loguru - fsspec - gcsfs + - typer # Scientific - pandas diff --git a/openqdc/__init__.py b/openqdc/__init__.py index e4d9ac3..abf50ad 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -23,7 +23,7 @@ "OrbnetDenali": "openqdc.datasets", "SN2RXN": "openqdc.datasets", "QM7X": "openqdc.datasets", - "DESS": "openqdc.datasets", + "DES": "openqdc.datasets", "NablaDFT": "openqdc.datasets", "SolvatedPeptides": "openqdc.datasets", "WaterClusters": "openqdc.datasets", @@ -32,6 +32,7 @@ "PCQM_B3LYP": "openqdc.datasets", "PCQM_PM6": "openqdc.datasets", "Transition1X": "openqdc.datasets", + "AVAILABLE_DATASETS": "openqdc.datasets", } _lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"} @@ -64,10 +65,11 @@ def __dir__(): # These types are imported lazily at runtime, but we need to tell type # checkers what they are. from ._version import __version__ # noqa + from .datasets import AVAILABLE_DATASETS # noqa from .datasets.ani import ANI1, ANI1CCX, ANI1X # noqa from .datasets.base import BaseDataset # noqa from .datasets.comp6 import COMP6 # noqa - from .datasets.dess import DESS # noqa + from .datasets.des import DES # noqa from .datasets.dummy import Dummy # noqa from .datasets.gdml import GDML # noqa from .datasets.geom import GEOM # noqa diff --git a/openqdc/cli.py b/openqdc/cli.py new file mode 100644 index 0000000..72a57a3 --- /dev/null +++ b/openqdc/cli.py @@ -0,0 +1,85 @@ +import os +from typing import List, Optional + +import typer +from typing_extensions import Annotated +from openqdc import AVAILABLE_DATASETS +from loguru import logger +from prettytable import PrettyTable +from openqdc.raws.config_factory import DataConfigFactory +from openqdc.raws.fetch import DataDownloader + +app = typer.Typer(help="OpenQDC CLI") + +def exist_dataset(dataset): + if dataset not in AVAILABLE_DATASETS: + logger.error(f"{dataset} is not available. 
Please open an issue on Github for the team to look into it.") + return False + return True + +@app.command() +def download( + datasets: Annotated[ + List[str], + typer.Option( + help="List of datasets to download", + ), + ], + overwrite: Annotated[ + bool, + typer.Option( + help="Whether to overwrite the datasets", + ), + ] = False, + cache_dir : Annotated[ + Optional[str], + typer.Option( + help="Path to cache directory", + ), + ] = None, +): + """ + Download preprocessed datasets from openQDC. + """ + for dataset in list(map(lambda x : x.lower().replace("_",""),datasets)): + if exist_dataset(dataset): + if AVAILABLE_DATASETS[dataset].no_init().is_cached() and not overwrite: + logger.info(f"{dataset} is already cached. Skipping download") + else: + AVAILABLE_DATASETS[dataset](overwrite_local_cache=True, cache_dir=cache_dir) + +@app.command() +def datasets(): + """ + Print the available datasets. + """ + table = PrettyTable(['Name', 'Level of theories', 'Has forces'], max_width=25) + table._max_width={} + for dataset in AVAILABLE_DATASETS: + empty_dataset=AVAILABLE_DATASETS[dataset].no_init() + has_forces = False if not empty_dataset.__force_methods__ else True + table.add_row([dataset, ",".join(empty_dataset.__energy_methods__),has_forces]) + table.align="l" + print(table) + +@app.command() +def fetch( + datasets: Annotated[ + List[str], + typer.Option( + help="List of datasets to fetch", + ), + ], +): + """ + Download the raw datasets files from openQDC. 
+ """ + if datasets[0] == "all": + dataset_names = DataConfigFactory.available_datasets + + for dataset_name in dataset_names: + dd = DataDownloader() + dd.from_name(dataset_name) + +if __name__ == "__main__": + app() \ No newline at end of file diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py index 3a6cef7..7e5e0cf 100644 --- a/openqdc/datasets/__init__.py +++ b/openqdc/datasets/__init__.py @@ -1,7 +1,7 @@ from .ani import ANI1, ANI1CCX, ANI1X # noqa from .base import BaseDataset # noqa from .comp6 import COMP6 # noqa -from .dess import DESS # noqa +from .des import DES # noqa from .dummy import Dummy # noqa from .gdml import GDML # noqa from .geom import GEOM # noqa @@ -18,3 +18,27 @@ from .tmqm import TMQM # noqa from .transition1x import Transition1X # noqa from .waterclusters3_30 import WaterClusters # noqa + +AVAILABLE_DATASETS = { + "ani1": ANI1, + "ani1ccx": ANI1CCX, + "ani1x": ANI1X, + "comp6": COMP6, + "des": DES, + "gdml": GDML, + "geom": GEOM, + "iso17": ISO17, + "molecule3d": Molecule3D, + "nabladft": NablaDFT, + "orbnetdenali": OrbnetDenali, + "pcqmb3lyp": PCQM_B3LYP, + "pcqmpm6": PCQM_PM6, + "qm7x": QM7X, + "qmugs": QMugs, + "sn2rxn": SN2RXN, + "solvatedpeptides": SolvatedPeptides, + "spice": Spice, + "tmqm": TMQM, + "transition1x": Transition1X, + "watercluster": WaterClusters, +} diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 66cd42e..c65fdbc 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -392,6 +392,7 @@ def read_preprocess(self, overwrite_local_cache=False): self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(self.data_shapes[key]) filename = p_join(self.preprocess_path, "props.pkl") + pull_locally(filename, overwrite=overwrite_local_cache) with open(filename, "rb") as f: tmp = pkl.load(f) for key in ["name", "subset", "n_atoms"]: @@ -409,6 +410,14 @@ def is_preprocessed(self): predicats += [copy_exists(p_join(self.preprocess_path, 
"props.pkl"))] return all(predicats) + def is_cached(self): + """ + Check if the dataset is cached locally. + """ + predicats = [os.path.exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys] + predicats += [os.path.exists(p_join(self.preprocess_path, "props.pkl"))] + return all(predicats) + def is_preprocessed_statistics(self): return bool(copy_exists(p_join(self.preprocess_path, "stats.pkl"))) diff --git a/openqdc/datasets/dess.py b/openqdc/datasets/des.py similarity index 98% rename from openqdc/datasets/dess.py rename to openqdc/datasets/des.py index 80b1e1c..337a8e8 100644 --- a/openqdc/datasets/dess.py +++ b/openqdc/datasets/des.py @@ -32,8 +32,8 @@ def read_mol(mol_path, smiles, subset, targets): return res -class DESS(BaseDataset): - __name__ = "dess" +class DES(BaseDataset): + __name__ = "des" __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" diff --git a/openqdc/utils/constants.py b/openqdc/utils/constants.py index 7bf570b..642eb49 100644 --- a/openqdc/utils/constants.py +++ b/openqdc/utils/constants.py @@ -1,12 +1,14 @@ -NB_ATOMIC_FEATURES = 5 +from typing import Final, List -MAX_ATOMIC_NUMBER = 119 +NB_ATOMIC_FEATURES: Final[int] = 5 -HAR2EV = 27.211386246 +MAX_ATOMIC_NUMBER: Final[int] = 119 -BOHR2ANG = 0.52917721092 +HAR2EV: Final[float] = 27.211386246 -POSSIBLE_NORMALIZATION = ["formation", "total", "inter"] +BOHR2ANG: Final[float] = 0.52917721092 + +POSSIBLE_NORMALIZATION: Final[List[str]] = ["formation", "total", "inter"] NOT_DEFINED = { "mean": None, diff --git a/openqdc/utils/exceptions.py b/openqdc/utils/exceptions.py index 246d01c..e564a2c 100644 --- a/openqdc/utils/exceptions.py +++ b/openqdc/utils/exceptions.py @@ -60,7 +60,7 @@ def __init__(self, in_unit, out_unit): class ConversionAlreadyDefined(ConversionNotDefinedError): - """Raised when a conversion is not defined""" + """Raised when a conversion is already defined""" _error_message = """ Conversion from {in_unit} to {out_unit} 
is alread defined in the conversion registry. diff --git a/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py index b34499e..9e55412 100644 --- a/openqdc/utils/preprocess.py +++ b/openqdc/utils/preprocess.py @@ -9,7 +9,7 @@ datasets.ANI1CCX, datasets.ANI1X, datasets.COMP6, - datasets.DESS, + datasets.DES, datasets.GDML, datasets.GEOM, datasets.ISO17, @@ -40,7 +40,7 @@ def preprocess(dataset): else: data_class = options_map[dataset] - data_class().preprocess(overwrite=False) + data_class.no_init().preprocess(overwrite=False) data = data_class() logger.info(f"Preprocessing {data.__name__}") diff --git a/pyproject.toml b/pyproject.toml index d8c2135..dfd19c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,9 @@ classifiers = [ "Programming Language :: Python :: 3.11", ] +[project.scripts] +openqdc = "openqdc.cli:app" + [project.urls] Website = "https://github.com/OpenDrugDiscovery" "Source Code" = "https://github.com/OpenDrugDiscovery/openQDC" From 2a066371683ca9f7f7070cbb028383fe9e622de1 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 11:35:16 -0700 Subject: [PATCH 08/28] mkdocs dependency + fix --- env.yml | 1 + mkdocs.yml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/env.yml b/env.yml index 76d2c22..43c7df7 100644 --- a/env.yml +++ b/env.yml @@ -43,3 +43,4 @@ dependencies: - mkdocs-jupyter - markdown-include - mdx_truly_sane_lists + - mkdocstrings-python diff --git a/mkdocs.yml b/mkdocs.yml index e174b70..c1be218 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -58,13 +58,13 @@ plugins: - search - mkdocstrings: watch: - - src/ + - openqdc/ handlers: python: setup_commands: - import sys - sys.path.append("docs") - - sys.path.append("src") + - sys.path.append("openqdc") selection: new_path_syntax: yes rendering: From 5e179d0d18c03932055524987fd09378c7bda768 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 12:28:50 -0700 Subject: [PATCH 09/28] Tutorial update + mkdocs fix --- docs/tutorials/usage.ipynb | 
774 ++++++++++++++++++++++++++++--------- env.yml | 1 + openqdc/cli.py | 5 +- 3 files changed, 594 insertions(+), 186 deletions(-) diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb index b494396..cd71b6a 100644 --- a/docs/tutorials/usage.ipynb +++ b/docs/tutorials/usage.ipynb @@ -4,243 +4,267 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Really Hard example\n", + "# OpenQDC Hands On tutorial\n", "\n", "## Instantiate and GO!\n", "\n", - "If you don't have the dataset downloaded it will be downloaded automatically and cached. You just instantiate the class and you are ready to go." + "If you don't have the dataset downloaded it will be downloaded automatically and cached. You just instantiate the class and you are ready to go.\n", + "Change of units are done automatically on loading based on the units in the dataset." ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/opt/homebrew/Caskroom/miniconda/base/envs/qdc/lib/python3.11/site-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", - " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", - "\u001b[32m2023-10-31 11:43:09.510\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m236\u001b[0m - \u001b[1mReading preprocessed data\u001b[0m\n", - "\u001b[32m2023-10-31 11:43:09.511\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mspice data with the following units:\n", + "\u001b[32m2024-02-29 12:17:13.349\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m381\u001b[0m - \u001b[1mReading preprocessed data.\u001b[0m\n", + "\u001b[32m2024-02-29 12:17:13.349\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m382\u001b[0m - \u001b[1mDataset spice with the following units:\n", " Energy: hartree,\n", " Distance: bohr,\n", - " Forces: hartree/bohr\u001b[0m\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded atomic_inputs with shape (33175288, 5), dtype float32\n", - "Loaded position_idx_range with shape (1110165, 2), dtype int32\n", - "Loaded energies with shape (1110165, 1), dtype float32\n", - "Loaded forces with shape (33175288, 3, 1), dtype float32\n", - "Loaded name_uniques with shape (19155,), dtype " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "plt.scatter(\n", " embedding[:, 0],\n", " embedding[:, 1],\n", - " c=[ds[i] for i in datum[\"indices\"]])\n", + " c=[(ds[i].energies - ds[i][\"e0\"].sum() )/ ds.data[\"n_atoms\"][i] for i in datum[\"idxs\"]])\n", "plt.colorbar()\n" ] } ], "metadata": { "kernelspec": { - "display_name": "qdc", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -416,7 +824,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/env.yml b/env.yml index 43c7df7..53663c9 100644 --- a/env.yml +++ b/env.yml @@ -10,6 +10,7 @@ dependencies: - fsspec - gcsfs - typer + - prettytable # Scientific - pandas diff --git a/openqdc/cli.py b/openqdc/cli.py index 72a57a3..2a857d0 100644 --- a/openqdc/cli.py +++ b/openqdc/cli.py @@ -53,12 +53,11 @@ def datasets(): """ Print the available datasets. """ - table = PrettyTable(['Name', 'Level of theories', 'Has forces'], max_width=25) - table._max_width={} + table = PrettyTable(['Name', 'Forces', 'Level of theory']) for dataset in AVAILABLE_DATASETS: empty_dataset=AVAILABLE_DATASETS[dataset].no_init() has_forces = False if not empty_dataset.__force_methods__ else True - table.add_row([dataset, ",".join(empty_dataset.__energy_methods__),has_forces]) + table.add_row([dataset,has_forces, ",".join(empty_dataset.__energy_methods__)]) table.align="l" print(table) From caf4976d5bf1574dcc2808b582e32fff5ca3f653 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 29 Feb 2024 12:35:29 -0700 Subject: [PATCH 10/28] Updated readme --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index c143e95..4c5097b 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,26 @@ You can run tests locally with: pytest ``` +### Documentation + +You can build the documentation locally with: + +```bash +mkdocs serve 
+``` + +# Downloading Datasets + +A command line interface is available to download datasets or to list the available ones; run `openqdc --help` for details. + +```bash +# Display the available datasets +openqdc datasets + +# Download the Spice and QMugs datasets (repeat --datasets for each one) +openqdc download --datasets Spice --datasets QMugs +``` + # Overview of Datasets