Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Interaction Datasets #40

Merged
merged 51 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
bd3fcf9
started splitting datasets into 'interaction' and 'potential'
mcneela Mar 1, 2024
a800ea5
add num_unique_molecules property
mcneela Mar 1, 2024
9d6fca6
added logging
mcneela Mar 1, 2024
794e63f
started base interaction dataset
mcneela Mar 1, 2024
0db4765
add interaction __init__ file and revise potential __init__ file
mcneela Mar 4, 2024
6e5a002
add des370k interaction to config_factory.py
mcneela Mar 4, 2024
8e1e003
have BaseInteractionDataset inherit BaseDataset
mcneela Mar 4, 2024
d68bae6
implemented read_raw_entries for DES370K
mcneela Mar 4, 2024
5e94d67
finished implementation of DES370K interaction
mcneela Mar 4, 2024
3c9508b
finished implementation of DES370K interaction
mcneela Mar 4, 2024
768fb2e
update BaseDataset import path
mcneela Mar 4, 2024
8aeadd8
added Metcalf dataset
mcneela Mar 5, 2024
9cf6034
updated DES370K based on Prudencio's comments
mcneela Mar 5, 2024
ce2c53b
Merge branch 'interaction' into metcalf
mcneela Mar 5, 2024
6206665
added const molecule_groups lookup for DES370K dataset
mcneela Mar 5, 2024
5cb57d9
updated subsets for DES370K
mcneela Mar 5, 2024
e18b710
added download url for des5m_interaction
mcneela Mar 5, 2024
54cadbf
updated README with new datasets
mcneela Mar 5, 2024
7f83eb5
Merge branch 'metcalf' into interaction
mcneela Mar 5, 2024
a922ef7
Added DES5M dataset
mcneela Mar 5, 2024
2146058
added des_s66 dataset
mcneela Mar 6, 2024
4d9a4ba
added DESS66x8 dataset
mcneela Mar 6, 2024
c2229e3
small update to __init__ file
mcneela Mar 6, 2024
9349454
added L7 dataset
mcneela Mar 6, 2024
c3bdc64
added X40 dataset
mcneela Mar 6, 2024
23c0739
add new datasets to __init__.py
mcneela Mar 6, 2024
74f87a6
added splinter dataset
mcneela Mar 7, 2024
f046ea9
fixed a couple splinter things
mcneela Mar 7, 2024
3c84ee9
update default data shapes for interaction datasets
mcneela Mar 7, 2024
04c81ae
updated test_dummy.py with new import structure
mcneela Mar 7, 2024
11e2858
fix test_import.py
mcneela Mar 7, 2024
78f0423
code cleanup for the linter
mcneela Mar 8, 2024
bd58fdf
fix ani import
mcneela Mar 8, 2024
5dfcf55
Merge branch 'refactoring' into interaction
mcneela Mar 8, 2024
4bc3a49
fix base dataset import
mcneela Mar 8, 2024
b046eea
black formatting
mcneela Mar 8, 2024
fe54044
ran precommit
mcneela Mar 8, 2024
ef2528c
removed DES from datasets/__init__.py
mcneela Mar 8, 2024
c0ef5b1
removed DES from datasets/__init__.py
mcneela Mar 8, 2024
ad55296
fix X40 energy methods
mcneela Mar 8, 2024
0a51e7c
added interaction dataset docstrings
mcneela Mar 8, 2024
b6c3a6a
update readme with all interaction datasets
mcneela Mar 8, 2024
07f70b8
update metcalf __energy_methods__
mcneela Mar 8, 2024
1443450
refactored des370k and des5m
mcneela Mar 8, 2024
802b70b
update base interaction dataset to add n_atoms_first property
mcneela Mar 8, 2024
e969b54
update L7 and X40 to use python base yaml package
mcneela Mar 12, 2024
5725fed
modify interaction/base.py to save keys other than force/energy in pr…
mcneela Mar 13, 2024
6c6b286
fix base dataset issue
mcneela Mar 13, 2024
46c5ebe
fix circular imports
mcneela Mar 13, 2024
d5ec053
merge origin/develop into interaction
mcneela Mar 13, 2024
cb9987c
removed print statements
mcneela Mar 14, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions src/openqdc/datasets/interaction/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import importlib
import os
from typing import TYPE_CHECKING # noqa F401

# The below lazy import logic is coming from openff-toolkit:
# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44

# Dictionary of objects to lazily import; maps the object's name to its module path

_lazy_imports_obj = {
"BaseInteractionDataset": "openqdc.datasets.interaction.base",
"DES370K": "openqdc.datasets.interaction.des370k",
}

_lazy_imports_mod = {}


def __getattr__(name):
"""Lazily import objects from _lazy_imports_obj or _lazy_imports_mod

Note that this method is only called by Python if the name cannot be found
in the current module."""
obj_mod = _lazy_imports_obj.get(name)
if obj_mod is not None:
mod = importlib.import_module(obj_mod)
return mod.__dict__[name]

lazy_mod = _lazy_imports_mod.get(name)
if lazy_mod is not None:
return importlib.import_module(lazy_mod)

raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def __dir__():
    """Return the sorted module globals plus every lazily importable name."""
    names = list(globals())
    # Lazy names are not in globals() until requested, so add them explicitly.
    names.extend(_lazy_imports_obj)
    names.extend(_lazy_imports_mod)
    names.sort()
    return names


if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
from .base import BaseInteractionDataset
from .des370k import DES370K

__all__ = [
"BaseInteractionDataset",
"DES370K",
]
43 changes: 43 additions & 0 deletions src/openqdc/datasets/interaction/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import Dict, List, Optional, Union
from openqdc.utils.io import (
copy_exists,
dict_to_atoms,
get_local_cache,
load_hdf5_file,
load_pkl,
pull_locally,
push_remote,
set_cache_dir,
)
from openqdc.datasets.potential.base import BaseDataset

from loguru import logger

import numpy as np

class BaseInteractionDataset(BaseDataset):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The write and read preprocessed functions must be changed here, no? New keys have been added, so the base class must adapt those functions, no?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We must also change the logic of a few other functions to avoid the normalization of interaction energies, no @FNTwin?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I still need to update the preprocessing functions to add the new keys. I'm not familiar with the normalization of the energies, Cristian will probably be able to help with that.

def __init__(
    self,
    energy_unit: Optional[str] = None,
    distance_unit: Optional[str] = None,
    overwrite_local_cache: bool = False,
    cache_dir: Optional[str] = None,
) -> None:
    """Pass the four constructor options unchanged to the parent initializer."""
    parent_options = dict(
        energy_unit=energy_unit,
        distance_unit=distance_unit,
        overwrite_local_cache=overwrite_local_cache,
        cache_dir=cache_dir,
    )
    super().__init__(**parent_options)

def collate_list(self, list_entries: List[Dict]):
    """Merge per-sample entry dicts into a single dict of concatenated arrays.

    Args:
        list_entries: entry dicts as produced by ``read_raw_entries``; ``None``
            items are ignored. Keys whose value in the first valid entry is a
            ``dict`` are skipped, because they cannot be concatenated.

    Returns:
        A dict with every retained key concatenated along axis 0, plus
        ``"position_idx_range"``: an ``(n_entries, 2)`` int32 array of
        ``[start, end)`` offsets derived from the cumulative sum of
        ``"n_atoms"``.

    Raises:
        IndexError: if ``list_entries`` holds no non-``None`` entry.
        KeyError: if the entries carry no ``"n_atoms"`` key.
    """
    # Filter once, up front: the original picked its keys from list_entries[0]
    # and crashed when that first element was None, even though None entries
    # were otherwise tolerated.
    valid_entries = [entry for entry in list_entries if entry is not None]
    first = valid_entries[0]

    # concatenate entries
    res = {
        key: np.concatenate([entry[key] for entry in valid_entries], axis=0)
        for key, value in first.items()
        if not isinstance(value, dict)
    }

    csum = np.cumsum(res["n_atoms"])
    x = np.zeros((csum.shape[0], 2), dtype=np.int32)
    # Column 0 holds each entry's start offset, column 1 its end offset.
    x[1:, 0], x[:, 1] = csum[:-1], csum
    res["position_idx_range"] = x

    return res
115 changes: 115 additions & 0 deletions src/openqdc/datasets/interaction/des370k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import os
import numpy as np
import pandas as pd

from typing import Dict, List

from tqdm import tqdm
from loguru import logger
from openqdc.datasets.interaction import BaseInteractionDataset
from openqdc.utils.molecule import atom_table


class DES370K(BaseInteractionDataset):
__name__ = "des370k_interaction"
__energy_unit__ = "hartree"
__distance_unit__ = "ang"
__forces_unit__ = "hartree/ang"
__energy_methods__ = [
"mp2/cc-pvdz",
"mp2/cc-pvqz",
"mp2/cc-pvtz",
"mp2/cbs",
"ccsd(t)/cc-pvdz",
"ccsd(t)/cbs", # cbs
"ccsd(t)/nn", # nn
"sapt0/aug-cc-pwcvxz",
mcneela marked this conversation as resolved.
Show resolved Hide resolved
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
"sapt0/aug-cc-pwcvxz",
]

energy_target_names = [
"cc_MP2_all",
"qz_MP2_all",
"tz_MP2_all",
"cbs_MP2_all",
"cc_CCSD(T)_all",
"cbs_CCSD(T)_all",
"nn_CCSD(T)_all",
"sapt_all",
"sapt_es",
"sapt_ex",
"sapt_exs2",
"sapt_ind",
"sapt_exind",
"sapt_disp",
"sapt_exdisp_os",
"sapt_exdisp_ss",
"sapt_delta_HF",
]

def read_raw_entries(self) -> List[Dict]:
self.filepath = os.path.join(self.root, "DES370K.csv")
logger.info(f"Reading DES370K interaction data from {self.filepath}")
df = pd.read_csv(self.filepath)
data = []
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
smiles0, smiles1 = row["smiles0"], row["smiles1"]
charge0, charge1 = row["charge0"], row["charge1"]
natoms0, natoms1 = row["natoms0"], row["natoms1"]
pos = np.array(list(map(float, row["xyz"].split()))).reshape(-1, 3)
pos0 = pos[:natoms0]
pos1 = pos[natoms0:]

elements = row["elements"].split()
elements0 = np.array(elements[:natoms0])
elements1 = np.array(elements[natoms0:])

atomic_nums = np.expand_dims(np.array([atom_table.GetAtomicNumber(x) for x in elements]), axis=1)
atomic_nums0 = np.array(atomic_nums[:natoms0])
atomic_nums1 = np.array(atomic_nums[natoms0:])

charges = np.expand_dims(np.array([charge0] * natoms0 + [charge1] * natoms1), axis=1)

atomic_inputs = np.concatenate((atomic_nums, charges, pos), axis=-1, dtype=np.float32)
atomic_inputs0 = atomic_inputs[:natoms0, :]
atomic_inputs1 = atomic_inputs[natoms0:, :]

energies = np.array(row[self.energy_target_names].values).astype(np.float32)[None, :]

name = np.array([smiles0 + "." + smiles1])

item = dict(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we only need this. A lot of the information in the sub-dicts can be retrieved using only the info below!

        item = dict(
            energies=energies,
            subset=np.array(["DES370K"]), # In DESS they have subsets for each monomer, no? So maybe the subset here can be "subset1.subset2"
            n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
            n_atoms_first=np.array([natoms0], dtype=np.int32),
            atomic_inputs=atomic_inputs, # with n_atoms_first we can resplit this so we can leave this and split in the getitem
            name=name, # already smiles1 and smiles2 can be 
        )

mol0=dict(
smiles=smiles0,
atomic_inputs=atomic_inputs0,
n_atoms=natoms0,
charge=charge0,
elements=elements0,
atomic_nums=atomic_nums0,
pos=pos0,
),
mol1=dict(
smiles=smiles1,
atomic_inputs=atomic_inputs1,
n_atoms=natoms1,
charge=charge1,
elements=elements1,
atomic_nums=atomic_nums1,
pos=pos1,
),
energies=energies,
subset=np.array(["DES370K"]),
n_atoms=np.array([natoms0 + natoms1], dtype=np.int32),
atomic_inputs=atomic_inputs,
name=name,
)
data.append(item)
return data
Original file line number Diff line number Diff line change
Expand Up @@ -8,28 +8,28 @@
# Dictionary of objects to lazily import; maps the object's name to its module path

_lazy_imports_obj = {
"ANI1": "openqdc.datasets.ani",
"ANI1CCX": "openqdc.datasets.ani",
"ANI1X": "openqdc.datasets.ani",
"Spice": "openqdc.datasets.spice",
"GEOM": "openqdc.datasets.geom",
"QMugs": "openqdc.datasets.qmugs",
"ISO17": "openqdc.datasets.iso_17",
"COMP6": "openqdc.datasets.comp6",
"GDML": "openqdc.datasets.gdml",
"Molecule3D": "openqdc.datasets.molecule3d",
"OrbnetDenali": "openqdc.datasets.orbnet_denali",
"SN2RXN": "openqdc.datasets.sn2_rxn",
"QM7X": "openqdc.datasets.qm7x",
"DESS": "openqdc.datasets.dess",
"NablaDFT": "openqdc.datasets.nabladft",
"SolvatedPeptides": "openqdc.datasets.solvated_peptides",
"WaterClusters": "openqdc.datasets.waterclusters3_30",
"TMQM": "openqdc.datasets.tmqm",
"Dummy": "openqdc.datasets.dummy",
"PCQM_B3LYP": "openqdc.datasets.pcqm",
"PCQM_PM6": "openqdc.datasets.pcqm",
"Transition1X": "openqdc.datasets.transition1x",
"ANI1": "openqdc.datasets.potential.ani",
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"GEOM": "openqdc.datasets.potential.geom",
"QMugs": "openqdc.datasets.potential.qmugs",
"ISO17": "openqdc.datasets.potential.iso_17",
"COMP6": "openqdc.datasets.potential.comp6",
"GDML": "openqdc.datasets.potential.gdml",
"Molecule3D": "openqdc.datasets.potential.molecule3d",
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
"DESS": "openqdc.datasets.potential.dess",
"NablaDFT": "openqdc.datasets.potential.nabladft",
"SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
"WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
"TMQM": "openqdc.datasets.potential.tmqm",
"Dummy": "openqdc.datasets.potential.dummy",
"PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
"PCQM_PM6": "openqdc.datasets.potential.pcqm",
"Transition1X": "openqdc.datasets.potential.transition1x",
}

_lazy_imports_mod = {}
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ def _post_init(
self._convert_data()
self._set_isolated_atom_energies()

@classmethod
def no_init(cls):
    """Return a bare instance of ``cls`` made with ``__new__``, without calling ``__init__``."""
    instance = cls.__new__(cls)
    return instance

def _convert_data(self):
logger.info(
f"Converting {self.__name__} data to the following units:\n\
Expand Down Expand Up @@ -325,6 +329,7 @@ def read_raw_entries(self):

def collate_list(self, list_entries):
# concatenate entries
logger.info(f"list entries: {type(list_entries)}")
res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}

csum = np.cumsum(res.get("n_atoms"))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import join as p_join

from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5


class COMP6(BaseDataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
from tqdm import tqdm

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.molecule import get_atomic_number_and_charge


Expand Down Expand Up @@ -58,7 +58,6 @@ class DESS(BaseDataset):
"nn_CCSD(T)_all",
"sapt_all",
]
# ['qz_MP2_all', 'tz_MP2_all', 'cbs_MP2_all', 'sapt_all', 'nn_CCSD(T)_all']

partitions = ["DES370K", "DES5M"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from numpy import array
from sklearn.utils import Bunch

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory
from openqdc.utils.constants import NOT_DEFINED

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import join as p_join

from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5


class GDML(BaseDataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils import load_json, load_pkl
from openqdc.utils.molecule import get_atomic_number_and_charge

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import join as p_join

from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5


class ISO17(BaseDataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from rdkit import Chem
from tqdm import tqdm

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.molecule import get_atomic_number_and_charge


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import numpy as np
import pandas as pd

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.molecule import z_to_formula
from openqdc.utils.package_utils import requires_package

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pandas as pd

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.molecule import atom_table


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pandas as pd
from loguru import logger

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.io import get_local_cache, push_remote


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np
from tqdm import tqdm

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.io import load_hdf5_file


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import datamol as dm
import numpy as np

from openqdc.datasets.base import BaseDataset
from openqdc.datasets.potential.base import BaseDataset
from openqdc.utils.molecule import get_atomic_number_and_charge


Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import join as p_join

from openqdc.datasets.base import BaseDataset, read_qc_archive_h5
from openqdc.datasets.potential.base import BaseDataset, read_qc_archive_h5


class SN2RXN(BaseDataset):
Expand Down
Loading
Loading