Skip to content

Commit

Permalink
Merge pull request #76 from OpenDrugDiscovery/release
Browse files Browse the repository at this point in the history
Release
  • Loading branch information
prtos authored Jun 8, 2024
2 parents dddcea2 + 10dc009 commit 96465c5
Show file tree
Hide file tree
Showing 83 changed files with 6,567 additions and 4,763 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ jobs:
- name: Install library
run: python -m pip install --no-deps .

- name: Check directory
run: ls

- name: Run tests
run: pytest
run: python -m pytest

- name: Test building the doc
run: mkdocs build
#- name: Test building the doc
# run: mkdocs build
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,8 @@ We also provide support for the following publicly available QM Noncovalent Inte
| [Splinter](https://www.nature.com/articles/s41597-023-02443-1) |
| [X40](https://pubs.acs.org/doi/10.1021/ct300647k) |
| [L7](https://pubs.acs.org/doi/10.1021/ct400036b) |

# How to cite
All data presented in OpenQDC have already been published in scientific journals; the full reference to the respective paper is attached to each dataset class. When citing data obtained from OpenQDC, you should cite both the original paper(s) the data come from and our paper on OpenQDC itself. The reference is:

ADD REF HERE LATER
5 changes: 0 additions & 5 deletions docs/API/isolated_atom_energies.md

This file was deleted.

3 changes: 3 additions & 0 deletions docs/API/methods.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# QM Methods

::: openqdc.methods
11 changes: 6 additions & 5 deletions docs/tutorials/usage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,7 @@
"\n",
"$U(A_1, A_2, ...) = \\sum_{i_1}^N e_0(A_i) + e(A_1, A_2, ...)$\n",
"\n",
"The isolated atoms energies are automatically used inside the datasets for the correct level of theory, but you can also use them directly by accessing the IsolatedAtomEnergyFactor class."
"The isolated atoms energies are automatically associated with the correct level of theory, and you can get access as follow"
]
},
{
Expand Down Expand Up @@ -715,10 +715,11 @@
}
],
"source": [
"from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory\n",
"from openqdc.methods import QmMethod\n",
"\n",
"# Get the hasmap of isolated atom energies for the b3lyp/6-31g* method\n",
"IsolatedAtomEnergyFactory.get(\"b3lyp/6-31g*\")"
"# Get the b3lyp/6-31g* method\n",
"method = QmMethod.B3LYP_6_31G_D\n",
"method.atom_energies_dict"
]
},
{
Expand All @@ -745,7 +746,7 @@
],
"source": [
"# Get the matrix of atomization energies for the b3lyp/6-31g* method\n",
"IsolatedAtomEnergyFactory.get_matrix(\"b3lyp/6-31g*\")"
"method.atom_energies_matrix"
]
},
{
Expand Down
4 changes: 2 additions & 2 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ docs_dir: "docs"
nav:
- Overview: index.md
- Available Datasets: datasets.md
- Tutorials:
- Really hard example: tutorials/usage.ipynb
#- Tutorials:
# #- Really hard example: tutorials/usage.ipynb
- API:
- Datasets: API/available_datasets.md
- Isolated Atoms Energies: API/isolated_atom_energies.md
Expand Down
80 changes: 55 additions & 25 deletions openqdc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,61 @@
import importlib
import os
from typing import TYPE_CHECKING # noqa F401
from typing import TYPE_CHECKING

# The below lazy import logic is coming from openff-toolkit:
# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44


# Dictionary of objects to lazily import; maps the object's name to its module path
def get_project_root():
    """Return the absolute path of the directory two levels above this file."""
    # First step up: the directory that holds this module.
    package_dir = os.path.dirname(os.path.abspath(__file__))
    # Second step up: the parent of that directory.
    return os.path.dirname(package_dir)


_lazy_imports_obj = {
"__version__": "openqdc._version",
"BaseDataset": "openqdc.datasets.base",
# POTENTIAL
"ANI1": "openqdc.datasets.potential.ani",
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1CCX_V2": "openqdc.datasets.potential.ani",
"ANI1X": "openqdc.datasets.potential.ani",
"Spice": "openqdc.datasets.potential.spice",
"SpiceV2": "openqdc.datasets.potential.spice",
"SpiceVL2": "openqdc.datasets.potential.spice",
"GEOM": "openqdc.datasets.potential.geom",
"QMugs": "openqdc.datasets.potential.qmugs",
"QMugs_V2": "openqdc.datasets.potential.qmugs",
"ISO17": "openqdc.datasets.potential.iso_17",
"COMP6": "openqdc.datasets.potential.comp6",
"GDML": "openqdc.datasets.potential.gdml",
"Molecule3D": "openqdc.datasets.potential.molecule3d",
"OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
"SN2RXN": "openqdc.datasets.potential.sn2_rxn",
"QM7X": "openqdc.datasets.potential.qm7x",
"QM7X_V2": "openqdc.datasets.potential.qm7x",
"NablaDFT": "openqdc.datasets.potential.nabladft",
"SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
"WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
"TMQM": "openqdc.datasets.potential.tmqm",
"Dummy": "openqdc.datasets.potential.dummy",
"PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
"PCQM_PM6": "openqdc.datasets.potential.pcqm",
"RevMD17": "openqdc.datasets.potential.revmd17",
"MD22": "openqdc.datasets.potential.md22",
"Transition1X": "openqdc.datasets.potential.transition1x",
"MultixcQM9": "openqdc.datasets.potential.multixcqm9",
"MultixcQM9_V2": "openqdc.datasets.potential.multixcqm9",
# INTERACTION
"DES5M": "openqdc.datasets.interaction.des",
"DES370K": "openqdc.datasets.interaction.des",
"DESS66": "openqdc.datasets.interaction.des",
"DESS66x8": "openqdc.datasets.interaction.des",
"L7": "openqdc.datasets.interaction.l7",
"X40": "openqdc.datasets.interaction.x40",
"Metcalf": "openqdc.datasets.interaction.metcalf",
"Splinter": "openqdc.datasets.interaction.splinter",
# DEBUG
"Dummy": "openqdc.datasets.potential.dummy",
# ALL
"AVAILABLE_DATASETS": "openqdc.datasets",
"AVAILABLE_POTENTIAL_DATASETS": "openqdc.datasets.potential",
"AVAILABLE_INTERACTION_DATASETS": "openqdc.datasets.interaction",
Expand Down Expand Up @@ -68,26 +90,34 @@ def __dir__():
if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
# These types are imported lazily at runtime, but we need to tell type
# checkers what they are.
from ._version import __version__ # noqa
from .datasets import AVAILABLE_DATASETS # noqa
from .datasets.base import BaseDataset # noqa
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X # noqa
from .datasets.potential.comp6 import COMP6 # noqa
from .datasets.potential.dummy import Dummy # noqa
from .datasets.potential.gdml import GDML # noqa
from .datasets.potential.geom import GEOM # noqa
from .datasets.potential.iso_17 import ISO17 # noqa
from .datasets.potential.molecule3d import Molecule3D # noqa
from .datasets.potential.multixcqm9 import MultixcQM9 # noqa
from .datasets.potential.nabladft import NablaDFT # noqa
from .datasets.potential.orbnet_denali import OrbnetDenali # noqa
from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 # noqa
from .datasets.potential.qm7x import QM7X # noqa
from .datasets.potential.qmugs import QMugs # noqa
from .datasets.potential.revmd17 import RevMD17 # noqa
from .datasets.potential.sn2_rxn import SN2RXN # noqa
from .datasets.potential.solvated_peptides import SolvatedPeptides # noqa
from .datasets.potential.spice import Spice, SpiceV2 # noqa
from .datasets.potential.tmqm import TMQM # noqa
from .datasets.potential.transition1x import Transition1X # noqa
from .datasets.potential.waterclusters3_30 import WaterClusters # noqa
from ._version import __version__
from .datasets import AVAILABLE_DATASETS
from .datasets.base import BaseDataset

# INTERACTION
from .datasets.interaction.des import DES5M, DES370K, DESS66, DESS66x8
from .datasets.interaction.l7 import L7
from .datasets.interaction.metcalf import Metcalf
from .datasets.interaction.splinter import Splinter
from .datasets.interaction.x40 import X40
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X
from .datasets.potential.comp6 import COMP6
from .datasets.potential.dummy import Dummy
from .datasets.potential.gdml import GDML
from .datasets.potential.geom import GEOM
from .datasets.potential.iso_17 import ISO17
from .datasets.potential.md22 import MD22
from .datasets.potential.molecule3d import Molecule3D
from .datasets.potential.multixcqm9 import MultixcQM9, MultixcQM9_V2
from .datasets.potential.nabladft import NablaDFT
from .datasets.potential.orbnet_denali import OrbnetDenali
from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6
from .datasets.potential.qm7x import QM7X, QM7X_V2
from .datasets.potential.qmugs import QMugs, QMugs_V2
from .datasets.potential.revmd17 import RevMD17
from .datasets.potential.sn2_rxn import SN2RXN
from .datasets.potential.solvated_peptides import SolvatedPeptides
from .datasets.potential.spice import Spice, SpiceV2, SpiceVL2
from .datasets.potential.tmqm import TMQM
from .datasets.potential.transition1x import Transition1X
from .datasets.potential.waterclusters3_30 import WaterClusters
92 changes: 77 additions & 15 deletions openqdc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@
import typer
from loguru import logger
from prettytable import PrettyTable
from rich import print
from typing_extensions import Annotated

from openqdc import AVAILABLE_DATASETS, AVAILABLE_POTENTIAL_DATASETS
from openqdc.raws.config_factory import DataConfigFactory
from openqdc.raws.fetch import DataDownloader
from openqdc.datasets import COMMON_MAP_POTENTIALS # noqa
from openqdc.datasets import (
AVAILABLE_DATASETS,
AVAILABLE_INTERACTION_DATASETS,
AVAILABLE_POTENTIAL_DATASETS,
)

app = typer.Typer(help="OpenQDC CLI")

Expand All @@ -20,10 +24,12 @@ def exist_dataset(dataset):


def format_entry(empty_dataset, max_num_to_display=6):
    """
    Build the comma-separated list of energy methods displayed for a dataset.

    Parameters
    ----------
    empty_dataset:
        Object exposing ``__energy_methods__``; every entry is converted with
        ``str`` before being joined.
    max_num_to_display: int
        Maximum number of methods written out. When the dataset has more
        methods than this, the list is cut at this number and "..." is appended.

    Returns
    -------
    str
        The joined (and possibly truncated) method names.
    """
    energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
    entry = ",".join(energy_methods[:max_num_to_display])
    # Compare against the same limit that is used for slicing so the
    # truncation marker always matches what was actually cut off.
    if len(energy_methods) > max_num_to_display:
        entry += "..."
    return entry


Expand Down Expand Up @@ -65,7 +71,7 @@ def datasets():
table = PrettyTable(["Name", "Type of Energy", "Forces", "Level of theory"])
for dataset in AVAILABLE_DATASETS:
empty_dataset = AVAILABLE_DATASETS[dataset].no_init()
has_forces = False if not empty_dataset.__force_methods__ else True
has_forces = False if not any(empty_dataset.force_mask) else True
en_type = "Potential" if dataset in AVAILABLE_POTENTIAL_DATASETS else "Interaction"
table.add_row(
[
Expand All @@ -80,22 +86,78 @@ def datasets():


@app.command()
def fetch(
    datasets: List[str],
    overwrite: Annotated[
        bool,
        typer.Option(
            help="Whether to overwrite or force the re-download of the files.",
        ),
    ] = False,
    cache_dir: Annotated[
        Optional[str],
        typer.Option(
            help="Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used.",
        ),
    ] = None,
):
    """
    Download the raw datasets files from the main openQDC hub.

    overwrite:
        If True, the files will be re-downloaded and overwritten.
    cache_dir:
        Path to the cache. If not provided, the default cache directory will be used.

    Special cases: if the first dataset name is "all", "potential" or "interaction"
    (case-insensitive), a whole group of datasets is selected instead.
        all: all available datasets will be downloaded.
        potential: all the potential datasets will be downloaded.
        interaction: all the interaction datasets will be downloaded.

    Example:
        openqdc fetch Spice
    """
    # Only the first argument is inspected for a group keyword; any other
    # value means the arguments are taken as explicit dataset names.
    groups = {
        "all": AVAILABLE_DATASETS,
        "potential": AVAILABLE_POTENTIAL_DATASETS,
        "interaction": AVAILABLE_INTERACTION_DATASETS,
    }
    dataset_names = groups.get(datasets[0].lower(), datasets)

    for name in dataset_names:
        # Names are normalised (lower-case, underscores removed) before the lookup.
        # NOTE(review): presumably AVAILABLE_DATASETS is keyed this way — confirm.
        dataset = name.lower().replace("_", "")
        if exist_dataset(dataset):
            try:
                AVAILABLE_DATASETS[dataset].fetch(cache_dir, overwrite)
            except Exception as e:
                # The failure is only logged, so the loop moves on to the next dataset.
                logger.error(f"Something unexpected happened while fetching {dataset}: {repr(e)}")


@app.command()
def preprocess(
    datasets: List[str],
    # NOTE(review): this help text mentions "re-download" and looks copied
    # from the `fetch` command — confirm the intended wording.
    overwrite: Annotated[
        bool,
        typer.Option(
            help="Whether to overwrite or force the re-download of the datasets.",
        ),
    ] = True,
    upload: Annotated[
        bool,
        typer.Option(
            help="Whether to try the upload to the remote storage.",
        ),
    ] = False,
):
    """
    Preprocess a raw dataset (previously fetched) into an openqdc dataset and optionally push it to remote.
    """
    for name in datasets:
        # Names are normalised (lower-case, underscores removed) before the lookup.
        dataset = name.lower().replace("_", "")
        if exist_dataset(dataset):
            dataset_cls = AVAILABLE_DATASETS[dataset]
            logger.info(f"Preprocessing {dataset_cls.__name__}")
            try:
                # `upload` and `overwrite` are forwarded unchanged to the dataset.
                dataset_cls.no_init().preprocess(upload=upload, overwrite=overwrite)
            except Exception as e:
                logger.error(f"Error while preprocessing {dataset}. {e}. Did you fetch the dataset first?")
                # Unlike `fetch`, a failure here stops the command: the
                # exception is re-raised after being logged.
                raise


if __name__ == "__main__":
Expand Down
28 changes: 26 additions & 2 deletions openqdc/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,28 @@
from .interaction import AVAILABLE_INTERACTION_DATASETS # noqa
from .potential import AVAILABLE_POTENTIAL_DATASETS # noqa
from .interaction import *
from .potential import *

AVAILABLE_DATASETS = {**AVAILABLE_POTENTIAL_DATASETS, **AVAILABLE_INTERACTION_DATASETS}


def _level_of_theory_overlap(dataset_collection):
    """
    Find the energy methods that are provided by more than one dataset.

    Parameters
    ----------
    dataset_collection: dict
        Mapping from dataset name to an object providing ``no_init()``; the
        object returned by ``no_init()`` must expose ``energy_methods``.

    Returns
    -------
    dict
        Maps each energy method found in at least two entries to the list of
        dataset names (lower-cased, underscores removed) that provide it.
        Methods that appear only once are left out.
    """
    # Collect the methods per normalised dataset name first; if two names
    # normalise to the same key, the later one replaces the earlier one.
    methods_by_dataset = {
        name.lower().replace("_", ""): dataset_cls.no_init().energy_methods
        for name, dataset_cls in dataset_collection.items()
    }

    # Invert the mapping: method -> names of the datasets that list it.
    datasets_by_method = {}
    for dataset_name, methods in methods_by_dataset.items():
        for method in methods:
            datasets_by_method.setdefault(method, []).append(dataset_name)

    return {method: names for method, names in datasets_by_method.items() if len(names) > 1}


# Energy methods shared by at least two datasets, computed separately for the
# potential and the interaction collections.
# NOTE: these run at import time and call `no_init()` on every entry of the
# respective collection.
COMMON_MAP_POTENTIALS = _level_of_theory_overlap(AVAILABLE_POTENTIAL_DATASETS)
COMMON_MAP_INTERACTIONS = _level_of_theory_overlap(AVAILABLE_INTERACTION_DATASETS)
Loading

0 comments on commit 96465c5

Please sign in to comment.