valence-labs · FNTwin · Mar 8, 2024 · Feb 9, 2024 · Feb 9, 2024 · Feb 29, 2024
diff --git a/.github/workflows/code-check.yml b/.github/workflows/code-check.yml
@@ -24,7 +24,7 @@ jobs:
 
       - name: Install black
         run: |
-          pip install black>=23
+          pip install "black>=24"
 
       - name: Lint
         run: black --check .

diff --git a/README.md b/README.md
@@ -19,6 +19,29 @@ You can run tests locally with:
 pytest
 ```
 
+### Documentation
+
+You can build the documentation locally with:
+
+```bash
+mkdocs serve
+```
+
+# Downloading Datasets
+
+A command line interface is available to download datasets or to see which datasets are available. For more information, please run `openqdc --help`.
+
+```bash
+# Display the available datasets
+openqdc datasets
+
+# Display the help message for the download command
+openqdc download --help
+
+# Download the Spice and QMugs dataset
+openqdc download Spice QMugs
+```
+
 # Overview of Datasets
 
 <!-- Create a table with the following columns
@@ -32,17 +55,30 @@ pytest
 
 We provide support for the following publicly available QM Datasets.
 
+# Potential Energy
+
 | Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
 | --- | --- | --- | --- | --- | --- | --- | --- |
+| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes |
 | [GEOM](https://www.nature.com/articles/s41597-022-01288-4) |  450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No |
 | [Molecule3D](https://arxiv.org/abs/2110.01717) |  3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No |
 | [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) |  1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | |
+| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
+| [PCQM_PM6](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740) | | | 1| No| | PM6 | No |
+| [PCQM_B3LYP](https://arxiv.org/abs/2305.18454) | 85,938,443|85,938,443 | 1| No| | B3LYP/6-31G* | No |
 | [QMugs](https://www.nature.com/articles/s41597-022-01390-7) |  665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No |
+| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |
+| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452,709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
+| [SolvatedPeptides](https://doi.org/10.1021/acs.jctc.9b00181) |   | 2,731,180 |  | Yes |  | revPBE-D3(BJ)/def2-TZVP |  |
 | [Spice](https://arxiv.org/abs/2209.10702) |  19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes |
-| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) |  57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes |
-| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | |  | No | | TPSSh-D3BJ/def2-SVP | |
+| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) |  86,665 | 86,665| 1| No | | TPSSh-D3BJ/def2-SVP | |
+| [Transition1X](https://www.nature.com/articles/s41597-022-01870-w) |   | 9,654,813| | Yes | | ωB97x/6-31G(d) | Yes |
+| [WaterClusters](https://doi.org/10.1063/1.5128378) | 1  | 4,464,740| | No | 2 | TTM2.1-F | Yes|
+
+
+# Interaction energy
+
+| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations|
+| --- | --- | --- | --- | --- | --- | --- | --- |
 | [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes |
 | [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes |
-| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes |
-| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | |
-| [QM7X](https://www.nature.com/articles/s41597-021-00812-2) |  6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes |
diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb
diff --git a/env.yml b/env.yml
@@ -9,6 +9,8 @@ dependencies:
   - loguru
   - fsspec
   - gcsfs
+  - typer
+  - prettytable
 
   # Scientific
   - pandas
@@ -28,7 +30,7 @@ dependencies:
   - pytest >=6.0
   - pytest-cov
   - nbconvert
-  - black >=23
+  - black >=24
   - jupyterlab
   - pre-commit
   - ruff
@@ -42,3 +44,4 @@ dependencies:
   - mkdocs-jupyter
   - markdown-include
   - mdx_truly_sane_lists
+  - mkdocstrings-python
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -58,13 +58,13 @@ plugins:
   - search
   - mkdocstrings:
       watch:
-        - src/
+        - openqdc/
       handlers:
         python:
           setup_commands:
             - import sys
             - sys.path.append("docs")
-            - sys.path.append("src")
+            - sys.path.append("openqdc")
           selection:
             new_path_syntax: yes
           rendering:

diff --git a/openqdc/__init__.py b/openqdc/__init__.py
@@ -0,0 +1,88 @@
+import importlib
+import os
+from typing import TYPE_CHECKING  # noqa F401
+
+# The below lazy import logic is coming from openff-toolkit:
+# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44
+
+# Dictionary of objects to lazily import; maps the object's name to its module path
+
+_lazy_imports_obj = {
+    "__version__": "openqdc._version",
+    "BaseDataset": "openqdc.datasets.base",
+    "ANI1": "openqdc.datasets.potential.ani",
+    "ANI1CCX": "openqdc.datasets.potential.ani",
+    "ANI1X": "openqdc.datasets.potential.ani",
+    "Spice": "openqdc.datasets.potential.spice",
+    "GEOM": "openqdc.datasets.potential.geom",
+    "QMugs": "openqdc.datasets.potential.qmugs",
+    "ISO17": "openqdc.datasets.potential.iso_17",
+    "COMP6": "openqdc.datasets.potential.comp6",
+    "GDML": "openqdc.datasets.potential.gdml",
+    "Molecule3D": "openqdc.datasets.potential.molecule3d",
+    "OrbnetDenali": "openqdc.datasets.potential.orbnet_denali",
+    "SN2RXN": "openqdc.datasets.potential.sn2_rxn",
+    "QM7X": "openqdc.datasets.potential.qm7x",
+    "DES": "openqdc.datasets.interaction.des",
+    "NablaDFT": "openqdc.datasets.potential.nabladft",
+    "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
+    "WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
+    "TMQM": "openqdc.datasets.potential.tmqm",
+    "Dummy": "openqdc.datasets.potential.dummy",
+    "PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
+    "PCQM_PM6": "openqdc.datasets.potential.pcqm",
+    "Transition1X": "openqdc.datasets.potential.transition1x",
+    "AVAILABLE_DATASETS": "openqdc.datasets",
+}
+
+_lazy_imports_mod = {"datasets": "openqdc.datasets", "utils": "openqdc.utils"}
+
+
+def __getattr__(name):
+    """Resolve ``name`` on first access through the lazy-import tables.
+
+    Python only calls a module-level ``__getattr__`` when the name cannot be
+    found in the current module. Names listed in ``_lazy_imports_obj`` are
+    fetched from the module they map to; names listed in ``_lazy_imports_mod``
+    resolve to the imported module itself.
+
+    Raises:
+        AttributeError: if ``name`` appears in neither table.
+    """
+    owner_path = _lazy_imports_obj.get(name)
+    if owner_path is not None:
+        # Object entry: import the owning module and pull the object out of it.
+        return vars(importlib.import_module(owner_path))[name]
+
+    module_path = _lazy_imports_mod.get(name)
+    if module_path is not None:
+        # Module entry: the imported module is itself the attribute value.
+        return importlib.import_module(module_path)
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+def __dir__():
+    """Return the sorted module globals plus every lazily importable name."""
+    names = list(globals())
+    names.extend(_lazy_imports_obj)
+    names.extend(_lazy_imports_mod)
+    names.sort()
+    return names
+
+
+if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1":
+    # These types are imported lazily at runtime, but we need to tell type
+    # checkers what they are.
+    from ._version import __version__  # noqa
+    from .datasets import AVAILABLE_DATASETS  # noqa
+    from .datasets.base import BaseDataset  # noqa
+    from .datasets.interaction.des import DES  # noqa
+    from .datasets.potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
+    from .datasets.potential.comp6 import COMP6  # noqa
+    from .datasets.potential.dummy import Dummy  # noqa
+    from .datasets.potential.gdml import GDML  # noqa
+    from .datasets.potential.geom import GEOM  # noqa
+    from .datasets.potential.iso_17 import ISO17  # noqa
+    from .datasets.potential.molecule3d import Molecule3D  # noqa
+    from .datasets.potential.nabladft import NablaDFT  # noqa
+    from .datasets.potential.orbnet_denali import OrbnetDenali  # noqa
+    from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
+    from .datasets.potential.qm7x import QM7X  # noqa
+    from .datasets.potential.qmugs import QMugs  # noqa
+    from .datasets.potential.sn2_rxn import SN2RXN  # noqa
+    from .datasets.potential.solvated_peptides import SolvatedPeptides  # noqa
+    from .datasets.potential.spice import Spice  # noqa
+    from .datasets.potential.tmqm import TMQM  # noqa
+    from .datasets.potential.transition1x import Transition1X  # noqa
+    from .datasets.potential.waterclusters3_30 import WaterClusters  # noqa
diff --git a/openqdc/_version.py b/openqdc/_version.py
@@ -0,0 +1,11 @@
+try:
+    from importlib.metadata import PackageNotFoundError, version
+except ModuleNotFoundError:
+    # Try backported to PY<38 `importlib_metadata`.
+    from importlib_metadata import PackageNotFoundError, version
+
+try:
+    __version__ = version("openqdc")
+except PackageNotFoundError:
+    # package is not installed
+    __version__ = "dev"
diff --git a/openqdc/cli.py b/openqdc/cli.py
@@ -0,0 +1,86 @@
+from typing import List, Optional
+
+import typer
+from loguru import logger
+from prettytable import PrettyTable
+from typing_extensions import Annotated
+
+from openqdc import AVAILABLE_DATASETS
+from openqdc.raws.config_factory import DataConfigFactory
+from openqdc.raws.fetch import DataDownloader
+
+app = typer.Typer(help="OpenQDC CLI")
+
+
+def exist_dataset(dataset):
+    """Tell whether ``dataset`` is a key of ``AVAILABLE_DATASETS``.
+
+    An unknown name is reported through the logger rather than raised.
+
+    Returns:
+        bool: True when the dataset is registered, False otherwise.
+    """
+    if dataset in AVAILABLE_DATASETS:
+        return True
+    logger.error(f"{dataset} is not available. Please open an issue on Github for the team to look into it.")
+    return False
+
+
+@app.command()
+def download(
+    datasets: List[str],
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to overwrite or force the re-download of the datasets.",
+        ),
+    ] = False,
+    cache_dir: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used.",
+        ),
+    ] = None,
+):
+    """
+    Download preprocessed ml-ready datasets from the main openQDC hub.
+
+    Example:
+        openqdc download Spice QMugs
+    """
+    # Normalise user input to lower case without underscores before the lookup.
+    for raw_name in datasets:
+        key = raw_name.lower().replace("_", "")
+        if not exist_dataset(key):
+            # exist_dataset has already logged the problem; move on to the next name.
+            continue
+        dataset_cls = AVAILABLE_DATASETS[key]
+        already_cached = dataset_cls.no_init().is_cached()
+        if already_cached and not overwrite:
+            logger.info(f"{key} is already cached. Skipping download")
+            continue
+        # Instantiating the dataset class with these arguments is what performs the download here.
+        dataset_cls(overwrite_local_cache=True, cache_dir=cache_dir)
+
+
+@app.command()
+def datasets():
+    """
+    Print a table of the available openQDC datasets and some information about them.
+    """
+    # Columns: registry key, whether any force methods are declared, and the
+    # comma-joined energy methods of the dataset class.
+    table = PrettyTable(["Name", "Forces", "Level of theory"])
+    for name, dataset_cls in AVAILABLE_DATASETS.items():
+        # NOTE(review): no_init() presumably builds an instance without loading data — confirm.
+        empty_dataset = dataset_cls.no_init()
+        has_forces = bool(empty_dataset.__force_methods__)
+        table.add_row([name, has_forces, ",".join(empty_dataset.__energy_methods__)])
+    table.align = "l"
+    print(table)
+
+
+@app.command()
+def fetch(datasets: List[str]):
+    """
+    Download the raw datasets files from the main openQDC hub.
+    Special case: if the dataset is "all", all available datasets will be downloaded.
+
+    Example:
+        openqdc fetch Spice
+    """
+    # Only the first argument is inspected for the "all" keyword.
+    fetch_everything = datasets[0] == "all"
+    dataset_names = DataConfigFactory.available_datasets if fetch_everything else datasets
+
+    for dataset_name in dataset_names:
+        # A fresh downloader is created for every dataset.
+        DataDownloader().from_name(dataset_name)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/openqdc/datasets/__init__.py b/openqdc/datasets/__init__.py
@@ -0,0 +1,44 @@
+from .base import BaseDataset  # noqa
+from .interaction import DES  # noqa
+from .potential.ani import ANI1, ANI1CCX, ANI1X  # noqa
+from .potential.comp6 import COMP6  # noqa
+from .potential.dummy import Dummy  # noqa
+from .potential.gdml import GDML  # noqa
+from .potential.geom import GEOM  # noqa
+from .potential.iso_17 import ISO17  # noqa
+from .potential.molecule3d import Molecule3D  # noqa
+from .potential.nabladft import NablaDFT  # noqa
+from .potential.orbnet_denali import OrbnetDenali  # noqa
+from .potential.pcqm import PCQM_B3LYP, PCQM_PM6  # noqa
+from .potential.qm7x import QM7X  # noqa
+from .potential.qmugs import QMugs  # noqa
+from .potential.sn2_rxn import SN2RXN  # noqa
+from .potential.solvated_peptides import SolvatedPeptides  # noqa
+from .potential.spice import Spice  # noqa
+from .potential.tmqm import TMQM  # noqa
+from .potential.transition1x import Transition1X  # noqa
+from .potential.waterclusters3_30 import WaterClusters  # noqa
+
+# Registry mapping a lookup key (lower case, no underscores) to its dataset class.
+# NOTE(review): the "watercluster" key is singular while the class is WaterClusters.
+# The CLI normalises user input with .lower().replace("_", ""), so "WaterClusters"
+# becomes "waterclusters", which is not a key here — confirm whether this is intended.
+AVAILABLE_DATASETS = {
+    "ani1": ANI1,
+    "ani1ccx": ANI1CCX,
+    "ani1x": ANI1X,
+    "comp6": COMP6,
+    "des": DES,
+    "gdml": GDML,
+    "geom": GEOM,
+    "iso17": ISO17,
+    "molecule3d": Molecule3D,
+    "nabladft": NablaDFT,
+    "orbnetdenali": OrbnetDenali,
+    "pcqmb3lyp": PCQM_B3LYP,
+    "pcqmpm6": PCQM_PM6,
+    "qm7x": QM7X,
+    "qmugs": QMugs,
+    "sn2rxn": SN2RXN,
+    "solvatedpeptides": SolvatedPeptides,
+    "spice": Spice,
+    "tmqm": TMQM,
+    "transition1x": Transition1X,
+    "watercluster": WaterClusters,
+}