From 1d1d2f398c77e3d6bd632915fc44a8f06609958d Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 20 Oct 2023 15:59:48 -0400 Subject: [PATCH 01/20] Docs, atom En, others --- .gitignore | 1 + docs/_overrides/main.html | 46 ++ docs/css/custom.css | 33 + docs/datasets.md | 27 + docs/index.md | 30 + env.yml | 32 +- mkdocs.yml | 69 ++ src/openqdc/datasets/molecule3d.py | 4 +- src/openqdc/datasets/spice.py | 10 +- src/openqdc/utils/atomization_energies.py | 864 ++++++++++++++++++++++ tests/test_dummy.py | 2 + 11 files changed, 1092 insertions(+), 26 deletions(-) create mode 100644 docs/_overrides/main.html create mode 100644 docs/css/custom.css create mode 100644 docs/datasets.md create mode 100644 docs/index.md create mode 100644 mkdocs.yml create mode 100644 src/openqdc/utils/atomization_energies.py create mode 100644 tests/test_dummy.py diff --git a/.gitignore b/.gitignore index 5d3cc32..02443b8 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,4 @@ nohup.out *.csv *.txt *.sh +.DS_Store diff --git a/docs/_overrides/main.html b/docs/_overrides/main.html new file mode 100644 index 0000000..2eafd76 --- /dev/null +++ b/docs/_overrides/main.html @@ -0,0 +1,46 @@ +{% extends "base.html" %} + +{% block content %} +{{ super() }} + + +{% endblock content %} diff --git a/docs/css/custom.css b/docs/css/custom.css new file mode 100644 index 0000000..65db8ea --- /dev/null +++ b/docs/css/custom.css @@ -0,0 +1,33 @@ +/* Indentation. */ +div.doc-contents:not(.first) { + padding-left: 25px; + border-left: 4px solid rgba(230, 230, 230); + margin-bottom: 80px; + } + + /* Don't capitalize names. */ + h5.doc-heading { + text-transform: none !important; + } + + /* Don't use vertical space on hidden ToC entries. */ + .hidden-toc::before { + margin-top: 0 !important; + padding-top: 0 !important; + } + + /* Don't show permalink of hidden ToC entries. */ + .hidden-toc a.headerlink { + display: none; + } + + /* Avoid breaking parameters name, etc. in table cells. 
*/ + td code { + word-break: normal !important; + } + + /* For pieces of Markdown rendered in table cells. */ + td p { + margin-top: 0 !important; + margin-bottom: 0 !important; + } diff --git a/docs/datasets.md b/docs/datasets.md new file mode 100644 index 0000000..a2323fb --- /dev/null +++ b/docs/datasets.md @@ -0,0 +1,27 @@ +# Overview of Datasets + + + +We provide support for the following publicly available QM Datasets. + +| Dataset | # Molecules | # Conformers | Average Conformers per Molecule | Force Labels | Atom Types | QM Level of Theory | Off-Equilibrium Conformations| +| --- | --- | --- | --- | --- | --- | --- | --- | +| [GEOM](https://www.nature.com/articles/s41597-022-01288-4) | 450,000 | 37,000,000 | 82 | No | 18 | GFN2-xTB | No | +| [Molecule3D](https://arxiv.org/abs/2110.01717) | 3,899,647 | 3,899,647 | 1 | No | 5 | B3LYP/6-31G* | No | +| [NablaDFT](https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D) | 1,000,000 | 5,000,000 | 5 | No | 6 | ωB97X-D/def2-SVP | | +| [QMugs](https://www.nature.com/articles/s41597-022-01390-7) | 665,000 | 2,000,000 | 3 | No | 10 | GFN2-xTB, ωB97X-D/def2-SVP | No | +| [Spice](https://arxiv.org/abs/2209.10702) | 19,238 | 1,132,808 | 59 | Yes | 15 | ωB97M-D3(BJ)/def2-TZVPPD | Yes | +| [ANI](https://pubs.rsc.org/en/content/articlelanding/2017/SC/C6SC05720A) | 57,462 | 20,000,000 | 348 | No | 4 | ωB97x:6-31G(d) | Yes | +| [tmQM](https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041) | 86,665 | | | No | | TPSSh-D3BJ/def2-SVP | | +| [DES370K](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 370,000 | 100 | No | 20 | CCSD(T) | Yes | +| [DES5M](https://www.nature.com/articles/s41597-021-00833-x) | 3,700 | 5,000,000 | 1351 | No | 20 | SNS-MP2 | Yes | +| [OrbNet Denali](https://arxiv.org/abs/2107.00299) | 212,905 | 2,300,000 | 11 | No | 16 | GFN1-xTB | Yes | +| [SN2RXN](https://pubs.acs.org/doi/10.1021/acs.jctc.9b00181) | 39 | 452709 | 11,600 | Yes | 6 | DSD-BLYP-D3(BJ)/def2-TZVP | | +| 
[QM7X](https://www.nature.com/articles/s41597-021-00812-2) | 6,950 | 4,195,237 | 603 | Yes | 7 | PBE0+MBD | Yes | diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..264211f --- /dev/null +++ b/docs/index.md @@ -0,0 +1,30 @@ +# openQDC + +Open Quantum Data Commons + +## Setup Datasets + +Use the scripts in `setup/` to download the datasets. For more information, see the [README](setup/README.md) in the `setup/` directory. + +## Install the library in dev mode +```bash +# Install the deps +mamba env create -n qdc -f env.yml + +# Activate the environment +mamba activate qdc + +# Install the qdc library in dev mode +pip install -e . + +``` + +## Development lifecycle + +### Tests + +You can run tests locally with: + +```bash +pytest . +``` diff --git a/env.yml b/env.yml index 932af25..b5ccda6 100644 --- a/env.yml +++ b/env.yml @@ -28,25 +28,12 @@ dependencies: - ase # ML - - e3nn =0.5.1 - - einops =0.6.0 - - pytorch =2.0.0 - - lightning =2.0.4 - - torchmetrics =0.11.4 - - tensorboard =2.11.2 - - umap-learn =0.5.3 - - pytorch_geometric >=2.3.1 - - pytorch_sparse >=0.6.17 - - pytorch_cluster >=1.6 - - pytorch_scatter >=2.1 - - torch-ema + #- einops =0.6.0 + - pytorch # other stuffs - h5py >=3.8.0 - - omegaconf #==2.3.0 - gdown #==4.6.4 - - hydra-core #==1.3.1 - - wandb #==0.13.10 # Viz - matplotlib @@ -63,7 +50,14 @@ dependencies: - pre-commit - ruff - ipykernel - - pydantic <= 2.0 - - - pip: - - torch-nl + - isort + + # Doc + - mkdocs + - mkdocs-material + - mkdocs-material-extensions + - mkdocstrings + - mkdocs-click + - mkdocs-jupyter + - markdown-include + - mdx_truly_sane_lists diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..7041169 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,69 @@ +site_name: "Open Quantum Data Commons (openQDC)" +site_description: "I don't know... 
Something about data and Quantum stuff I guess :D" +site_url: "https://github.com/OpenDrugDiscovery/openQDC" +repo_url: "https://github.com/OpenDrugDiscovery/openQDC" +repo_name: "openQDC" +copyright: Copyright 2023 Valence Labs + +remote_branch: "privpage" +use_directory_urls: false +docs_dir: "docs" + +nav: + - Overview: index.md + - Available Datasets: datasets.md +theme: + name: material + custom_dir: docs/_overrides + palette: + primary: teal + accent: purple + features: + - navigation.expand + +extra_css: + - css/custom.css + +extra_javascript: + - javascripts/config.js + - https://polyfill.io/v3/polyfill.min.js?features=es6 + - https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + +markdown_extensions: + - admonition + - markdown_include.include + - pymdownx.emoji + - pymdownx.highlight + - pymdownx.magiclink + - pymdownx.superfences + - pymdownx.tabbed + - pymdownx.tasklist + # For `tab_length=2` in the markdown extension + # See https://github.com/mkdocs/mkdocs/issues/545 + - mdx_truly_sane_lists + - mkdocs-click + - attr_list + - md_in_html + - toc: + permalink: true + +plugins: + - search + - mkdocstrings: + watch: + - src/ + handlers: + python: + setup_commands: + - import sys + - sys.path.append("docs") + - sys.path.append("src") + selection: + new_path_syntax: yes + rendering: + show_root_heading: yes + heading_level: 3 + show_if_no_docstring: true + - mkdocs-jupyter: + execute: False + # kernel_name: python3 diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index 150672a..e2ff212 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -8,7 +8,7 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER +from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import get_atomic_number_and_charge @@ -16,7 +16,7 @@ def read_mol(mol, energy): smiles = dm.to_smiles(mol, 
explicit_hs=False) # subset = dm.to_smiles(dm.to_scaffold_murcko(mol, make_generic=True), explicit_hs=False) x = get_atomic_number_and_charge(mol) - positions = mol.GetConformer().GetPositions() * BOHR2ANG + positions = mol.GetConformer().GetPositions() res = dict( name=np.array([smiles]), diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index aa12843..4c03050 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -6,7 +6,7 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils import load_hdf5_file -from openqdc.utils.constants import BOHR2ANG, MAX_ATOMIC_NUMBER +from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import get_atomic_number_and_charge @@ -15,13 +15,13 @@ def read_record(r): subset = r["subset"][0].decode("utf-8") n_confs = r["conformations"].shape[0] x = get_atomic_number_and_charge(dm.to_mol(smiles, add_hs=True)) - positions = r["conformations"][:] * BOHR2ANG + positions = r["conformations"][:] res = dict( smiles=np.array([smiles] * n_confs), subset=np.array([Spice.subset_mapping[subset]] * n_confs), energies=r[Spice.energy_target_names[0]][:][:, None].astype(np.float32), - forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1) / BOHR2ANG, + forces=r[Spice.force_target_names[0]][:].reshape(-1, 3, 1), atomic_inputs=np.concatenate( (x[None, ...].repeat(n_confs, axis=0), positions), axis=-1, dtype=np.float32 ).reshape(-1, 5), @@ -36,8 +36,8 @@ class Spice(BaseDataset): __energy_methods__ = ["wb97x/def2-tzvp"] __force_methods__ = ["wb97x/def2-tzvp"] __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" + __distance_unit__ = "bohr" + __forces_unit__ = "hartree/bohr" energy_target_names = ["dft_total_energy"] diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py new file mode 100644 index 0000000..01d749a --- /dev/null +++ b/src/openqdc/utils/atomization_energies.py @@ -0,0 
+1,864 @@ +ORBENT_DENALI = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" + +# Energy in atomic unit/ Hartree + +SPICE = { + ("Br", -1): -2574.2451510945853, + ("Br", 0): -2574.1167240829964, + ("C", -1): -37.91424135791358, + ("C", 0): -37.87264507233593, + ("C", 1): -37.45349214963933, + ("Ca", 2): -676.9528465198214, + ("Cl", -1): -460.3350243496703, + ("Cl", 0): -460.1988762285739, + ("F", -1): -99.91298732343974, + ("F", 0): -99.78611622985483, + ("H", 0): -0.4987605100487341, + ("I", -1): -297.8813829975981, + ("I", 0): -297.76228914445625, + ("K", 1): -599.8025677513111, + ("Li", 1): -7.285254714046546, + ("Mg", 2): -199.2688420040449, + ("N", -1): -54.602291095426494, + ("N", 0): -54.62327513368922, + ("N", 1): -54.08594142587869, + ("Na", 1): -162.11366478783253, + ("O", -1): -75.17101657391741, + ("O", 0): -75.11317840410095, + ("O", 1): -74.60241514396725, + ("P", 0): -341.3059197024934, + ("P", 1): -340.9258392474849, + ("S", -1): -398.2405387031612, + ("S", 0): -398.1599636677874, + ("S", 1): -397.7746615977658, +} + + +GFN1 = { + ("H", 0): -0.4014294744618301, + ("Li", 1): 0.13691666666666666, + ("B", -3): -1.652343221335327, + ("B", -1): -1.3514075648859643, + ("B", 0): -1.1998696279038876, + ("B", 3): 2.7107996287190113, + ("C", -1): -1.9170116002810327, + ("C", 0): -1.7411359557542052, + ("C", 1): -1.1060742863488982, + ("N", -1): -3.128423313087365, + ("N", 0): -2.8988862104065958, + ("N", 1): -2.1782414865973068, + ("O", -1): -4.705386032968986, + ("O", 0): -4.352652340864803, + ("O", 1): -3.3929027848641797, + ("F", -1): -5.322297034311178, + ("F", 0): -4.9969448424630265, + ("Na", 1): 0.12295400000000001, + ("Mg", 2): 1.0016353333333334, + ("Si", 4): 5.448927240930351, + ("Si", 0): -1.625263132618416, + ("Si", -4): -4.503876330547808, + ("P", 0): -2.4250620380497385, + ("P", 1): -1.7319786163576927, + ("S", -1): -3.761566793286506, + ("S", 0): -3.535920743315634, + ("S", 1): -2.772567335542398, + 
("Cl", -1): -4.527948236258716, + ("Cl", 0): -4.166353944016668, + ("K", 1): 0.08160976666666667, + ("Ca", 2): 0.5662308, + ("Br", -1): -3.957113536482028, + ("Br", 0): -3.818039553459528, + ("I", -1): -4.043592677461303, + ("I", 0): -3.885757275227844, +} +GFN2 = { + ("H", 0): -0.3934827590437188, + ("Li", 1): 0.1659637, + ("B", -3): 0.4947743711421284, + ("B", -1): -0.8833252789733281, + ("B", 0): -0.9524366145568732, + ("B", 3): 2.886742362272, + ("C", -1): -1.9209221941523813, + ("C", 0): -1.7951105194038206, + ("C", 1): -1.7951105194038206, + ("N", -1): -2.8228473813671173, + ("N", 0): -2.609452454632062, + ("N", 1): -1.9127945803017519, + ("O", -1): -4.0689442489122944, + ("O", 0): -3.769421095414337, + ("O", 1): -2.948538063156781, + ("F", -1): -4.909635517185826, + ("F", 0): -4.619339955465996, + ("Na", 1): 0.19548556666666667, + ("Mg", 2): 1.3160877333333334, + ("Si", 4): 4.473259319583333, + ("Si", 0): -1.5714240856447492, + ("Si", -4): -1.0243162958137662, + ("P", 0): -2.377807088085606, + ("P", 1): -1.8635041144652795, + ("S", -1): -3.4046900452338025, + ("S", 0): -3.1482710158768508, + ("S", 1): -2.5869831371080387, + ("Cl", -1): -4.785133953760966, + ("Cl", 0): -4.482525134292114, + ("K", 1): 0.19157049999999998, + ("Ca", 2): 1.1759288, + ("Br", -1): -4.332231166471951, + ("Br", 0): -4.048339370569741, + ("I", -1): -4.060355599036047, + ("I", 0): -3.7796302627467933, +} + +DFTB = { + ("H", 0): -0.2386004000, + ("Li", 1): 0.000000000, + ("B", -3): 0.1087536003, + ("B", -1): -0.8108828001, + ("B", 0): -0.8263560001, + ("B", 3): 1.3330350000, + ("C", -1): -1.4104987700, + ("C", 0): -1.3984936602, + ("C", 1): -1.0217885507, + ("N", -1): -2.1474619199, + ("N", 0): -2.1021839400, + ("N", 1): -1.6260059609, + ("O", -1): -3.1706232699, + ("O", 0): -3.0861916005, + ("O", 1): -2.5063599300, + ("F", -1): -4.3647240000, + ("F", 0): -4.2352190003, + ("Na", 1): 0.0825500000, + ("Mg", 2): 0.4492000000, + ("Si", 4): 0.2875390800, + ("Si", 0): -1.0920777201, + ("Si", 
-4): 1.9808720000, + ("P", 0): -1.6295741400, + ("P", 1): -1.2821088196, + ("S", -1): -2.3857500900, + ("S", 0): -2.2921235603, + ("S", 1): -1.8696970300, + ("Cl", -1): -3.2238180000, + ("Cl", 0): -3.0908230002, + ("K", 1): 0.0678210000, + ("Ca", 2): 0.3528980000, + ("Br", -1): -3.0478250000, + ("Br", 0): -2.9228540002, + ("I", -1): -2.6981275000, + ("I", 0): -2.5796080002, +} + +# PM6 is in kcal/mol need to change it to hartree +PM6 = { + ("H", 0): 0.08302988483033709, + ("Li", 1): 0.23429648020984556, + ("B", -3): 1.042845967149475, + ("B", -1): 0.2915413006028599, + ("B", 0): 0.2162518784591137, + ("B", 3): 2.036692812374006, + ("C", -1): 0.3702885058222273, + ("C", 0): 0.34355728762455995, + ("C", 1): 0.5942116527412356, + ("N", -1): 0.29851662685316066, + ("N", 0): 0.3266578327960236, + ("N", 1): 0.8167661499675701, + ("O", -1): 0.06245921572439598, + ("O", 0): 0.2760200570828466, + ("O", 1): 0.6881966155067099, + ("F", -1): -0.09819551592088718, + ("F", 0): 0.030103153898987902, + ("Na", 1): 0.20761332506784766, + ("Mg", 2): 0.8654790767941177, + ("Si", 4): 2.6874249452995893, + ("Si", 0): 0.19559781612694002, + ("Si", -4): 0.909424581958187, + ("P", 0): 0.1881765839215055, + ("P", 1): 0.5283679118546506, + ("S", -1): 0.00773920374050412, + ("S", 0): 0.15340740929612162, + ("S", 1): 0.5198027279290017, + ("Cl", -1): -0.09598933242391743, + ("Cl", 0): 0.04614458119325779, + ("K", 1): 0.17382321209735638, + ("Ca", 2): 0.6490542924483952, + ("Br", -1): -0.0878626123290662, + ("Br", 0): 0.04068832478896717, + ("I", -1): -0.06868953273976947, + ("I", 0): 0.038916541436059084, +} + + +ISOLATED_ATOM_ENERGIES = { + "wb97m-d3(bj)": {"def2-tzvp": SPICE}, + "wb97x-d3": {"def2-tzvp": ORBENT_DENALI}, + "gfn2": GFN2, + "gfn1": GFN1, + "dftb3": DFTB, +} + +CHARGES_LOOKUP = { + "Br": [-1, 0], + "C": [-1, 0, 1], + "B": [-3, -1, 0, 3], + "Ca": [2], + "Cl": [-1, 0], + "F": [-1, 0], + "H": [0], + "I": [-1, 0], + "K": [1], + "Li": [1], + "Mg": [2], + "N": [-1, 0, 1], + "Na": [1], 
+ "O": [-1, 0, 1], + "P": [0, 1], + "S": [-1, 0, 1], + "Si": [+4, 0, -4], +} + + +# "tpssh/def2-tzvp" +bas1 = { + ("H", 0): -0.4998936035891093, + ("Li", 1): -7.285942861425713, + ("B", -3): -24.011884397333016, + ("B", -1): -24.671478908940745, + ("B", 0): -24.66555991803692, + ("B", 3): -22.03729209090186, + ("C", -1): -37.902383828698945, + ("C", 0): -37.8619600939805, + ("C", 1): -37.44108173595555, + ("N", -1): -54.58878376740317, + ("N", 0): -54.61011499135528, + ("N", 1): -54.07150720832228, + ("O", -1): -75.12797596615384, + ("O", 0): -75.0993524949928, + ("O", 1): -74.58770047919643, + ("F", -1): -99.86387164958151, + ("F", 0): -99.76596802854195, + ("Na", 1): -162.0916076478938, + ("Mg", 2): -199.24528576913457, + ("Si", 4): -285.59703939232946, + ("Si", 0): -289.3842044105128, + ("Si", -4): -288.1798768489279, + ("P", 0): -341.2798907965112, + ("P", 1): -340.89320025019333, + ("S", -1): -398.19525449701325, + ("S", 0): -398.130358877624, + ("S", 1): -397.7467993687058, + ("Cl", -1): -460.28412127843484, + ("Cl", 0): -460.1641720279233, + ("K", 1): -599.7644436257333, + ("Ca", 2): -676.9154959968483, + ("Br", -1): -2574.1448096288846, + ("Br", 0): -2574.0232838745055, + ("I", -1): -297.70580680306847, + ("I", 0): -297.5887657326151, +} +# "wb97m-d3bj/def2-TZVPPD" +bas2 = { + ("H", 0): -0.4987605100487541, + ("Li", 1): -7.285254714046117, + ("B", -3): -24.191211616488623, + ("B", -1): -24.677421752607636, + ("B", 0): -24.671520535412856, + ("B", 3): -22.051237471894204, + ("C", -1): -37.914241357934024, + ("C", 0): -37.872645072317844, + ("C", 1): -37.45349214963851, + ("N", -1): -54.602291095940885, + ("N", 0): -54.62327513391132, + ("N", 1): -54.08594142612827, + ("O", -1): -75.17101657361833, + ("O", 0): -75.11317840403545, + ("O", 1): -74.6024151438455, + ("F", -1): -99.9129873233742, + ("F", 0): -99.78611622966918, + ("Na", 1): -162.11366478753402, + ("Mg", 2): -199.26884200420963, + ("Si", 4): -285.6283113353237, + ("Si", 0): -289.413135230185, + 
("Si", -4): -288.27589059244787, + ("P", 0): -341.3059197004091, + ("P", 1): -340.92583924542475, + ("S", -1): -398.24053870171247, + ("S", 0): -398.15996366615616, + ("S", 1): -397.7746615960709, + ("Cl", -1): -460.33502435018204, + ("Cl", 0): -460.1988762286936, + ("K", 1): -599.8025677532396, + ("Ca", 2): -676.9528465165403, + ("Br", -1): -2574.2451510820465, + ("Br", 0): -2574.1167240800246, + ("I", -1): -297.88138299501395, + ("I", 0): -297.7622891423178, +} +# "revpbe-d3(bj)/def2-tzvp" +bas3 = { + ("H", 0): -0.5041476427597161, + ("Li", 1): -7.280731201437635, + ("B", -3): -24.006372610643076, + ("B", -1): -24.660992037766704, + ("B", 0): -24.652853868669744, + ("B", 3): -22.023688582481086, + ("C", -1): -37.88698396215454, + ("C", 0): -37.845600548516586, + ("C", 1): -37.42375720909004, + ("N", -1): -54.56844448819074, + ("N", 0): -54.58772405988695, + ("N", 1): -54.04957647943518, + ("O", -1): -75.10545816278959, + ("O", 0): -75.07120398742593, + ("O", 1): -74.55841255571633, + ("F", -1): -99.83653702337733, + ("F", 0): -99.7348800787186, + ("Na", 1): -162.04202541023028, + ("Mg", 2): -199.1857779742493, + ("Si", 4): -285.5196533711662, + ("Si", 0): -289.31537776907356, + ("Si", -4): -288.11458640061954, + ("P", 0): -341.20094262951534, + ("P", 1): -340.81665455610573, + ("S", -1): -398.10497764958086, + ("S", 0): -398.04159371790865, + ("S", 1): -397.6599146755941, + ("Cl", -1): -460.1836953722962, + ("Cl", 0): -460.0661711540315, + ("K", 1): -599.6472569880391, + ("Ca", 2): -676.7916386065199, + ("Br", -1): -2574.0081469191155, + ("Br", 0): -2573.890240418883, + ("I", -1): -297.8357436124949, + ("I", 0): -297.72268439613055, +} +# "DSD-BLYP-D3BJ/def2-TZVPPD" +bas3 = { + ("H", 0): -0.4990585651127987, + ("Li", 1): -7.2751828330696995, + ("B", -3): -24.127790514752746, + ("B", -1): -24.62825292497449, + ("B", 0): -24.628518170377323, + ("B", 3): -22.01440439226537, + ("C", -1): -37.85187643574064, + ("C", 0): -37.81800653654633, + ("C", 1): 
-37.4026616247957, + ("N", -1): -54.529773519860626, + ("N", 0): -54.55929475542038, + ("N", 1): -54.02654716655024, + ("O", -1): -75.08730105751656, + ("O", 0): -75.03632370546934, + ("O", 1): -74.53620016366052, + ("F", -1): -99.82374475663487, + ("F", 0): -99.6990797359127, + ("Na", 1): -161.96633141740327, + ("Mg", 2): -199.1186151803418, + ("Si", 4): -285.4592439444118, + ("Si", 0): -289.2354767511652, + ("Si", -4): -288.12487758144147, + ("P", 0): -341.1278868392075, + ("P", 1): -340.7469511203367, + ("S", -1): -398.0441756257772, + ("S", 0): -397.9705195592595, + ("S", 1): -397.5944122508692, + ("Cl", -1): -460.13181548141955, + ("Cl", 0): -460.0006937311494, + ("K", 1): -599.4901238823808, + ("Ca", 2): -676.6456698988475, + ("Br", -1): -2573.604327011817, + ("Br", 0): -2573.477602568216, + ("I", -1): -297.5733470600828, + ("I", 0): -297.4541938789708, +} +# "b3lyp/6-31g*" +bas4 = { + ("H", 0): -0.5002733301377901, + ("Li", 1): -7.284546111273075, + ("B", -3): -23.577268753399462, + ("B", -1): -24.614577395156598, + ("B", 0): -24.65435524492553, + ("B", 3): -22.018169862974275, + ("C", -1): -37.844269871879376, + ("C", 0): -37.84628033285479, + ("C", 1): -37.42731164237431, + ("N", -1): -54.52864356359092, + ("N", 0): -54.584488815424095, + ("N", 1): -54.0458621835885, + ("O", -1): -75.05272792994404, + ("O", 0): -75.06062109946738, + ("O", 1): -74.54659271939704, + ("F", -1): -99.75408410035712, + ("F", 0): -99.71553471526475, + ("Na", 1): -162.081235395777, + ("Mg", 2): -199.22734695613283, + ("Si", 4): -285.5564410277949, + ("Si", 0): -289.3717359984153, + ("Si", -4): -288.02795351148654, + ("P", 0): -341.2580911838578, + ("P", 1): -340.8765976669208, + ("S", -1): -398.16568433994024, + ("S", 0): -398.1049932797066, + ("S", 1): -397.7199808615457, + ("Cl", -1): -460.25223446009306, + ("Cl", 0): -460.13624346967765, + ("K", 1): -599.7247062673807, + ("Ca", 2): -676.8667395990246, + ("Br", -1): -2573.824201570383, + ("Br", 0): -2573.705283744811, + ("I", 
-1): None, + ("I", 0): None, +} +# "wb97x-d3/def2-tzvp" +bas3 = { + ("H", 0): -0.5025865385814652, + ("Li", 1): -7.289728176048259, + ("B", -3): -23.984063702375366, + ("B", -1): -24.655892805089884, + ("B", 0): -24.652426319775287, + ("B", 3): -22.068923453406843, + ("C", -1): -37.88249635015094, + ("C", 0): -37.84495506623085, + ("C", 1): -37.42572594563294, + ("N", -1): -54.566013571722955, + ("N", 0): -54.58956332659741, + ("N", 1): -54.053510120855016, + ("O", -1): -75.10770262264376, + ("O", 0): -75.07371685344017, + ("O", 1): -74.56770852466894, + ("F", -1): -99.84730255807874, + ("F", 0): -99.74441357744517, + ("Na", 1): -162.08090997566165, + ("Mg", 2): -199.2423311291131, + ("Si", 4): -285.61307018231093, + ("Si", 0): -289.36007009205474, + ("Si", -4): -288.13938913442, + ("P", 0): -341.2535866489386, + ("P", 1): -340.8713081439191, + ("S", -1): -398.17523835330115, + ("S", 0): -398.1081144325829, + ("S", 1): -397.7235371215097, + ("Cl", -1): -460.26962615981756, + ("Cl", 0): -460.1472726772528, + ("K", 1): -599.7560426196044, + ("Ca", 2): -676.9122500284535, + ("Br", -1): -2574.293316484485, + ("Br", 0): -2574.1721188129304, + ("I", -1): -297.8647496186801, + ("I", 0): -297.7482461760336, +} +# "wb97x-d/def2-svp" +bas3 = { + ("H", 0): -0.5024927493280441, + ("Li", 1): -7.289461512680954, + ("B", -3): -23.76326340520956, + ("B", -1): -24.616565541453497, + ("B", 0): -24.62229041950939, + ("B", 3): -22.05799995059738, + ("C", -1): -37.819977678758974, + ("C", 0): -37.79809943233551, + ("C", 1): -37.37569908192604, + ("N", -1): -54.459277717462086, + ("N", 0): -54.522416758144296, + ("N", 1): -53.98339066860825, + ("O", -1): -74.96664546628877, + ("O", 0): -74.97667950172594, + ("O", 1): -74.47138898492452, + ("F", -1): -99.66683980036512, + ("F", 0): -99.61447206028255, + ("Na", 1): -162.0226698276339, + ("Mg", 2): -199.1739400418112, + ("Si", 4): -285.52441678317916, + ("Si", 0): -289.2630396380861, + ("Si", -4): -287.76522279776617, + ("P", 0): 
-341.13939934765074, + ("P", 1): -340.75715448577955, + ("S", -1): -398.0129589348639, + ("S", 0): -397.9719510287289, + ("S", 1): -397.58695970543334, + ("Cl", -1): -460.0809386171713, + ("Cl", 0): -459.9885726673416, + ("K", 1): -599.6772169304438, + ("Ca", 2): -676.8244048230532, + ("Br", -1): -2573.9600885084546, + ("Br", 0): -2573.856581446253, + ("I", -1): -297.8445820598362, + ("I", 0): -297.7376955031015, +} +# "wb97x/6-31g(d)" +bas3 = { + ("H", 0): -0.4993457316092281, + ("Li", 1): -7.2856300653219614, + ("B", -3): -23.575157416550805, + ("B", -1): -24.603134775026213, + ("B", 0): -24.642610267398982, + ("B", 3): -22.07124234970699, + ("C", -1): -37.834042127064706, + ("C", 0): -37.83384116353608, + ("C", 1): -37.41881056856161, + ("N", -1): -54.513028620185864, + ("N", 0): -54.573313922039716, + ("N", 1): -54.036340248157515, + ("O", -1): -75.03386211245754, + ("O", 0): -75.04249624495868, + ("O", 1): -74.53884510892807, + ("F", -1): -99.7350451879463, + ("F", 0): -99.69494212517318, + ("Na", 1): -162.0682250235374, + ("Mg", 2): -199.22919949102433, + ("Si", 4): -285.5967323489095, + ("Si", 0): -289.3398443488577, + ("Si", -4): -288.0053873657048, + ("P", 0): -341.2319240654614, + ("P", 1): -340.85012602930203, + ("S", -1): -398.14261145000256, + ("S", 0): -398.0814606242194, + ("S", 1): -397.6998359561112, + ("Cl", -1): -460.2341096421279, + ("Cl", 0): -460.1166957612669, + ("K", 1): -599.7184666927276, + ("Ca", 2): -676.8704088358037, + ("Br", -1): -2573.8502718776604, + ("Br", 0): -2573.733913792756, + ("I", -1): None, + ("I", 0): None, +} +# "WB97X/6-31g*" +bas3 = { + ("H", 0): -0.4993457316092281, + ("Li", 1): -7.285630065321961, + ("B", -3): -23.5751574165508, + ("B", -1): -24.603134775026216, + ("B", 0): -24.64261026739898, + ("B", 3): -22.071242349706992, + ("C", -1): -37.834042127064706, + ("C", 0): -37.83384116353608, + ("C", 1): -37.4188105685616, + ("N", -1): -54.5130286201859, + ("N", 0): -54.57331392203972, + ("N", 1): -54.03634024815754, + 
("O", -1): -75.03386211245756, + ("O", 0): -75.0424962449587, + ("O", 1): -74.5388451089281, + ("F", -1): -99.7350451879463, + ("F", 0): -99.69494212517317, + ("Na", 1): -162.06822502353745, + ("Mg", 2): -199.2291994910244, + ("Si", 4): -285.5967323489095, + ("Si", 0): -289.3398443488578, + ("Si", -4): -288.00538736570485, + ("P", 0): -341.2319240654613, + ("P", 1): -340.85012602930215, + ("S", -1): -398.14261145000256, + ("S", 0): -398.0814606242193, + ("S", 1): -397.6998359561114, + ("Cl", -1): -460.23410964212803, + ("Cl", 0): -460.1166957612671, + ("K", 1): -599.7184666927277, + ("Ca", 2): -676.8704088358036, + ("Br", -1): -2573.8502718776604, + ("Br", 0): -2573.7339137927547, + ("I", -1): None, + ("I", 0): None, +} +# "ccsd/aug-cc-pVDZ" +bas3 = { + ("H", 0): -0.49933431543958506, + ("Li", 1): -7.23623079003172, + ("B", -3): -24.135298809957895, + ("B", -1): -24.595731151135812, + ("B", 0): -24.591070884515084, + ("B", 3): -21.985913735106703, + ("C", -1): -37.80520563794191, + ("C", 0): -37.76484921430014, + ("C", 1): -37.35862660518426, + ("N", -1): -54.46561904421205, + ("N", 0): -54.48723914213882, + ("N", 1): -53.959899854043286, + ("O", -1): -74.96558003564495, + ("O", 0): -74.9255348291028, + ("O", 1): -74.4432579985748, + ("F", -1): -99.66462266282274, + ("F", 0): -99.54960172383534, + ("Na", 1): -161.67194573263333, + ("Mg", 2): -198.8268633109654, + ("Si", 4): -285.1795420310209, + ("Si", 0): -288.9225171059681, + ("Si", -4): -288.13012523255236, + ("P", 0): -340.80119511758613, + ("P", 1): -340.42190068851625, + ("S", -1): -397.67826887815926, + ("S", 0): -397.6146112492681, + ("S", 1): -397.2542253763525, + ("Cl", -1): -459.7398865093852, + ("Cl", 0): -459.6156482951034, + ("K", 1): None, + ("Ca", 2): None, + ("Br", -1): -2572.6265539931533, + ("Br", 0): -2572.5063313966352, + ("I", -1): None, + ("I", 0): None, +} +# "ccsd(t)/aug-cc-pVDZ" +bas3 = { + ("H", 0): -0.4993343154395853, + ("Li", 1): -7.236230790031718, + ("B", -3): -24.14659676027675, + 
("B", -1): -24.59834841644963, + ("B", 0): -24.592013924578307, + ("B", 3): -21.98591373510674, + ("C", -1): -37.80822234639533, + ("C", 0): -37.7661399495972, + ("C", 1): -37.3593489962868, + ("N", -1): -54.46970203317129, + ("N", 0): -54.488530163663306, + ("N", 1): -53.96079905255966, + ("O", -1): -74.97107484978555, + ("O", 0): -74.92736838177342, + ("O", 1): -74.44405741349318, + ("F", -1): -99.67058259815346, + ("F", 0): -99.55194323117622, + ("Na", 1): -161.67196199847683, + ("Mg", 2): -198.8269101640321, + ("Si", 4): -285.1796031904412, + ("Si", 0): -288.9239884021825, + ("Si", -4): -288.14250182593497, + ("P", 0): -340.80293105856066, + ("P", 1): -340.4231288782063, + ("S", -1): -397.68239119590464, + ("S", 0): -397.61679149962197, + ("S", 1): -397.2555638941634, + ("Cl", -1): -459.74421517568555, + ("Cl", 0): -459.6181191157645, + ("K", 1): None, + ("Ca", 2): None, + ("Br", -1): -2572.630606833861, + ("Br", 0): -2572.508930744571, + ("I", -1): None, + ("I", 0): None, +} +# "mp2/aug-cc-pVDZ" +bas3 = { + ("H", 0): -0.4993343154395852, + ("Li", 1): -7.2362434239942885, + ("B", -3): -24.11454063530035, + ("B", -1): -24.57403291869507, + ("B", 0): -24.568723938484855, + ("B", 3): -21.98592739023366, + ("C", -1): -37.78658968444089, + ("C", 0): -37.74289655875525, + ("C", 1): -37.33330128905729, + ("N", -1): -54.44347106000461, + ("N", 0): -54.46985977846849, + ("N", 1): -53.93770877612693, + ("O", -1): -74.95558042845218, + ("O", 0): -74.90882930239204, + ("O", 1): -74.42742702171483, + ("F", -1): -99.66810645703836, + ("F", 0): -99.5377379527871, + ("Na", 1): -161.67200581779124, + ("Mg", 2): -198.8269131203642, + ("Si", 4): -285.17950758651557, + ("Si", 0): -288.90336148257995, + ("Si", -4): -288.12382709478203, + ("P", 0): -340.78346939708916, + ("P", 1): -340.4015180393644, + ("S", -1): -397.6614469463811, + ("S", 0): -397.5953187556735, + ("S", 1): -397.236034450623, + ("Cl", -1): -459.7293671162834, + ("Cl", 0): -459.5986332871817, + ("K", 1): None, + 
("Ca", 2): None, + ("Br", -1): -2571.9455214335435, + ("Br", 0): -2571.8203622687925, + ("I", -1): None, + ("I", 0): None, +} +# "mp2/def2-TZVP" +bas3 = { + ("H", 0): -0.4998098322318883, + ("Li", 1): -7.26625465274989, + ("B", -3): -23.89130329586724, + ("B", -1): -24.58967154224317, + ("B", 0): -24.59074548143485, + ("B", 3): -21.99943494200725, + ("C", -1): -37.81110910609783, + ("C", 0): -37.77471406753249, + ("C", 1): -37.36120515772786, + ("N", -1): -54.474221753525356, + ("N", 0): -54.51486367243164, + ("N", 1): -53.97922862858532, + ("O", -1): -75.00152176187984, + ("O", 0): -74.97513105465687, + ("O", 1): -74.48759502971161, + ("F", -1): -99.73457909250294, + ("F", 0): -99.62808382176112, + ("Na", 1): -161.83073450947992, + ("Mg", 2): -198.9798405609494, + ("Si", 4): -285.26774080524564, + ("Si", 0): -289.0086162111446, + ("Si", -4): -287.737519515362, + ("P", 0): -340.89251993087385, + ("P", 1): -340.5074615537276, + ("S", -1): -397.7717421040001, + ("S", 0): -397.71573728264894, + ("S", 1): -397.34975334831165, + ("Cl", -1): -459.84969455647206, + ("Cl", 0): -459.7312731162239, + ("K", 1): -599.1623610013563, + ("Ca", 2): -676.3191334447123, + ("Br", -1): -2572.8329868011315, + ("Br", 0): -2572.7140648042205, + ("I", -1): -297.32915651116025, + ("I", 0): -297.2135511448063, +} +# "SVWN/def2-TZVP" +bas3 = { + ("H", 0): -0.4961415246858913, + ("Li", 1): -7.182160595407815, + ("B", -3): -23.858154175760482, + ("B", -1): -24.477102446655582, + ("B", 0): -24.446672986035107, + ("B", 3): -21.78388674779827, + ("C", -1): -37.648803413486476, + ("C", 0): -37.57960202253736, + ("C", 1): -37.13377025356311, + ("N", -1): -54.268858501552714, + ("N", 0): -54.264236284313675, + ("N", 1): -53.69660297293359, + ("O", -1): -74.75021611814427, + ("O", 0): -74.68022879998783, + ("O", 1): -74.14595350398997, + ("F", -1): -99.4308126971536, + ("F", 0): -99.2855801211432, + ("Na", 1): -161.43940087938617, + ("Mg", 2): -198.482989208704, + ("Si", 4): -284.6095063412437, + 
("Si", 0): None, + ("Si", -4): -287.36361152706985, + ("P", 0): -340.28781390909336, + ("P", 1): None, + ("S", -1): -396.74391290562517, + ("S", 0): -397.0472344910708, + ("S", 1): -396.6400428334645, + ("Cl", -1): -459.1427217366059, + ("Cl", 0): None, + ("K", 1): -598.3826110301004, + ("Ca", 2): -675.4148005786843, + ("Br", -1): -2571.43279407191, + ("Br", 0): None, + ("I", -1): -297.89817894897124, + ("I", 0): None, +} +# "PBE-D3BJ2B/def2-TZVP" +bas3 = { + ("H", 0): -0.49963874688778964, + ("Li", 1): -7.256644236856915, + ("B", -3): -23.965651173919607, + ("B", -1): -24.61987718656591, + ("B", 0): -24.610084509857693, + ("B", 3): -21.981186468975643, + ("C", -1): -37.839839802893856, + ("C", 0): -37.79597394493031, + ("C", 1): -37.37216480722536, + ("N", -1): -54.51524854184836, + ("N", 0): -54.53214830302369, + ("N", 1): -53.99133373760564, + ("O", -1): -75.04792601078884, + ("O", 0): -75.00968214869428, + ("O", 1): -74.49434051926339, + ("F", -1): -99.77558183886408, + ("F", 0): -99.6691400940838, + ("Na", 1): -161.96413737180777, + ("Mg", 2): -199.10001096170987, + ("Si", 4): -285.4180171255296, + ("Si", 0): -289.2228701070572, + ("Si", -4): -288.0227167833236, + ("P", 0): -341.1030537066697, + ("P", 1): -340.7177213193741, + ("S", -1): -398.00391422389356, + ("S", 0): -397.93836821335026, + ("S", 1): -397.5554184472038, + ("Cl", -1): -460.0784728779802, + ("Cl", 0): -459.9584144179813, + ("K", 1): -599.5277926006078, + ("Ca", 2): -676.665524794864, + ("Br", -1): -2573.8415230490864, + ("Br", 0): -2573.720729522128, + ("I", -1): -297.7815346863239, + ("I", 0): -297.66553802500096, +} +# "B3LYP-D3MBJ2B/def2-TZVP" +bas3 = { + ("H", 0): -0.5021763508982502, + ("Li", 1): -7.28605166725753, + ("B", -3): -24.00227248681287, + ("B", -1): -24.670150534162623, + ("B", 0): -24.66392221445664, + ("B", 3): -22.020454695632036, + ("C", -1): -37.89817823158867, + ("C", 0): -37.85948152785869, + ("C", 1): -37.43552078960403, + ("N", -1): -54.58873727556918, + ("N", 0): 
-54.60398141018468, + ("N", 1): -54.065523148633176, + ("O", -1): -75.13521710860505, + ("O", 0): -75.09628346877744, + ("O", 1): -74.57769937644677, + ("F", -1): -99.87634645410799, + ("F", 0): -99.77016379237457, + ("Na", 1): -162.09255440877646, + ("Mg", 2): -199.2394349246892, + ("Si", 4): -285.575845762374, + ("Si", 0): -289.3920722437195, + ("Si", -4): -288.17382798168956, + ("P", 0): -341.28064911053326, + ("P", 1): -340.89904032318145, + ("S", -1): -398.200223492228, + ("S", 0): -398.1324076067549, + ("S", 1): -397.7448455107872, + ("Cl", -1): -460.2889124003806, + ("Cl", 0): -460.16699382696663, + ("K", 1): -599.7602668684151, + ("Ca", 2): -676.9064118669689, + ("Br", -1): -2574.264312179195, + ("Br", 0): -2574.140975849301, + ("I", -1): -297.89704873064437, + ("I", 0): -297.7784640477503, +} +# "b3lyp/def2-TZVP" +bas3 = { + ("H", 0): -0.5021763508982502, + ("Li", 1): -7.2860516672575315, + ("B", -3): -24.002272486812885, + ("B", -1): -24.67015053416263, + ("B", 0): -24.663922214456655, + ("B", 3): -22.020454695632043, + ("C", -1): -37.89817823158866, + ("C", 0): -37.85948152785869, + ("C", 1): -37.435520789604034, + ("N", -1): -54.588737275569194, + ("N", 0): -54.603981410184666, + ("N", 1): -54.065523148633176, + ("O", -1): -75.13521710860508, + ("O", 0): -75.09628346877746, + ("O", 1): -74.57769937644687, + ("F", -1): -99.8763464541079, + ("F", 0): -99.7701637923746, + ("Na", 1): -162.0925544087764, + ("Mg", 2): -199.23943492468925, + ("Si", 4): -285.5758457623741, + ("Si", 0): -289.3920722437192, + ("Si", -4): -288.1738279816895, + ("P", 0): -341.28064911053326, + ("P", 1): -340.8990403231815, + ("S", -1): -398.2002234922283, + ("S", 0): -398.1324076067552, + ("S", 1): -397.744845510787, + ("Cl", -1): -460.28891240038075, + ("Cl", 0): -460.1669938269668, + ("K", 1): -599.7602668684153, + ("Ca", 2): -676.9064118669687, + ("Br", -1): -2574.264312179194, + ("Br", 0): -2574.140975849301, + ("I", -1): -297.8970487306444, + ("I", 0): -297.7784640477502, +} 
diff --git a/tests/test_dummy.py b/tests/test_dummy.py new file mode 100644 index 0000000..3887df5 --- /dev/null +++ b/tests/test_dummy.py @@ -0,0 +1,2 @@ +def test_this_file(): + print("this is a dummy test") From 9d61d0b625ce7a34d7e783befee44d6be1b5b8e1 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 20 Oct 2023 16:16:47 -0400 Subject: [PATCH 02/20] Tutorial --- docs/tutorials/usage.ipynb | 43 ++++++++++++++++++++++++++++++++++++ mkdocs.yml | 4 ++++ src/openqdc/datasets/base.py | 1 + 3 files changed, 48 insertions(+) create mode 100644 docs/tutorials/usage.ipynb diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb new file mode 100644 index 0000000..d813272 --- /dev/null +++ b/docs/tutorials/usage.ipynb @@ -0,0 +1,43 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Really Hard example\n", + "\n", + "## Instantiate and GO!\n", + "\n", + "If you don't have the dataset downloaded it will be downloaded automatically and cached. You just instantiate the class and you are ready to go." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from openqdc.datasets.geom import GEOM\n", + "ds = GEOM()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the item at index 0\n", + "\n", + "ds[0]" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/mkdocs.yml b/mkdocs.yml index 7041169..c159906 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,6 +12,8 @@ docs_dir: "docs" nav: - Overview: index.md - Available Datasets: datasets.md + - Tutorials: + - Really hard example: tutorials/usage.ipynb theme: name: material custom_dir: docs/_overrides @@ -19,8 +21,10 @@ theme: primary: teal accent: purple features: + - navigation.tabs - navigation.expand + extra_css: - css/custom.css diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 9e6f67e..dfe1be2 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -67,6 +67,7 @@ class BaseDataset(torch.utils.data.Dataset): __force_methods__ = [] energy_target_names = [] force_target_names = [] + # convert force gradient -1 __energy_unit__ = "hartree" __distance_unit__ = "bohr" From 494b28870e8ebde7405c39b5a0b0fd0d83ad49b3 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 20 Oct 2023 16:22:22 -0400 Subject: [PATCH 03/20] Clean --- docs/API/available_datasets.md | 3 +++ mkdocs.yml | 2 ++ 2 files changed, 5 insertions(+) create mode 100644 docs/API/available_datasets.md diff --git a/docs/API/available_datasets.md b/docs/API/available_datasets.md new file mode 100644 index 0000000..fa630b8 --- /dev/null +++ b/docs/API/available_datasets.md @@ -0,0 +1,3 @@ +# Available Datasets + +::: openqdc.datasets diff --git a/mkdocs.yml b/mkdocs.yml index c159906..9aca43c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -14,6 +14,8 @@ nav: - Available Datasets: datasets.md - Tutorials: - 
Really hard example: tutorials/usage.ipynb + - API: + - Datasets: API/available_datasets.md theme: name: material custom_dir: docs/_overrides From 3bf26d4558bf18f6a3152590d9a014e9ecff4711 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 24 Oct 2023 11:50:42 -0400 Subject: [PATCH 04/20] Isolated Atom En --- src/openqdc/datasets/ani.py | 32 +- src/openqdc/datasets/base.py | 11 + src/openqdc/datasets/comp6.py | 16 +- src/openqdc/datasets/dess.py | 16 +- src/openqdc/datasets/dummy.py | 47 ++ src/openqdc/datasets/gdml.py | 8 +- src/openqdc/datasets/iso_17.py | 4 +- src/openqdc/datasets/qm7x.py | 4 +- src/openqdc/datasets/sn2_rxn.py | 4 +- src/openqdc/datasets/spice.py | 4 +- src/openqdc/datasets/tmqm.py | 6 +- src/openqdc/datasets/waterclusters3_30.py | 6 +- src/openqdc/utils/atomization_energies.py | 831 ++++++++++++++++++++-- tests/test_dummy.py | 11 +- 14 files changed, 878 insertions(+), 122 deletions(-) create mode 100644 src/openqdc/datasets/dummy.py diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index c8c417c..49f5c1e 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -82,10 +82,10 @@ class ANI1CCX(ANI1): atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) __energy_methods__ = [ - "ccsd(t)_cbs", - "npno_ccsd(t)_dz", - "npno_ccsd(t)_tz", - "tpno_ccsd(t)_dz", + "ccsd(t)/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cc-pvtz", + "ccsd(t)/cc-pvdz", ] energy_target_names = [ @@ -98,8 +98,8 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] - def __init__(self) -> None: - super().__init__() + def __init__(self, energy_unit=None, distance_unit=None) -> None: + super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) class ANI1X(ANI1): @@ -124,14 +124,14 @@ class ANI1X(ANI1): atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) __energy_methods__ = [ - "hf_dz", - "hf_qz", - "hf_tz", - "mp2_dz", - "mp2_qz", - "mp2_tz", - "wb97x_6-31g(d)", - "wb97x_tz", + "hf/cc-pvdz", 
+ "hf/cc-pvqz", + "hf/cc-pvtz", + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "wb97x/6-31g(d)", + "wb97x/cc-pvtz", ] energy_target_names = [ @@ -155,8 +155,8 @@ class ANI1X(ANI1): "wb97x_tz", ] - def __init__(self) -> None: - super().__init__() + def __init__(self, energy_unit=None, distance_unit=None) -> None: + super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) if __name__ == "__main__": diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index dfe1be2..a9fb793 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -9,6 +9,7 @@ from sklearn.utils import Bunch from tqdm import tqdm +from openqdc.utils.atomization_energies import IsolatedAtomEnergyFactory from openqdc.utils.constants import NB_ATOMIC_FEATURES from openqdc.utils.io import ( copy_exists, @@ -67,6 +68,7 @@ class BaseDataset(torch.utils.data.Dataset): __force_methods__ = [] energy_target_names = [] force_target_names = [] + __isolated_atom_energies__ = [] # convert force gradient -1 __energy_unit__ = "hartree" @@ -84,6 +86,15 @@ def __init__(self, energy_unit=None, distance_unit=None) -> None: res = self.collate_list(entries) self.save_preprocess(res) self.read_preprocess() + self.compute_properties() + self.__isolated_atom_energies__ = ( + [IsolatedAtomEnergyFactory.get(en_method) for en_method in self.__energy_methods__] + if self.__energy_methods__ + else None + ) + + def compute_properties(): + pass @property def energy_unit(self): diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index 64181dd..0cba486 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -34,14 +34,14 @@ class COMP6(BaseDataset): __energy_methods__ = [ "wb97x/6-31g*", - "b3lyp-d3m(bj)_tz", - "b3lyp_tz", - "hf_tz", - "pbe-d3(bj)_dz", - "pbe_tz", - "svwm_tz", - "wb97m-d3(bj)_tz", - "wb97m_tz", + "b3lyp-d3mbj/def2-tzvp", + "b3lyp/def2-tzvp", + "hf/def2-tzvp", + "pbe-d3bj/def2-tzvp", + "pbe/def2-tzvp", + 
"svwn/def2-tzvp", + "wb97m-d3bj/def2-tzvp", + "wb97m/def2-tzvp", ] energy_target_names = [ diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 44094f5..827588b 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -36,14 +36,14 @@ def read_mol(mol_path, smiles, subset, targets): class DESS(BaseDataset): __name__ = "dess" __energy_methods__ = [ - "mp2_cc", - "mp2_qz", - "mp2_tz", - "mp2_cbs", - "ccsd(t)_cc", - "ccsd(t)_cbs", - "ccsd(t)_nn", - "sapt", + "mp2/cc-pvdz", + "mp2/cc-pvqz", + "mp2/cc-pvtz", + "mp2/cbs", + "ccsd(t)/cc-pvdz", + "ccsd(t)/cbs", # cbs + "ccsd(t)/nn", # nn + "sapt/aug-cc-pwcvxz", ] energy_target_names = [ diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py new file mode 100644 index 0000000..d3767fe --- /dev/null +++ b/src/openqdc/datasets/dummy.py @@ -0,0 +1,47 @@ +import numpy as np # noqa +from sklearn.utils import Bunch + +from openqdc.datasets.base import BaseDataset + + +class Dummy(BaseDataset): + """ + Dummy dataset + """ + + __name__ = "dummy" + __energy_methods__ = ["I_solved_the_schrodinger_equation_by_hand"] + __force_methods__ = ["I_made_up_random_forces"] + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" + + energy_target_names = ["energy"] + + force_target_names = ["forces"] + + def __init__(self, energy_unit=None, distance_unit=None) -> None: + try: + super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) + except: # noqa + pass + + def read_raw_entries(self): + pass + + def __len__(self): + return 999999999 + + def __getitem__(self, idx: int): + size = np.random.randint(1, 250) + z = np.random.randint(1, 100, size) + return Bunch( + positions=np.random.rand(size, 3) * 10, + atomic_numbers=z, + charges=np.random.randint(-1, 2, size), + e0=np.zeros(size), + energies=np.random.rand(1) * 100, + name="dummy_{}".format(idx), + subset="dummy", + forces=np.random.rand(size, 3) * 100, + ) diff --git 
a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index 8dd82eb..01ed166 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -38,9 +38,9 @@ class GDML(BaseDataset): __energy_methods__ = [ "ccsd/cc-pvdz", "ccsd(t)/cc-pvdz", - # "pbe+mbd/light", #MD22 + "pbe/mbd", # MD22 # "pbe+mbd/tight", #MD22 - "pbe+vdw-ts", # MD17 + "pbe/vdw-ts", # MD17 ] energy_target_names = [ @@ -52,9 +52,9 @@ class GDML(BaseDataset): __force_methods__ = [ "ccsd/cc-pvdz", "ccsd(t)/cc-pvdz", - # "pbe+mbd/light", #MD22 + "pbe/mbd", # MD22 # "pbe+mbd/tight", #MD22 - "pbe+vdw-ts", # MD17 + "pbe/vdw-ts", # MD17 ] force_target_names = [ diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 10ddf57..9cbf03b 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -30,7 +30,7 @@ class ISO17(BaseDataset): atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) __energy_methods__ = [ - "pbe+vdw-ts", + "pbe/vdw-ts", ] energy_target_names = [ @@ -38,7 +38,7 @@ class ISO17(BaseDataset): ] __force_methods__ = [ - "pbe+vdw-ts", + "pbe/vdw-ts", ] force_target_names = [ diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index 7357889..30fba50 100644 --- a/src/openqdc/datasets/qm7x.py +++ b/src/openqdc/datasets/qm7x.py @@ -39,11 +39,11 @@ class QM7X(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = ["pbe0+mbd", "dft3b+mbd"] + __energy_methods__ = ["pbe0/mbd", "dft3b"] energy_target_names = ["ePBE0", "eMBD"] - __force_methods__ = ["pbe0+mbd", "dft3b+mbd"] + __force_methods__ = ["pbe0/mbd", "dft3b"] force_target_names = ["pbe0FOR", "vdwFOR"] diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index 75715f0..f3977cc 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -13,7 +13,7 @@ class SN2RXN(BaseDataset): atomic_energies = 
np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) __energy_methods__ = [ - "dsd-blyp-d3(bj)_tz", + "dsd-blyp-d3(bj)/def2-tzvp", ] __energy_unit__ = "ev" __distance_unit__ = "ang" @@ -24,7 +24,7 @@ class SN2RXN(BaseDataset): ] __force_methods__ = [ - "dsd-blyp-d3(bj)_tz", + "dsd-blyp-d3(bj)/def2-tzvp", ] force_target_names = [ diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index cc075e2..af9aa8b 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -49,8 +49,8 @@ class Spice(BaseDataset): """ __name__ = "spice" - __energy_methods__ = ["wb97x/def2-tzvp"] - __force_methods__ = ["wb97x/def2-tzvp"] + __energy_methods__ = ["wb97m-d3bj/def2-tzvp"] + __force_methods__ = ["wb97m-d3bj/def2-tzvp"] __energy_unit__ = "hartree" __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 5d2f1d3..600113b 100644 --- a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -52,10 +52,14 @@ class TMQM(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = ["tpssh/def2tzvp"] + __energy_methods__ = ["tpssh/def2-tzvp"] energy_target_names = ["TPSSh/def2TZVP level"] + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + def __init__(self) -> None: super().__init__() diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/waterclusters3_30.py index 6b263fb..62e1d4a 100644 --- a/src/openqdc/datasets/waterclusters3_30.py +++ b/src/openqdc/datasets/waterclusters3_30.py @@ -53,9 +53,11 @@ class WaterClusters(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - # need to know where to find the data - __energy_methods__ = ["ttm2.1-f"] + __energy_unit__ = "kcal/mol" + __distance_unit__ = "ang" + __forces_unit__ = "kcal/mol/ang" + 
__energy_methods__ = ["ttm2.1-f"] energy_target_names = ["TTM2.1-F Potential"] def __init__(self, energy_unit=None, distance_unit=None) -> None: diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 01d749a..78a3b14 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -1,6 +1,33 @@ -ORBENT_DENALI = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" +from loguru import logger + +ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" +# Energy in atomic unit/ Hartree / Ang + + +class IsolatedAtomEnergyFactory: + def __init__(self): + pass + + def __call__(self, level_of_theory: str): + return self.get(level_of_theory=level_of_theory) + + @staticmethod + def get(level_of_theory: str): + level_of_theory = level_of_theory.lower() + is_dft = True + try: + func, basis = level_of_theory.split("/") + except ValueError: + func = level_of_theory + is_dft = not is_dft + + functional_dict = getattr(ISOLATED_ATOM_ENERGIES, func, None) + if functional_dict is None: + logger.warning(f"Isolated atom energies not found for {level_of_theory}") + if not is_dft: + return functional_dict + return getattr(ISOLATED_ATOM_ENERGIES, basis, None) -# Energy in atomic unit/ Hartree SPICE = { ("Br", -1): -2574.2451510945853, @@ -32,8 +59,6 @@ ("S", 0): -398.1599636677874, ("S", 1): -397.7746615977658, } - - GFN1 = { ("H", 0): -0.4014294744618301, ("Li", 1): 0.13691666666666666, @@ -108,7 +133,6 @@ ("I", -1): -4.060355599036047, ("I", 0): -3.7796302627467933, } - DFTB = { ("H", 0): -0.2386004000, ("Li", 1): 0.000000000, @@ -146,8 +170,6 @@ ("I", -1): -2.6981275000, ("I", 0): -2.5796080002, } - -# PM6 is in kcal/mol need to change it to hartree PM6 = { ("H", 0): 0.08302988483033709, ("Li", 1): 0.23429648020984556, @@ -187,37 +209,8 @@ } -ISOLATED_ATOM_ENERGIES = { - "wb97m-d3(bj)": {"def2-tzvp": SPICE}, - 
"wb97x-d3": {"def2-tzvp": ORBENT_DENALI}, - "gfn2": GFN2, - "gfn1": GFN1, - "dftb3": DFTB, -} - -CHARGES_LOOKUP = { - "Br": [-1, 0], - "C": [-1, 0, 1], - "B": [-3, -1, 0, 3], - "Ca": [2], - "Cl": [-1, 0], - "F": [-1, 0], - "H": [0], - "I": [-1, 0], - "K": [1], - "Li": [1], - "Mg": [2], - "N": [-1, 0, 1], - "Na": [1], - "O": [-1, 0, 1], - "P": [0, 1], - "S": [-1, 0, 1], - "Si": [+4, 0, -4], -} - - -# "tpssh/def2-tzvp" -bas1 = { +# tpssh/def2-tzvp +TMQM = { ("H", 0): -0.4998936035891093, ("Li", 1): -7.285942861425713, ("B", -3): -24.011884397333016, @@ -255,7 +248,7 @@ ("I", 0): -297.5887657326151, } # "wb97m-d3bj/def2-TZVPPD" -bas2 = { +wb97m_d3bj_def2_TZVPPD = { ("H", 0): -0.4987605100487541, ("Li", 1): -7.285254714046117, ("B", -3): -24.191211616488623, @@ -293,7 +286,7 @@ ("I", 0): -297.7622891423178, } # "revpbe-d3(bj)/def2-tzvp" -bas3 = { +SolvatedPeptides = { ("H", 0): -0.5041476427597161, ("Li", 1): -7.280731201437635, ("B", -3): -24.006372610643076, @@ -331,7 +324,7 @@ ("I", 0): -297.72268439613055, } # "DSD-BLYP-D3BJ/def2-TZVPPD" -bas3 = { +SN2RXN = { ("H", 0): -0.4990585651127987, ("Li", 1): -7.2751828330696995, ("B", -3): -24.127790514752746, @@ -369,7 +362,7 @@ ("I", 0): -297.4541938789708, } # "b3lyp/6-31g*" -bas4 = { +QMUGS_DFT = { ("H", 0): -0.5002733301377901, ("Li", 1): -7.284546111273075, ("B", -3): -23.577268753399462, @@ -403,11 +396,11 @@ ("Ca", 2): -676.8667395990246, ("Br", -1): -2573.824201570383, ("Br", 0): -2573.705283744811, - ("I", -1): None, - ("I", 0): None, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set } # "wb97x-d3/def2-tzvp" -bas3 = { +ORBNET = { ("H", 0): -0.5025865385814652, ("Li", 1): -7.289728176048259, ("B", -3): -23.984063702375366, @@ -445,7 +438,7 @@ ("I", 0): -297.7482461760336, } # "wb97x-d/def2-svp" -bas3 = { +NABLADFT = { ("H", 0): -0.5024927493280441, ("Li", 1): -7.289461512680954, ("B", -3): -23.76326340520956, @@ -483,7 +476,7 @@ ("I", 0): 
-297.7376955031015, } # "wb97x/6-31g(d)" -bas3 = { +ANI1 = { ("H", 0): -0.4993457316092281, ("Li", 1): -7.2856300653219614, ("B", -3): -23.575157416550805, @@ -517,11 +510,11 @@ ("Ca", 2): -676.8704088358037, ("Br", -1): -2573.8502718776604, ("Br", 0): -2573.733913792756, - ("I", -1): None, - ("I", 0): None, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set } # "WB97X/6-31g*" -bas3 = { +COMP6_1 = { ("H", 0): -0.4993457316092281, ("Li", 1): -7.285630065321961, ("B", -3): -23.5751574165508, @@ -555,11 +548,11 @@ ("Ca", 2): -676.8704088358036, ("Br", -1): -2573.8502718776604, ("Br", 0): -2573.7339137927547, - ("I", -1): None, - ("I", 0): None, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set } # "ccsd/aug-cc-pVDZ" -bas3 = { +ccsdaug = { ("H", 0): -0.49933431543958506, ("Li", 1): -7.23623079003172, ("B", -3): -24.135298809957895, @@ -589,15 +582,15 @@ ("S", 1): -397.2542253763525, ("Cl", -1): -459.7398865093852, ("Cl", 0): -459.6156482951034, - ("K", 1): None, - ("Ca", 2): None, + ("K", 1): None, # not available with this basis set + ("Ca", 2): None, # not available with this basis set ("Br", -1): -2572.6265539931533, ("Br", 0): -2572.5063313966352, - ("I", -1): None, - ("I", 0): None, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set } # "ccsd(t)/aug-cc-pVDZ" -bas3 = { +ccsdtaug = { ("H", 0): -0.4993343154395853, ("Li", 1): -7.236230790031718, ("B", -3): -24.14659676027675, @@ -627,15 +620,15 @@ ("S", 1): -397.2555638941634, ("Cl", -1): -459.74421517568555, ("Cl", 0): -459.6181191157645, - ("K", 1): None, - ("Ca", 2): None, + ("K", 1): None, # not available with this basis set + ("Ca", 2): None, # not available with this basis set ("Br", -1): -2572.630606833861, ("Br", 0): -2572.508930744571, - ("I", -1): None, - ("I", 0): None, + ("I", -1): None, # not available with this basis set + ("I", 
0): None, # not available with this basis set } # "mp2/aug-cc-pVDZ" -bas3 = { +mp2aug = { ("H", 0): -0.4993343154395852, ("Li", 1): -7.2362434239942885, ("B", -3): -24.11454063530035, @@ -665,15 +658,15 @@ ("S", 1): -397.236034450623, ("Cl", -1): -459.7293671162834, ("Cl", 0): -459.5986332871817, - ("K", 1): None, - ("Ca", 2): None, + ("K", 1): None, # not available with this basis set + ("Ca", 2): None, # not available with this basis set ("Br", -1): -2571.9455214335435, ("Br", 0): -2571.8203622687925, - ("I", -1): None, - ("I", 0): None, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set } # "mp2/def2-TZVP" -bas3 = { +mp2def2TZVP = { ("H", 0): -0.4998098322318883, ("Li", 1): -7.26625465274989, ("B", -3): -23.89130329586724, @@ -710,8 +703,9 @@ ("I", -1): -297.32915651116025, ("I", 0): -297.2135511448063, } -# "SVWN/def2-TZVP" -bas3 = { +# "SVWN/def2-TZVP" TODO: RECALCULATE THIS + +COMP6_7 = { ("H", 0): -0.4961415246858913, ("Li", 1): -7.182160595407815, ("B", -3): -23.858154175760482, @@ -749,7 +743,7 @@ ("I", 0): None, } # "PBE-D3BJ2B/def2-TZVP" -bas3 = { +COMP6_5 = { ("H", 0): -0.49963874688778964, ("Li", 1): -7.256644236856915, ("B", -3): -23.965651173919607, @@ -787,7 +781,7 @@ ("I", 0): -297.66553802500096, } # "B3LYP-D3MBJ2B/def2-TZVP" -bas3 = { +COMP6_2 = { ("H", 0): -0.5021763508982502, ("Li", 1): -7.28605166725753, ("B", -3): -24.00227248681287, @@ -825,7 +819,7 @@ ("I", 0): -297.7784640477503, } # "b3lyp/def2-TZVP" -bas3 = { +COMP6_3 = { ("H", 0): -0.5021763508982502, ("Li", 1): -7.2860516672575315, ("B", -3): -24.002272486812885, @@ -862,3 +856,694 @@ ("I", -1): -297.8970487306444, ("I", 0): -297.7784640477502, } + +# ccsd(t)/cc-pVDZ +GDML_2 = { + ("H", 0): -0.49927840341958285, + ("Li", 1): -7.236223739656382, + ("B", -3): -23.61782373835322, + ("B", -1): -24.528388906235705, + ("B", 0): -24.590264050112527, + ("B", 3): -21.98588333987049, + ("C", -1): -37.688228871632006, + ("C", 0): 
-37.70277208656365, + ("C", 1): -37.3579597779074, + ("N", -1): -54.321974972075715, + ("N", 0): -54.373768477368074, + ("N", 1): -53.87510137954731, + ("O", -1): -74.87516352403559, + ("O", 0): -74.82827800838686, + ("O", 1): -74.30135465859384, + ("F", -1): -99.56030962418485, + ("F", 0): -99.52932183945009, + ("Na", 1): -161.67188329184694, + ("Mg", 2): -198.82669320079302, + ("Si", 4): -285.17919483395195, + ("Si", 0): -288.88085983569533, + ("Si", -4): -287.40461285633614, + ("P", 0): -340.7265584017754, + ("P", 1): -340.36984136674585, + ("S", -1): -397.63315120158666, + ("S", 0): -397.55317747510554, + ("S", 1): -397.1659426092399, + ("Cl", -1): -459.69470422539786, + ("Cl", 0): -459.60398876941906, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.2271898047749, + ("Br", -1): -2572.584907858833, + ("Br", 0): -2572.4941153123455, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# ccsd(t)/cc-pVTZ +ANI1CCX_2 = { + ("H", 0): -0.49980981130184293, + ("Li", 1): -7.249353374937752, + ("B", -3): -23.793685421585884, + ("B", -1): -24.56648780776967, + ("B", 0): -24.605381789792233, + ("B", 3): -21.991368552278544, + ("C", -1): -37.747141724045164, + ("C", 0): -37.735863889731654, + ("C", 1): -37.37850843579137, + ("N", -1): -54.41337048412563, + ("N", 0): -54.42353049479941, + ("N", 1): -53.91625772121427, + ("O", -1): -74.99249367544891, + ("O", 0): -74.90337716789482, + ("O", 1): -74.36027901195692, + ("F", -1): -99.71046952902925, + ("F", 0): -99.63219230886922, + ("Na", 1): -161.68615285472157, + ("Mg", 2): -198.8436504300981, + ("Si", 4): -285.2290232109956, + ("Si", 0): -288.954195226872, + ("Si", -4): -287.62141587617776, + ("P", 0): -340.79678977311414, + ("P", 1): -340.432199862984, + ("S", -1): -397.7409199255247, + ("S", 0): -397.6361063083311, + ("S", 1): -397.2347675440139, + ("Cl", -1): -459.8163494320064, + ("Cl", 0): -459.70310084056786, + ("K", 1): None, # not 
available with this basis set + ("Ca", 2): -676.3176100772968, + ("Br", -1): -2572.8167538662433, + ("Br", 0): -2572.702100151291, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# ccsd/cc-pVDZ +GDML_1 = { + ("H", 0): -0.49927840341958285, + ("Li", 1): -7.236223739656382, + ("B", -3): -23.613877846876942, + ("B", -1): -24.52547666267111, + ("B", 0): -24.589429443373188, + ("B", 3): -21.98588333987049, + ("C", -1): -37.68362301484667, + ("C", 0): -37.69937564411741, + ("C", 1): -37.35727461654343, + ("N", -1): -54.31612564560329, + ("N", 0): -54.3667355223191, + ("N", 1): -53.871756805827864, + ("O", -1): -74.87454456240714, + ("O", 0): -74.82074180638969, + ("O", 1): -74.29143146516834, + ("F", -1): -99.55969095436343, + ("F", 0): -99.5284215563597, + ("Na", 1): -161.67186865791962, + ("Mg", 2): -198.826650230425, + ("Si", 4): -285.17913845059644, + ("Si", 0): -288.87753485972564, + ("Si", -4): -287.40275985231415, + ("P", 0): -340.7210732625289, + ("P", 1): -340.3662836136086, + ("S", -1): -397.631810717651, + ("S", 0): -397.54760940641853, + ("S", 1): -397.15909131565013, + ("Cl", -1): -459.6933866998589, + ("Cl", 0): -459.60268687745884, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.2265307613668, + ("Br", -1): -2572.5834492880094, + ("Br", 0): -2572.492623348252, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# ccsd/cc-pVTZ +CCSD_VTZ = { + ("H", 0): -0.49980981130184293, + ("Li", 1): -7.249353374937752, + ("B", -3): -23.78682468678494, + ("B", -1): -24.56193370904525, + ("B", 0): -24.60388179904298, + ("B", 3): -21.991368552278544, + ("C", -1): -37.74093800618891, + ("C", 0): -37.73042268826894, + ("C", 1): -37.377165803324715, + ("N", -1): -54.40441588438247, + ("N", 0): -54.4152043962678, + ("N", 1): -53.91038920924042, + ("O", -1): -74.98771409352835, + ("O", 0): -74.89293727915536, + ("O", 1): 
-74.34899994406153, + ("F", -1): -99.70481088713056, + ("F", 0): -99.62851668514091, + ("Na", 1): -161.68598877560345, + ("Mg", 2): -198.84332758531946, + ("Si", 4): -285.228514965889, + ("Si", 0): -288.9476846603088, + ("Si", -4): -287.6138873496766, + ("P", 0): -340.78870701737065, + ("P", 1): -340.42522678302885, + ("S", -1): -397.73415929387704, + ("S", 0): -397.62619555322124, + ("S", 1): -397.225460043223, + ("Cl", -1): -459.80856103622415, + ("Cl", 0): -459.69693046874454, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.3160445414744, + ("Br", -1): -2572.8073946290465, + ("Br", 0): -2572.694327605488, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# hf/cc-pVDZ +ANI1X_1 = { + ("H", 0): -0.4992784034195828, + ("Li", 1): -7.236120435571012, + ("B", -3): -23.517631518350836, + ("B", -1): -24.43849458753095, + ("B", 0): -24.52995828509406, + ("B", 3): -21.98542712791857, + ("C", -1): -37.57949842909864, + ("C", 0): -37.59598618627132, + ("C", 1): -37.28952528470851, + ("N", -1): -54.170756777551894, + ("N", 0): -54.251655645342815, + ("N", 1): -53.75577765594358, + ("O", -1): -74.72122641123744, + ("O", 0): -74.66528700138886, + ("O", 1): -74.16935785917661, + ("F", -1): -99.3660232395006, + ("F", 0): -99.37525020985224, + ("Na", 1): -161.67106997000676, + ("Mg", 2): -198.82420265081305, + ("Si", 4): -285.17413886038224, + ("Si", 0): -288.7869064370983, + ("Si", -4): -287.3055013422455, + ("P", 0): -340.6188035921855, + ("P", 1): -340.26328028589194, + ("S", -1): -397.506997287547, + ("S", 0): -397.4131194811572, + ("S", 1): -397.04821663752654, + ("Cl", -1): -459.54222556583767, + ("Cl", 0): -459.4711432886898, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.1457625057777, + ("Br", -1): -2571.766685524917, + ("Br", 0): -2571.6943737649776, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} 
+# hf/cc-pVTZ +ANI1X_3 = { + ("H", 0): -0.49980981130184304, + ("Li", 1): -7.236381928884647, + ("B", -3): -23.654030528094694, + ("B", -1): -24.45440782122731, + ("B", 0): -24.532065412570418, + ("B", 3): -21.985654326745827, + ("C", -1): -37.6036322232934, + ("C", 0): -37.602187116127666, + ("C", 1): -37.294742506720475, + ("N", -1): -54.20897619252452, + ("N", 0): -54.263903101255586, + ("N", 1): -53.765473796977965, + ("O", -1): -74.76618798136187, + ("O", 0): -74.6842428689006, + ("O", 1): -74.18751432538998, + ("F", -1): -99.42428986904464, + ("F", 0): -99.40551931536073, + ("Na", 1): -161.67601880318512, + ("Mg", 2): -198.82947207595663, + ("Si", 4): -285.1793556127226, + ("Si", 0): -288.7945961163259, + ("Si", -4): -287.41256067563575, + ("P", 0): -340.6294583289231, + ("P", 1): -340.2717794204319, + ("S", -1): -397.5319459632172, + ("S", 0): -397.4249161291449, + ("S", 1): -397.06067984991046, + ("Cl", -1): -459.5646668064105, + ("Cl", 0): -459.4854291853036, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.1540716436532, + ("Br", -1): -2572.528468875192, + ("Br", 0): -2572.445069318686, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + +# mp2/cc-pVDZ +DES1 = { + ("H", 0): -0.4992784034195828, + ("Li", 1): -7.236236031279599, + ("B", -3): -23.59075634654498, + ("B", -1): -24.496049160245956, + ("B", 0): -24.56749154944109, + ("B", 3): -21.985897030619704, + ("C", -1): -37.65666509987848, + ("C", 0): -37.66302875884139, + ("C", 1): -37.3321238689667, + ("N", -1): -54.28620525567718, + ("N", 0): -54.334987200983385, + ("N", 1): -53.827357208281775, + ("O", -1): -74.86327217217499, + ("O", 0): -74.78617322485147, + ("O", 1): -74.25332362507456, + ("F", -1): -99.55668287878551, + ("F", 0): -99.51775797009576, + ("Na", 1): -161.67192521516694, + ("Mg", 2): -198.82669914019823, + ("Si", 4): -285.1791105165065, + ("Si", 0): -288.8472784365606, + ("Si", -4): -287.3919999801635, 
+ ("P", 0): -340.6925553040255, + ("P", 1): -340.33066918694686, + ("S", -1): -397.61602048346754, + ("S", 0): -397.5157894668129, + ("S", 1): -397.126843359414, + ("Cl", -1): -459.68240407270594, + ("Cl", 0): -459.5865928328137, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.2188060975801, + ("Br", -1): -2571.903217203978, + ("Br", 0): -2571.8074873037867, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + +# mp2/cc-pVQZ +DES2 = { + ("H", 0): -0.4999455685829884, + ("Li", 1): -7.250250946178424, + ("B", -3): -23.881056379140478, + ("B", -1): -24.562769033198762, + ("B", 0): -24.601332055304802, + ("B", 3): -22.00384581220691, + ("C", -1): -37.78757616460555, + ("C", 0): -37.72055375923268, + ("C", 1): -37.374641050923756, + ("N", -1): -54.42675509155296, + ("N", 0): -54.41599555658964, + ("N", 1): -53.89571949369111, + ("O", -1): -75.03532831936059, + ("O", 0): -74.89960636766679, + ("O", 1): -74.42732171580235, + ("F", -1): -99.77773243315134, + ("F", 0): -99.66592682518191, + ("Na", 1): -161.68639387893282, + ("Mg", 2): -198.85342876070732, + ("Si", 4): -285.21266596906895, + ("Si", 0): -288.9153023940409, + ("Si", -4): -287.84995588475664, + ("P", 0): -340.78254912688595, + ("P", 1): -340.41137033923945, + ("S", -1): -397.764457176497, + ("S", 0): -397.63328479696963, + ("S", 1): -397.2291889048987, + ("Cl", -1): -459.85575358503627, + ("Cl", 0): -459.725756402736, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.353471955094, + ("Br", -1): -2572.9216392833405, + ("Br", 0): -2572.79376070567, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} +# pbe/def2-tzvp +ISO17 = { + ("H", 0): -0.4996387468896132, + ("Li", 1): -7.256644236856955, + ("B", -3): -23.935382459402287, + ("B", -1): -24.585965866081416, + ("B", 0): -24.610084509908482, + ("B", 3): -21.98118646897415, + ("C", -1): 
-37.77594560897306, + ("C", 0): -37.732895049756756, + ("C", 1): -37.38238697233679, + ("N", -1): -54.441487575279545, + ("N", 0): -54.43218609912527, + ("N", 1): -53.89863329199101, + ("O", -1): -75.04792601076215, + ("O", 0): -74.9084975444151, + ("O", 1): -74.35740906502845, + ("F", -1): -99.77558183886431, + ("F", 0): -99.66914009406862, + ("Na", 1): -161.9641373718238, + ("Mg", 2): -199.1000109617099, + ("Si", 4): -285.4180171255296, + ("Si", 0): -289.2015108290971, + ("Si", -4): -288.02271678330254, + ("P", 0): -341.06484223053843, + ("P", 1): -340.68322234698707, + ("S", -1): -398.00391422392744, + ("S", 0): -397.9053091661701, + ("S", 1): -397.5008759502245, + ("Cl", -1): -460.0784728780043, + ("Cl", 0): -459.95841441797796, + ("K", 1): -599.5277926006352, + ("Ca", 2): -676.6655247948639, + ("Br", -1): -2573.8415230488945, + ("Br", 0): -2573.720729522105, + ("I", -1): -297.7815346863186, + ("I", 0): -297.66553802494457, +} + + +# hf/cc-pVQZ +ANI1X_2 = { + ("H", 0): -0.49994556858298844, + ("Li", 1): -7.236386237851972, + ("B", -3): -23.74309031828107, + ("B", -1): -24.46286773184739, + ("B", 0): -24.5329645824744, + ("B", 3): -21.986158801102064, + ("C", -1): -37.66896328779905, + ("C", 0): -37.604262031495196, + ("C", 1): -37.29646463702154, + ("N", -1): -54.22426108804101, + ("N", 0): -54.26750374803837, + ("N", 1): -53.76849831230501, + ("O", -1): -74.78286297582162, + ("O", 0): -74.68967002333635, + ("O", 1): -74.19286214550267, + ("F", -1): -99.44462949539432, + ("F", 0): -99.41376829607128, + ("Na", 1): -161.67672032176134, + ("Mg", 2): -198.83037897754207, + ("Si", 4): -285.1803724364078, + ("Si", 0): -288.79743501319945, + ("Si", -4): -287.65204471889274, + ("P", 0): -340.63262408709096, + ("P", 1): -340.27442412596326, + ("S", -1): -397.54055244875906, + ("S", 0): -397.42820343953593, + ("S", 1): -397.06412575498064, + ("Cl", -1): -459.57282279413744, + ("Cl", 0): -459.4890928627921, + ("K", 1): None, # not available with this basis set + ("Ca", 
2): -676.1542980250254, + ("Br", -1): -2572.5345236382864, + ("Br", 0): -2572.448003418184, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + + +# mp2/cc-pVTZ +DES3 = { + ("H", 0): -0.49980981130184304, + ("Li", 1): -7.24726155786237, + ("B", -3): -23.763643794842856, + ("B", -1): -24.53409654753541, + ("B", 0): -24.583383154203396, + ("B", 3): -21.991094434286477, + ("C", -1): -37.71496709817741, + ("C", 0): -37.69583488009523, + ("C", 1): -37.35364857976649, + ("N", -1): -54.37687246581612, + ("N", 0): -54.38498928095387, + ("N", 1): -53.86758718077272, + ("O", -1): -74.97696880669871, + ("O", 0): -74.85981462857248, + ("O", 1): -74.3128417784704, + ("F", -1): -99.70562180844765, + ("F", 0): -99.61731492045887, + ("Na", 1): -161.68534038705675, + ("Mg", 2): -198.84302024453982, + ("Si", 4): -285.22727858476895, + ("Si", 0): -288.9183509250862, + ("Si", -4): -287.5995448051336, + ("P", 0): -340.75961526664724, + ("P", 1): -340.3904498977919, + ("S", -1): -397.7141036332652, + ("S", 0): -397.5920220310466, + ("S", 1): -397.19206598949114, + ("Cl", -1): -459.79402765207186, + ("Cl", 0): -459.67567575694216, + ("K", 1): None, # not available with this basis set + ("Ca", 2): -676.3023664599882, + ("Br", -1): -2572.801814668155, + ("Br", 0): -2572.6834739695705, + ("I", -1): None, # not available with this basis set + ("I", 0): None, # not available with this basis set +} + +# pbe/def2-tzvp TODO: Recalculate this +# H -1 +1 , S -2 +2 +ISO17 = { + ("H", 0): -0.49963874688961296, + ("Li", 1): -7.256644236856955, + ("B", -3): None, + ("B", -1): None, + ("B", 0): -24.61008450990847, + ("B", 3): -21.981186468974155, + ("C", -1): None, + ("C", 0): None, + ("C", 1): -37.37216480721611, + ("N", -1): None, + ("N", 0): None, + ("N", 1): None, + ("O", -1): -75.04792601076218, + ("O", 0): None, + ("O", 1): None, + ("F", -1): -99.77558183886428, + ("F", 0): -99.66914009406861, + ("Na", 1): -161.96413737182382, + ("Mg", 
2): -199.10001096170993, + ("Si", 4): -285.4180171255296, + ("Si", 0): None, + ("Si", -4): -288.02271678330254, + ("P", 0): None, + ("P", 1): None, + ("S", -1): -398.00391422392744, + ("S", 0): None, + ("S", 1): None, + ("Cl", -1): -460.07847287800433, + ("Cl", 0): -459.958414417978, + ("K", 1): -599.5277926006352, + ("Ca", 2): -676.6655247948639, + ("Br", -1): -2573.841523048894, + ("Br", 0): -2573.720729522104, + ("I", -1): -297.78153468631854, + ("I", 0): -297.66553802494457, +} +# pbe0/def2-tzvp +QM7X_DFT = { + ("H", 0): -0.5010619187567116, + ("Li", 1): -7.262402336780465, + ("B", -3): -23.93538245940231, + ("B", -1): -24.58596586608141, + ("B", 0): -24.618279526937158, + ("B", 3): -21.993880405036222, + ("C", -1): -37.775945608973075, + ("C", 0): -37.73289504975675, + ("C", 1): -37.38238697233677, + ("N", -1): -54.4414875752795, + ("N", 0): -54.43218609912527, + ("N", 1): -53.898633291991025, + ("O", -1): -75.04858314388663, + ("O", 0): -74.9084975444151, + ("O", 1): -74.35740906502848, + ("F", -1): -99.77378866090523, + ("F", 0): -99.67618937527747, + ("Na", 1): -161.98136849490916, + ("Mg", 2): -199.1241396537923, + ("Si", 4): -285.4539026316095, + ("Si", 0): -289.20151082909706, + ("Si", -4): -288.04650100943854, + ("P", 0): -341.06484223053843, + ("P", 1): -340.6832223469869, + ("S", -1): -398.03842612700186, + ("S", 0): -397.90530916617007, + ("S", 1): -397.5008759502245, + ("Cl", -1): -460.11739716845636, + ("Cl", 0): -459.9974100829532, + ("K", 1): -599.5783201878277, + ("Ca", 2): -676.7194481655977, + ("Br", -1): -2573.9328383617813, + ("Br", 0): -2573.8118913577364, + ("I", -1): -297.8097622358941, + ("I", 0): -297.6931741613416, +} + +# LEVEL OF THEORY: WB97M-V/def2-tzvp +COMP6_9 = { + ("H", 0): -0.4942304316867456, + ("Li", 1): -7.275845986964876, + ("B", -3): -23.944386486890433, + ("B", -1): -24.620648350767315, + ("B", 0): -24.649626180737634, + ("B", 3): -22.041679002146115, + ("C", -1): -37.81902657653025, + ("C", 0): -37.78784557278033, + 
("C", 1): -37.43099787866309, + ("N", -1): -54.50330209852381, + ("N", 0): -54.48942541262065, + ("N", 1): -53.97039551980893, + ("O", -1): -75.10937339867125, + ("O", 0): -74.98274472768641, + ("O", 1): -74.42816465620183, + ("F", -1): -99.8448159370651, + ("F", 0): -99.74528654206127, + ("Na", 1): -162.06872009995914, + ("Mg", 2): -199.22338375053474, + ("Si", 4): -285.5821192636676, + ("Si", 0): -289.31658008917617, + ("Si", -4): -288.11126408870666, + ("P", 0): -341.2109132073535, + ("P", 1): -340.8136624526414, + ("S", -1): -398.1550625555495, + ("S", 0): -398.0362575878335, + ("S", 1): -397.63036775088466, + ("Cl", -2): -459.52873734619544, + ("Cl", -1): -460.24520403058557, + ("Cl", 0): -460.12503955811985, + ("Cl", 2): -458.6770781144964, + ("K", 1): -599.7242257909018, + ("Ca", 2): -676.8737360488551, + ("Br", -1): -2574.0859799330883, + ("Br", 0): -2573.967555604986, + ("I", -1): -297.7777930229968, + ("I", 0): -297.66455265533017, +} + +# hf/def2-tzvp +HF_DEF2 = { + ("H", 0): -0.4998098322318885, + ("Li", 1): -7.236374246714073, + ("B", -3): -23.74140302512685, + ("B", -1): -24.462195925378662, + ("B", 0): -24.53233202503875, + ("B", 3): -21.985926089783565, + ("C", -1): -37.613473799868544, + ("C", 0): -37.603219252494, + ("C", 1): -37.295541183753926, + ("N", -1): -54.223174834464814, + ("N", 0): -54.266099796938654, + ("N", 1): -53.76717547003795, + ("O", -1): -74.78142147694243, + ("O", 0): -74.68804805190297, + ("O", 1): -74.19115875887655, + ("F", -1): -99.44317910914634, + ("F", 0): -99.41179977280933, + ("Na", 1): -161.67025708598274, + ("Mg", 2): -198.82300763311338, + ("Si", 4): -285.17360760657004, + ("Si", 0): -288.7894100524365, + ("Si", -4): -287.5042786445288, + ("P", 0): -340.6233882863439, + ("P", 1): -340.26541318034015, + ("S", -1): -397.5252097143351, + ("S", 0): -397.4176274212401, + ("S", 1): -397.0534456500219, + ("Cl", -1): -459.55564984013716, + ("Cl", 0): -459.47680800709793, + ("K", 1): -599.0060338509219, + ("Ca", 2): 
-676.1418445564589, + ("Br", -1): -2572.4811033491237, + ("Br", 0): -2572.398074528429, + ("I", -1): -296.7409981252531, + ("I", 0): -296.6585948224954, +} +ANI1X_8 = { + ("H", 0): -0.5013136410415637, + ("Li", 1): -7.286464366413948, + ("B", -3): -23.86534129296109, + ("B", -1): -24.613473886395223, + ("B", 0): -24.65142963156562, + ("B", 3): -22.073004626190233, + ("C", 0): -37.780134440896255, + ("N", -1): -54.481657808873116, + ("N", 0): -54.48280823582692, + ("N", 1): -53.95708783281901, + ("O", -1): -75.09104966465256, + ("O", 0): -74.97131697424727, + ("O", 1): -74.41885693671637, + ("F", -1): -99.82474743242214, + ("F", 0): -99.73990054006921, + ("Na", 1): -162.08501075159776, + ("Mg", 2): -199.24620625842113, + ("Si", 4): -285.6197527177925, + ("Si", 0): -289.323387632431, + ("Si", -4): -288.04657476482333, + ("P", 0): -341.1958015245573, + ("P", 1): -340.8193558685238, + ("S", -1): -398.1805976553139, + ("S", 0): -398.0529588010547, + ("S", 1): -397.69734443410385, + ("Cl", -1): -460.2768559014631, + ("Cl", 0): -460.1543938788908, + ("K", 1): None, + ("Ca", 2): -676.921587688464, + ("Br", -1): -2574.3069571951482, + ("Br", 0): -2574.1862987794157, + ("I", -1): None, + ("I", 0): None, +} + + +ISOLATED_ATOM_ENERGIES = { + # DFT + "wb97x": { + "6-31g*": COMP6_1, + "6-31g(d)": ANI1, + "cc-pvtz": ANI1X_8, + }, + "wb97x-d": {"def2-svp": NABLADFT}, + "wb97x-d3": {"def2-tzvp": ORBNET}, + "wb97m": { + "def2-tzvp": COMP6_9, + }, + "wb97m-d3bj": {"def2-tzvp": SPICE}, + "tpssh": {"def2-tzvp": TMQM}, + "revpbe-d3(bj)": {"def2-tzvp": SolvatedPeptides}, + "dsd-blyp-d3(bj)": {"def2-tzvp": SN2RXN}, + "b3lyp": { + "6-31g*": QMUGS_DFT, + "def2-tzv": COMP6_3, + }, + "b3lyp-d3mbj": {"def2-tzvp": COMP6_2}, + "pbe-d3bj": { + "def2-tzvp": COMP6_5, + }, + "hf": { + "def2-tzvp": HF_DEF2, + "cc-pvdz": ANI1X_1, + "cc-pvqz": ANI1X_2, + "cc-pvtz": ANI1X_3, + }, + "svwn": { + "def2-tzv": COMP6_7, + }, + # PAW + "pbe0": { + "mbd": QM7X_DFT, + }, + "pbe": { + "vdw-ts": ISO17, + "mbd": 
ISO17, + "def2-tzvp": ISO17, + }, + # HIGHER LEVEL OF THEORY + "ccsd": { + "cc-pvdz": GDML_1, + "cc-pvtz": CCSD_VTZ, + }, + "ccsd(t)": { + "cc-pvdz": GDML_2, + "cc-pvtz": ANI1CCX_2, + "cbs": ccsdtaug, + "nn": None, # ML Calculated + }, + "mp2": { + "cc-pvdz": DES1, + "cc-pvqz": DES2, + "cc-pvtz": DES3, + "cbs": mp2aug, + }, + # SAPT0 + "sapt": { + "aug-cc-pwcvxz": None, # DOESNT MAKE SENSE + }, + # SEMI EMPIRICAL + "gfn2_xtb": GFN2, + "gfn1_xtb": GFN1, + "dft3b": DFTB, + "pm6": PM6, +} + +# TODO: Talk with ivan about cbs extrapolation from av[TQ]z. For now this should be ok diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 3887df5..54a5d9d 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,2 +1,9 @@ -def test_this_file(): - print("this is a dummy test") +"""Path hack to make tests work.""" + +from openqdc.datasets.dummy import Dummy # noqa: E402 + + +def test_dummy(): + ds = Dummy() + assert len(ds) > 10 + assert ds[100] From ca1fa38c9f925c40f641ccf4bd5322659379f700 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 24 Oct 2023 13:12:20 -0400 Subject: [PATCH 05/20] Factory simple test --- src/openqdc/datasets/base.py | 4 ++-- src/openqdc/utils/atomization_energies.py | 5 ++--- tests/test_dummy.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index a9fb793..957fd3c 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -72,8 +72,8 @@ class BaseDataset(torch.utils.data.Dataset): # convert force gradient -1 __energy_unit__ = "hartree" - __distance_unit__ = "bohr" - __forces_unit__ = "hartree/bohr" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" __fn_energy__ = lambda x: x __fn_distance__ = lambda x: x __fn_forces__ = lambda x: x diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 78a3b14..0fbed07 100644 --- a/src/openqdc/utils/atomization_energies.py +++
b/src/openqdc/utils/atomization_energies.py @@ -20,13 +20,12 @@ def get(level_of_theory: str): except ValueError: func = level_of_theory is_dft = not is_dft - - functional_dict = getattr(ISOLATED_ATOM_ENERGIES, func, None) + functional_dict = ISOLATED_ATOM_ENERGIES.get(func, None) if functional_dict is None: logger.warning(f"Isolated atom energies not found for {level_of_theory}") if not is_dft: return functional_dict - return getattr(ISOLATED_ATOM_ENERGIES, basis, None) + return functional_dict.get(basis, None) SPICE = { diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 54a5d9d..65fe9b6 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,9 +1,21 @@ """Path hack to make tests work.""" from openqdc.datasets.dummy import Dummy # noqa: E402 +from openqdc.utils.atomization_energies import ( + ISOLATED_ATOM_ENERGIES, + IsolatedAtomEnergyFactory, +) def test_dummy(): ds = Dummy() assert len(ds) > 10 assert ds[100] + + +def test_is_at_factory(): + res = IsolatedAtomEnergyFactory.get("mp2/cc-pvdz") + assert len(res) == len(ISOLATED_ATOM_ENERGIES["mp2"]["cc-pvdz"]) + res = IsolatedAtomEnergyFactory.get("PM6") + assert len(res) == len(ISOLATED_ATOM_ENERGIES["pm6"]) + assert isinstance(res[("H", 0)], float) From 39494119db5d003d341e2551126fcdc0bd30d821 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 26 Oct 2023 18:14:27 -0400 Subject: [PATCH 06/20] Lazy loading --- src/openqdc/__init__.py | 43 +++++++ src/openqdc/datasets/__init__.py | 125 ++++++++++++++++----- src/openqdc/datasets/ani.py | 4 +- src/openqdc/datasets/base.py | 1 + src/openqdc/datasets/comp6.py | 4 +- src/openqdc/datasets/dess.py | 5 +- src/openqdc/datasets/iso_17.py | 2 +- src/openqdc/datasets/molecule3d.py | 6 +- src/openqdc/datasets/nabladft.py | 12 +- src/openqdc/datasets/orbnet_denali.py | 1 - src/openqdc/datasets/qmugs.py | 2 +- src/openqdc/datasets/spice.py | 4 +- src/openqdc/utils/atomization_energies.py | 44 +++++++- src/openqdc/utils/package_utils.py | 130 
++++++++++++++++++++++ 14 files changed, 336 insertions(+), 47 deletions(-) create mode 100644 src/openqdc/utils/package_utils.py diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py index e69de29..a5c0555 100644 --- a/src/openqdc/__init__.py +++ b/src/openqdc/__init__.py @@ -0,0 +1,43 @@ +import importlib +import os +from typing import TYPE_CHECKING # noqa F401 + +# The below lazy import logic is coming from openff-toolkit: +# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 + +# Dictionary of objects to lazily import; maps the object's name to its module path + +_lazy_imports_obj = {} + +_lazy_imports_mod = { + "datasets": "openqdc.datamodule", +} + + +def __getattr__(name): + """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod + + Note that this method is only called by Python if the name cannot be found + in the current module.""" + obj_mod = _lazy_imports_obj.get(name) + if obj_mod is not None: + mod = importlib.import_module(obj_mod) + return mod.__dict__[name] + + lazy_mod = _lazy_imports_mod.get(name) + if lazy_mod is not None: + return importlib.import_module(lazy_mod) + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + """Add _lazy_imports_obj and _lazy_imports_mod to dir()""" + keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys()) + return sorted(keys) + + +if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": + # These types are imported lazily at runtime, but we need to tell type + # checkers what they are. 
+ from .datasets import * diff --git a/src/openqdc/datasets/__init__.py b/src/openqdc/datasets/__init__.py index 9c17922..f57fde4 100644 --- a/src/openqdc/datasets/__init__.py +++ b/src/openqdc/datasets/__init__.py @@ -1,26 +1,99 @@ -from .ani import ANI1, ANI1CCX, ANI1X -from .comp6 import COMP6 -from .gdml import GDML -from .geom import GEOM -from .iso_17 import ISO17 -from .molecule3d import Molecule3D -from .orbnet_denali import OrbnetDenali -from .qmugs import QMugs -from .sn2_rxn import SN2RXN -from .spice import Spice - -__all__ = [ - "ANI1", - "ANI1CCX", - "ANI1X", - "Spice", - "GEOM", - "QMugs", - "ISO17", - "COMP6", - "GDML", - "Molecule3D", - "OrbnetDenali", - "QMugs", - "SN2RXN", -] +import importlib +import os +from typing import TYPE_CHECKING # noqa F401 + +# The below lazy import logic is coming from openff-toolkit: +# https://github.com/openforcefield/openff-toolkit/blob/b52879569a0344878c40248ceb3bd0f90348076a/openff/toolkit/__init__.py#L44 + +# Dictionary of objects to lazily import; maps the object's name to its module path + +_lazy_imports_obj = { + "ANI1": "openqdc.datasets.ani", + "ANI1CCX": "openqdc.datasets.ani", + "ANI1X": "openqdc.datasets.ani", + "Spice": "openqdc.datasets.spice", + "GEOM": "openqdc.datasets.geom", + "QMugs": "openqdc.datasets.qmugs", + "ISO17": "openqdc.datasets.iso_17", + "COMP6": "openqdc.datasets.comp6", + "GDML": "openqdc.datasets.gdml", + "Molecule3D": "openqdc.datasets.molecule3d", + "OrbnetDenali": "openqdc.datasets.orbnet_denali", + "SN2RXN": "openqdc.datasets.sn2_rxn", + "QM7X": "openqdc.datasets.qm7x", + "DESS": "openqdc.datasets.dess", + "NablaDFT": "openqdc.datasets.nabladft", + "SolvatedPeptides": "openqdc.datasets.solvated_peptides", + "WaterClusters": "openqdc.datasets.waterclusters3_30", + "TMQM": "openqdc.datasets.tmqm", + "Dummy": "openqdc.datasets.dummy", +} + +_lazy_imports_mod = {} + + +def __getattr__(name): + """Lazily import objects from _lazy_imports_obj or _lazy_imports_mod + + Note that this 
method is only called by Python if the name cannot be found + in the current module.""" + obj_mod = _lazy_imports_obj.get(name) + if obj_mod is not None: + mod = importlib.import_module(obj_mod) + return mod.__dict__[name] + + lazy_mod = _lazy_imports_mod.get(name) + if lazy_mod is not None: + return importlib.import_module(lazy_mod) + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + +def __dir__(): + """Add _lazy_imports_obj and _lazy_imports_mod to dir()""" + keys = (*globals().keys(), *_lazy_imports_obj.keys(), *_lazy_imports_mod.keys()) + return sorted(keys) + + +if TYPE_CHECKING or os.environ.get("OPENQDC_DISABLE_LAZY_LOADING", "0") == "1": + # These types are imported lazily at runtime, but we need to tell type + # checkers what they are. + from .ani import ANI1, ANI1CCX, ANI1X # noqa + from .comp6 import COMP6 # noqa + from .dess import DESS # noqa + from .dummy import Dummy # noqa + from .gdml import GDML # noqa + from .geom import GEOM # noqa + from .iso_17 import ISO17 # noqa + from .molecule3d import Molecule3D # noqa + from .nabladft import NablaDFT # noqa + from .orbnet_denali import OrbnetDenali # noqa + from .qm7x import QM7X # noqa + from .qmugs import QMugs # noqa + from .sn2_rxn import SN2RXN # noqa + from .solvated_peptides import SolvatedPeptides # noqa + from .spice import Spice # noqa + from .tmqm import TMQM # noqa + from .waterclusters3_30 import WaterClusters # noqa + + __all__ = [ + "ANI1", + "ANI1X", + "ANI1CCX", + "Spice", + "GEOM", + "QMugs", + "ISO17", + "COMP6", + "GDML", + "Molecule3D", + "OrbnetDenali", + "SN2RXN", + "QM7X", + "DESS", + "NablaDFT", + "SolvatedPeptides", + "WaterClusters", + "TMQM", + "Dummy", + ] diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 49f5c1e..49bd8fb 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -151,8 +151,8 @@ class ANI1X(ANI1): ] __force_methods__ = [ - "wb97x_6-31g(d)", - "wb97x_tz", + "wb97x/6-31g(d)", + 
"wb97x/cc-pvtz", ] def __init__(self, energy_unit=None, distance_unit=None) -> None: diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 957fd3c..b15f976 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -248,6 +248,7 @@ def __len__(self): return self.data["energies"].shape[0] def __getitem__(self, idx: int): + # if idx is more than len doesn t throw error p_start, p_end = self.data["position_idx_range"][idx] input = self.data["atomic_inputs"][p_start:p_end] z, c, positions, energies = ( diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index 0cba486..bca1d5b 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -28,8 +28,8 @@ class COMP6(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) # watchout that forces are stored as -grad(E) - __energy_unit__ = "hartree" - __distance_unit__ = "ang" + __energy_unit__ = "hartree" # kcal/mol now + __distance_unit__ = "ang" # bohr __forces_unit__ = "hartree/ang" __energy_methods__ = [ diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 827588b..9a488e4 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -35,6 +35,9 @@ def read_mol(mol_path, smiles, subset, targets): class DESS(BaseDataset): __name__ = "dess" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" __energy_methods__ = [ "mp2/cc-pvdz", "mp2/cc-pvqz", @@ -43,7 +46,7 @@ class DESS(BaseDataset): "ccsd(t)/cc-pvdz", "ccsd(t)/cbs", # cbs "ccsd(t)/nn", # nn - "sapt/aug-cc-pwcvxz", + "sapt0/aug-cc-pwcvxz", ] energy_target_names = [ diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index 9cbf03b..c9d3c0c 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -46,7 +46,7 @@ class ISO17(BaseDataset): ] __energy_unit__ = "ev" - __distance_unit__ = 
"ang" + __distance_unit__ = "ang" # bohr __forces_unit__ = "ev/ang" def __init__(self, energy_unit=None, distance_unit=None) -> None: diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index f9d5004..063f215 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -85,9 +85,9 @@ class Molecule3D(BaseDataset): __name__ = "molecule3d" __energy_methods__ = ["b3lyp/6-31g*"] # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY - __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" + __energy_unit__ = "ev" # CALCULATED + __distance_unit__ = "wrong unit . it is 1/bohr somehow" # 1/bohr wrong unit + __forces_unit__ = "ev/ang" energy_target_names = ["b3lyp/6-31g*.energy"] diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 3371d4b..b4941df 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -4,11 +4,11 @@ import datamol as dm import numpy as np -from nablaDFT.dataset import HamiltonianDatabase from tqdm import tqdm from openqdc.datasets.base import BaseDataset from openqdc.utils.constants import MAX_ATOMIC_NUMBER +from openqdc.utils.package_utils import requires_package def to_mol(entry) -> Dict[str, np.ndarray]: @@ -27,7 +27,10 @@ def to_mol(entry) -> Dict[str, np.ndarray]: return res +@requires_package("nablaDFT") def read_chunk_from_db(raw_path, start_idx, stop_idx, step_size=1000): + from nablaDFT.dataset import HamiltonianDatabase + print(f"Loading from {start_idx} to {stop_idx}") db = HamiltonianDatabase(raw_path) idxs = list(np.arange(start_idx, stop_idx)) @@ -59,8 +62,8 @@ class NablaDFT(BaseDataset): energy_target_names = ["wb97x-d/def2-svp"] __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" + __distance_unit__ = "bohr" # ANG?? 
+ __forces_unit__ = "hartree/bohr" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -68,7 +71,10 @@ class NablaDFT(BaseDataset): def __init__(self, energy_unit=None, distance_unit=None) -> None: super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) + @requires_package("nablaDFT") def read_raw_entries(self): + from nablaDFT.dataset import HamiltonianDatabase + raw_path = p_join(self.root, "dataset_full.db") train = HamiltonianDatabase(raw_path) n, c = len(train), 20 diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 824453e..0ea1171 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -54,7 +54,6 @@ class OrbnetDenali(BaseDataset): __name__ = "orbnet_denali" __energy_methods__ = ["wb97x-d3/def2-tzvp", "gfn1_xtb"] - # not sure probably Hartree ang -> must manually check energy_target_names = ["dft_energy", "xtb1_energy"] __energy_unit__ = "hartree" __distance_unit__ = "ang" diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index 77df4c1..3bc5f5a 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -53,7 +53,7 @@ class QMugs(BaseDataset): """ __name__ = "qmugs" - __energy_methods__ = ["gfn2_xtb", "b3lyp/6-31g*"] + __energy_methods__ = ["gfn2_xtb", "wb97x-d-D/def2-svp"] __energy_unit__ = "hartree" __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index af9aa8b..40f065b 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -49,8 +49,8 @@ class Spice(BaseDataset): """ __name__ = "spice" - __energy_methods__ = ["wb97m-d3bj/def2-tzvp"] - __force_methods__ = ["wb97m-d3bj/def2-tzvp"] + __energy_methods__ = ["wb97m-d3bj/def2-tzvppd"] + __force_methods__ = ["wb97m-d3bj/def2-tzvppd"] __energy_unit__ = "hartree" __distance_unit__ = "bohr" 
__forces_unit__ = "hartree/bohr" diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 0fbed07..2e1ae3e 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -3,6 +3,8 @@ ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" # Energy in atomic unit/ Hartree / Ang +# didn t calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS + class IsolatedAtomEnergyFactory: def __init__(self): @@ -23,12 +25,44 @@ def get(level_of_theory: str): functional_dict = ISOLATED_ATOM_ENERGIES.get(func, None) if functional_dict is None: logger.warning(f"Isolated atom energies not found for {level_of_theory}") + return ZEROS if not is_dft: return functional_dict - return functional_dict.get(basis, None) + return functional_dict.get(basis, ZEROS) -SPICE = { +ZEROS = { + ("Br", -1): 0.0, + ("Br", 0): 0.0, + ("C", -1): 0.0, + ("C", 0): 0.0, + ("C", 1): 0.0, + ("Ca", 2): 0.0, + ("Cl", -1): 0.0, + ("Cl", 0): 0.0, + ("F", -1): 0.0, + ("F", 0): 0.0, + ("H", 0): 0.0, + ("I", -1): 0.0, + ("I", 0): 0.0, + ("K", 1): 0.0, + ("Li", 1): 0.0, + ("Mg", 2): 0.0, + ("N", -1): 0.0, + ("N", 0): 0.0, + ("N", 1): 0.0, + ("Na", 1): 0.0, + ("O", -1): 0.0, + ("O", 0): 0.0, + ("O", 1): 0.0, + ("P", 0): 0.0, + ("P", 1): 0.0, + ("S", -1): 0.0, + ("S", 0): 0.0, + ("S", 1): 0.0, +} + +wb97m_d3bj_def2_tzvp = { ("Br", -1): -2574.2451510945853, ("Br", 0): -2574.1167240829964, ("C", -1): -37.91424135791358, @@ -247,7 +281,7 @@ def get(level_of_theory: str): ("I", 0): -297.5887657326151, } # "wb97m-d3bj/def2-TZVPPD" -wb97m_d3bj_def2_TZVPPD = { +SPICE = { ("H", 0): -0.4987605100487541, ("Li", 1): -7.285254714046117, ("B", -3): -24.191211616488623, @@ -1487,7 +1521,7 @@ def get(level_of_theory: str): "wb97m": { "def2-tzvp": COMP6_9, }, - "wb97m-d3bj": {"def2-tzvp": SPICE}, + "wb97m-d3bj": {"def2-tzvp": wb97m_d3bj_def2_tzvp, "def2-tzvppd": SPICE}, "tpssh": {"def2-tzvp": TMQM}, 
"revpbe-d3(bj)": {"def2-tzvp": SolvatedPeptides}, "dsd-blyp-d3(bj)": {"def2-tzvp": SN2RXN}, @@ -1535,7 +1569,7 @@ def get(level_of_theory: str): "cbs": mp2aug, }, # SAPT0 - "sapt": { + "sapt0": { "aug-cc-pwcvxz": None, # DOESNT MAKE SENSE }, # SEMI EMPIRICAL diff --git a/src/openqdc/utils/package_utils.py b/src/openqdc/utils/package_utils.py new file mode 100644 index 0000000..c7b8aac --- /dev/null +++ b/src/openqdc/utils/package_utils.py @@ -0,0 +1,130 @@ +import importlib +from functools import wraps +from typing import Any, Callable, TypeVar + +F = TypeVar("F", bound=Callable[..., Any]) + + +class MissingOptionalDependencyError(BaseException): + """ + An exception raised when an optional dependency is required + but cannot be found. + + Attributes + ---------- + library_name + The name of the missing library. + """ + + def __init__(self, library_name: str): + """ + + Parameters + ---------- + library_name + The name of the missing library. + license_issue + Whether the library was importable but was unusable due + to a missing license. + """ + + message = f"The required {library_name} module could not be imported." + + super(MissingOptionalDependencyError, self).__init__(message) + + self.library_name = library_name + + +def has_package(package_name: str) -> bool: + """ + Helper function to generically check if a Python package is installed. + Intended to be used to check for optional dependencies. 
+ + Parameters + ---------- + package_name : str + The name of the Python package to check the availability of + + Returns + ------- + package_available : bool + Boolean indicator if the package is available or not + + Examples + -------- + >>> has_numpy = has_package('numpy') + >>> has_numpy + True + >>> has_foo = has_package('other_non_installed_package') + >>> has_foo + False + """ + try: + importlib.import_module(package_name) + except ModuleNotFoundError: + return False + return True + + +def requires_package(package_name: str) -> Callable[..., Any]: + """ + Helper function to denote that a function requires some optional + dependency. A function decorated with this decorator will raise + `MissingOptionalDependencyError` if the package is not found by + `importlib.import_module()`. + + Parameters + ---------- + package_name : str + The name of the module to be imported. + + Raises + ------ + MissingOptionalDependencyError + + """ + + def inner_decorator(function: F) -> F: + @wraps(function) + def wrapper(*args, **kwargs): + import importlib + + try: + importlib.import_module(package_name) + except ImportError: + raise MissingOptionalDependencyError(library_name=package_name) + except Exception as e: + raise e + + return function(*args, **kwargs) + + return wrapper + + return inner_decorator + + +def get_dir(): + r""" + Get the Torch Hub cache directory used for storing downloaded models & weights. + + If :func:`~torch.hub.set_dir` is not called, default path is ``$TORCH_HOME/hub`` where + environment variable ``$TORCH_HOME`` defaults to ``$XDG_CACHE_HOME/torch``. + ``$XDG_CACHE_HOME`` follows the X Design Group specification of the Linux + filesystem layout, with a default value ``~/.cache`` if the environment + variable is not set. + """ + + if _hub_dir is not None: + return _hub_dir + # return os.path.join(_get_torch_home(), 'hub') + + +def set_dir(d): + r""" + Optionally set the Torch Hub directory used to save downloaded models & weights.
+ + Args: + d (str): path to a local folder to save downloaded models & weights. + """ + global _hub_dir + # _hub_dir = os.path.expanduser(d) From 41c871ae4757cee67206add4c76925b36699323e Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 26 Oct 2023 19:31:00 -0400 Subject: [PATCH 07/20] Cache selection --- src/openqdc/__init__.py | 4 +--- src/openqdc/datasets/ani.py | 9 --------- src/openqdc/datasets/base.py | 16 +++++++++++----- src/openqdc/datasets/comp6.py | 3 --- src/openqdc/datasets/dess.py | 3 --- src/openqdc/datasets/dummy.py | 4 ++-- src/openqdc/datasets/gdml.py | 3 --- src/openqdc/datasets/geom.py | 3 --- src/openqdc/datasets/iso_17.py | 3 --- src/openqdc/datasets/molecule3d.py | 3 --- src/openqdc/datasets/nabladft.py | 3 --- src/openqdc/datasets/orbnet_denali.py | 3 --- src/openqdc/datasets/pcqm.py | 3 --- src/openqdc/datasets/qmugs.py | 3 --- src/openqdc/datasets/sn2_rxn.py | 3 --- src/openqdc/datasets/solvated_peptides.py | 3 --- src/openqdc/datasets/spice.py | 3 --- src/openqdc/datasets/tmqm.py | 3 --- src/openqdc/datasets/waterclusters3_30.py | 3 --- src/openqdc/utils/__init__.py | 8 ++++++++ src/openqdc/utils/io.py | 17 ++++++++++++++++- 21 files changed, 38 insertions(+), 65 deletions(-) diff --git a/src/openqdc/__init__.py b/src/openqdc/__init__.py index a5c0555..1432923 100644 --- a/src/openqdc/__init__.py +++ b/src/openqdc/__init__.py @@ -9,9 +9,7 @@ _lazy_imports_obj = {} -_lazy_imports_mod = { - "datasets": "openqdc.datamodule", -} +_lazy_imports_mod = {"datasets": "openqdc.datamodule", "utils": "openqdc.utils"} def __getattr__(name): diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 49bd8fb..1c823ab 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -41,9 +41,6 @@ class ANI1(BaseDataset): __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - 
@property def root(self): return p_join(get_local_cache(), "ani") @@ -98,9 +95,6 @@ class ANI1CCX(ANI1): __force_methods__ = [] force_target_names = [] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - class ANI1X(ANI1): """ @@ -155,9 +149,6 @@ class ANI1X(ANI1): "wb97x/cc-pvtz", ] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - if __name__ == "__main__": for data_class in [ diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index b15f976..a375872 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -17,6 +17,7 @@ load_hdf5_file, pull_locally, push_remote, + set_cache_dir, ) from openqdc.utils.molecule import atom_table from openqdc.utils.units import get_conversion @@ -78,7 +79,10 @@ class BaseDataset(torch.utils.data.Dataset): __fn_distance__ = lambda x: x __fn_forces__ = lambda x: x - def __init__(self, energy_unit=None, distance_unit=None) -> None: + def __init__( + self, energy_unit: Optional[str] = None, distance_unit: Optional[str] = None, cache_dir: Optional[str] = None + ) -> None: + set_cache_dir(cache_dir) self.data = None self._set_units(energy_unit, distance_unit) if not self.is_preprocessed(): @@ -86,16 +90,12 @@ def __init__(self, energy_unit=None, distance_unit=None) -> None: res = self.collate_list(entries) self.save_preprocess(res) self.read_preprocess() - self.compute_properties() self.__isolated_atom_energies__ = ( [IsolatedAtomEnergyFactory.get(en_method) for en_method in self.__energy_methods__] if self.__energy_methods__ else None ) - def compute_properties(): - pass - @property def energy_unit(self): return self.__energy_unit__ @@ -275,3 +275,9 @@ def __getitem__(self, idx: int): subset=subset, forces=forces, ) + + def __str__(self): + return f"{self.__name__}" + + def __repr__(self): + return 
f"{self.__name__}" diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index bca1d5b..c63dfeb 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -64,9 +64,6 @@ class COMP6(BaseDataset): "Gradient", ] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): samples = [] for subset in ["ani_md", "drugbank", "gdb7_9", "gdb10_13", "s66x8", "tripeptides"]: diff --git a/src/openqdc/datasets/dess.py b/src/openqdc/datasets/dess.py index 9a488e4..59f8d8e 100644 --- a/src/openqdc/datasets/dess.py +++ b/src/openqdc/datasets/dess.py @@ -66,9 +66,6 @@ class DESS(BaseDataset): partitions = ["DES370K", "DES5M"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def _read_raw_(self, part): df = pd.read_csv(p_join(self.root, f"{part}.csv")) for col in self.energy_target_names: diff --git a/src/openqdc/datasets/dummy.py b/src/openqdc/datasets/dummy.py index d3767fe..4e1ff17 100644 --- a/src/openqdc/datasets/dummy.py +++ b/src/openqdc/datasets/dummy.py @@ -20,9 +20,9 @@ class Dummy(BaseDataset): force_target_names = ["forces"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: + def __init__(self, energy_unit=None, distance_unit=None, cache_dir=None) -> None: try: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) + super().__init__(energy_unit=energy_unit, distance_unit=distance_unit, cache_dir=cache_dir) except: # noqa pass diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index 01ed166..e8aea01 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -67,9 +67,6 @@ class GDML(BaseDataset): __distance_unit__ = "ang" __forces_unit__ = "kcal/mol/ang" - def __init__(self, energy_unit=None, distance_unit=None) -> None: - 
super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "gdml.h5") samples = read_qc_archive_h5(raw_path, "gdml", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/geom.py b/src/openqdc/datasets/geom.py index 309352a..cc2e06b 100644 --- a/src/openqdc/datasets/geom.py +++ b/src/openqdc/datasets/geom.py @@ -91,9 +91,6 @@ class GEOM(BaseDataset): partitions = ["qm9", "drugs"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def _read_raw_(self, partition): raw_path = p_join(self.root, "rdkit_folder") diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index c9d3c0c..b647f43 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -49,9 +49,6 @@ class ISO17(BaseDataset): __distance_unit__ = "ang" # bohr __forces_unit__ = "ev/ang" - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "iso_17.h5") samples = read_qc_archive_h5(raw_path, "iso_17", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index 063f215..ba887b2 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -94,9 +94,6 @@ class Molecule3D(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw = p_join(self.root, "data", "raw") sdf_paths = glob(p_join(raw, "*.sdf")) diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 
b4941df..1271fc9 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -68,9 +68,6 @@ class NablaDFT(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - @requires_package("nablaDFT") def read_raw_entries(self): from nablaDFT.dataset import HamiltonianDatabase diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 0ea1171..aa5a7e3 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -62,9 +62,6 @@ class OrbnetDenali(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): label_path = p_join(self.root, "denali_labels.csv") df = pd.read_csv(label_path, usecols=["sample_id", "mol_id", "subset", "dft_energy", "xtb1_energy"]) diff --git a/src/openqdc/datasets/pcqm.py b/src/openqdc/datasets/pcqm.py index 87acf44..8bc93b2 100644 --- a/src/openqdc/datasets/pcqm.py +++ b/src/openqdc/datasets/pcqm.py @@ -68,9 +68,6 @@ class PubchemQC(BaseDataset): partitions = ["b3lyp", "pm6"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def _read_raw_(self, part): arxiv_paths = glob(p_join(self.root, f"{part}", "*.tar.gz")) print(len(arxiv_paths)) diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index 3bc5f5a..94491da 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -66,9 +66,6 @@ class QMugs(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = 
np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "structures") mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)] diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index f3977cc..00427b1 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -31,9 +31,6 @@ class SN2RXN(BaseDataset): "DSD-BLYP-D3(BJ):def2-TZVP Gradient", ] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "sn2_rxn.h5") samples = read_qc_archive_h5(raw_path, "sn2_rxn", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/solvated_peptides.py b/src/openqdc/datasets/solvated_peptides.py index cfd9fe9..07047f6 100644 --- a/src/openqdc/datasets/solvated_peptides.py +++ b/src/openqdc/datasets/solvated_peptides.py @@ -33,9 +33,6 @@ class SolvatedPeptides(BaseDataset): __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "solvated_peptides.h5") samples = read_qc_archive_h5(raw_path, "solvated_peptides", self.energy_target_names, self.force_target_names) diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 40f065b..99405d4 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -96,9 +96,6 @@ class Spice(BaseDataset): "SPICE Ion Pairs Single Points Dataset v1.1": "Ion Pairs", } - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, 
distance_unit=distance_unit) - def read_raw_entries(self): raw_path = p_join(self.root, "SPICE-1.1.4.hdf5") diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 600113b..b0860f7 100644 --- a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -60,9 +60,6 @@ class TMQM(BaseDataset): __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - def __init__(self) -> None: - super().__init__() - def read_raw_entries(self): df = pd.read_csv(p_join(self.root, "tmQM_y.csv"), sep=";", usecols=["CSD_code", "Electronic_E"]) e_map = dict(zip(df["CSD_code"], df["Electronic_E"])) diff --git a/src/openqdc/datasets/waterclusters3_30.py b/src/openqdc/datasets/waterclusters3_30.py index 62e1d4a..94aeff1 100644 --- a/src/openqdc/datasets/waterclusters3_30.py +++ b/src/openqdc/datasets/waterclusters3_30.py @@ -60,9 +60,6 @@ class WaterClusters(BaseDataset): __energy_methods__ = ["ttm2.1-f"] energy_target_names = ["TTM2.1-F Potential"] - def __init__(self, energy_unit=None, distance_unit=None) -> None: - super().__init__(energy_unit=energy_unit, distance_unit=distance_unit) - def read_raw_entries(self): samples = [] for i in range(3, 31): diff --git a/src/openqdc/utils/__init__.py b/src/openqdc/utils/__init__.py index 92eec25..aeb5321 100644 --- a/src/openqdc/utils/__init__.py +++ b/src/openqdc/utils/__init__.py @@ -1,13 +1,17 @@ from .io import ( check_file, create_hdf5_file, + get_local_cache, + get_remote_cache, load_hdf5_file, load_json, load_pkl, load_torch, makedirs, save_pkl, + set_cache_dir, ) +from .units import get_conversion __all__ = [ "load_pkl", @@ -18,4 +22,8 @@ "load_torch", "create_hdf5_file", "check_file", + "set_cache_dir", + "get_local_cache", + "get_remote_cache", + "get_conversion", ] diff --git a/src/openqdc/utils/io.py b/src/openqdc/utils/io.py index 0a5f7c5..0dd2486 100644 --- a/src/openqdc/utils/io.py +++ b/src/openqdc/utils/io.py @@ -13,9 +13,24 @@ gcp_filesys = fsspec.filesystem("gs") local_filesys = LocalFileSystem() 
+_OPENQDC_CACHE_DIR = "~/.cache/openqdc" + + +def set_cache_dir(d): + r""" + Optionally set the _OPENQDC_CACHE_DIR directory. + + Args: + d (str): path to a local folder. + """ + if d is None: + return + global _OPENQDC_CACHE_DIR + _OPENQDC_CACHE_DIR = os.path.expanduser(d) + def get_local_cache(): - cache_dir = os.path.expanduser(os.path.expandvars("~/.cache/openqdc")) + cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR)) os.makedirs(cache_dir, exist_ok=True) return cache_dir From ebf94647750ae4f2e67cb5790ed2982ce1174b71 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Fri, 27 Oct 2023 15:16:35 -0400 Subject: [PATCH 08/20] Fixed units + cache + utils to write extxyz --- src/openqdc/datasets/ani.py | 13 +++++++++++-- src/openqdc/datasets/base.py | 24 ++++++++++++++---------- src/openqdc/datasets/comp6.py | 6 +++--- src/openqdc/datasets/gdml.py | 4 ++-- src/openqdc/datasets/iso_17.py | 4 ++-- src/openqdc/datasets/molecule3d.py | 2 +- src/openqdc/datasets/nabladft.py | 2 +- src/openqdc/datasets/sn2_rxn.py | 4 ++-- src/openqdc/datasets/spice.py | 3 +++ src/openqdc/utils/io.py | 7 +++++++ 10 files changed, 46 insertions(+), 23 deletions(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 1c823ab..9a70f5c 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -38,8 +38,8 @@ class ANI1(BaseDataset): "ωB97x:6-31G(d) Energy", ] __energy_unit__ = "hartree" - __distance_unit__ = "ang" - __forces_unit__ = "hartree/ang" + __distance_unit__ = "bohr" + __forces_unit__ = "hartree/bohr" @property def root(self): @@ -74,6 +74,9 @@ class ANI1CCX(ANI1): """ __name__ = "ani1ccx" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -113,6 +116,9 @@ class ANI1X(ANI1): """ __name__ = "ani1x" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = 
"hartree/ang" # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) @@ -149,6 +155,9 @@ class ANI1X(ANI1): "wb97x/cc-pvtz", ] + def convert_forces(self, x): + return super().convert_forces(x) * 0.529177249 # correct the Dataset error + if __name__ == "__main__": for data_class in [ diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index a375872..4f273e4 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd import torch +from ase.io.extxyz import write_extxyz from loguru import logger from sklearn.utils import Bunch from tqdm import tqdm @@ -13,6 +14,7 @@ from openqdc.utils.constants import NB_ATOMIC_FEATURES from openqdc.utils.io import ( copy_exists, + dict_to_atoms, get_local_cache, load_hdf5_file, pull_locally, @@ -70,7 +72,6 @@ class BaseDataset(torch.utils.data.Dataset): energy_target_names = [] force_target_names = [] __isolated_atom_energies__ = [] - # convert force gradient -1 __energy_unit__ = "hartree" __distance_unit__ = "ang" @@ -188,11 +189,6 @@ def collate_list(self, list_entries): def save_preprocess(self, data_dict): # save memmaps logger.info("Preprocessing data and saving it to cache.") - logger.info( - f"Dataset {self.__name__} data with the following units:\n" - f"Energy: {self.energy_unit}, Distance: {self.distance_unit}, " - f"Forces: {self.force_unit if self.__force_methods__ else 'None'}" - ) for key in self.data_keys: local_path = p_join(self.preprocess_path, f"{key}.mmap") out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape) @@ -211,10 +207,10 @@ def save_preprocess(self, data_dict): def read_preprocess(self): logger.info("Reading preprocessed data") logger.info( - f"{self.__name__} data with the following units:\ - Energy: {self.energy_unit},\ - Distance: {self.distance_unit},\ - Forces: {self.force_unit}" + f"{self.__name__} data with the 
following units:\n\ + Energy: {self.energy_unit},\n\ + Distance: {self.distance_unit},\n\ + Forces: {self.force_unit if self.__force_methods__ else 'None'}" ) self.data = {} for key in self.data_keys: @@ -244,6 +240,14 @@ def is_preprocessed(self): predicats += [copy_exists(p_join(self.preprocess_path, f"{x}.npz")) for x in ["name", "subset"]] return all(predicats) + def save_xyz(self, idx: int, path: Optional[str] = None): + if path is None: + path = os.getcwd() + entry = self[idx] + name = entry.pop("name").decode() + at = dict_to_atoms(entry) + write_extxyz(p_join(path, f"{name}.xyz"), at) + def __len__(self): return self.data["energies"].shape[0] diff --git a/src/openqdc/datasets/comp6.py b/src/openqdc/datasets/comp6.py index c63dfeb..e596e13 100644 --- a/src/openqdc/datasets/comp6.py +++ b/src/openqdc/datasets/comp6.py @@ -28,9 +28,9 @@ class COMP6(BaseDataset): # Energy in hartree, all zeros by default atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) # watchout that forces are stored as -grad(E) - __energy_unit__ = "hartree" # kcal/mol now - __distance_unit__ = "ang" # bohr - __forces_unit__ = "hartree/ang" + __energy_unit__ = "kcal/mol" + __distance_unit__ = "bohr" # bohr + __forces_unit__ = "kcal/mol/bohr" __energy_methods__ = [ "wb97x/6-31g*", diff --git a/src/openqdc/datasets/gdml.py b/src/openqdc/datasets/gdml.py index e8aea01..1457890 100644 --- a/src/openqdc/datasets/gdml.py +++ b/src/openqdc/datasets/gdml.py @@ -64,8 +64,8 @@ class GDML(BaseDataset): ] __energy_unit__ = "kcal/mol" - __distance_unit__ = "ang" - __forces_unit__ = "kcal/mol/ang" + __distance_unit__ = "bohr" + __forces_unit__ = "kcal/mol/bohr" def read_raw_entries(self): raw_path = p_join(self.root, "gdml.h5") diff --git a/src/openqdc/datasets/iso_17.py b/src/openqdc/datasets/iso_17.py index b647f43..748d8e9 100644 --- a/src/openqdc/datasets/iso_17.py +++ b/src/openqdc/datasets/iso_17.py @@ -46,8 +46,8 @@ class ISO17(BaseDataset): ] __energy_unit__ = "ev" - 
__distance_unit__ = "ang" # bohr - __forces_unit__ = "ev/ang" + __distance_unit__ = "bohr" # bohr + __forces_unit__ = "ev/bohr" def read_raw_entries(self): raw_path = p_join(self.root, "iso_17.h5") diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index ba887b2..a376000 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -86,7 +86,7 @@ class Molecule3D(BaseDataset): __energy_methods__ = ["b3lyp/6-31g*"] # UNITS MOST LIKELY WRONG, MUST CHECK THEM MANUALLY __energy_unit__ = "ev" # CALCULATED - __distance_unit__ = "wrong unit . it is 1/bohr somehow" # 1/bohr wrong unit + __distance_unit__ = "ang" __forces_unit__ = "ev/ang" energy_target_names = ["b3lyp/6-31g*.energy"] diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 1271fc9..2d9d7f0 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -62,7 +62,7 @@ class NablaDFT(BaseDataset): energy_target_names = ["wb97x-d/def2-svp"] __energy_unit__ = "hartree" - __distance_unit__ = "bohr" # ANG?? 
+ __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" # Energy in hartree, all zeros by default diff --git a/src/openqdc/datasets/sn2_rxn.py b/src/openqdc/datasets/sn2_rxn.py index 00427b1..02ee05d 100644 --- a/src/openqdc/datasets/sn2_rxn.py +++ b/src/openqdc/datasets/sn2_rxn.py @@ -16,8 +16,8 @@ class SN2RXN(BaseDataset): "dsd-blyp-d3(bj)/def2-tzvp", ] __energy_unit__ = "ev" - __distance_unit__ = "ang" - __forces_unit__ = "ev/ang" + __distance_unit__ = "bohr" + __forces_unit__ = "ev/bohr" energy_target_names = [ "DSD-BLYP-D3(BJ):def2-TZVP Atomization Energy", diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 99405d4..44abcd9 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -96,6 +96,9 @@ class Spice(BaseDataset): "SPICE Ion Pairs Single Points Dataset v1.1": "Ion Pairs", } + def convert_forces(self, x): + return (-1.0) * super().convert_forces(x) + def read_raw_entries(self): raw_path = p_join(self.root, "SPICE-1.1.4.hdf5") diff --git a/src/openqdc/utils/io.py b/src/openqdc/utils/io.py index 0dd2486..a4ad984 100644 --- a/src/openqdc/utils/io.py +++ b/src/openqdc/utils/io.py @@ -6,6 +6,7 @@ import fsspec import h5py import torch +from ase.atoms import Atoms from fsspec.implementations.local import LocalFileSystem from gcsfs import GCSFileSystem from rdkit.Chem import MolFromXYZFile @@ -165,3 +166,9 @@ def load_json(path): def load_xyz(path): return MolFromXYZFile(path) + + +def dict_to_atoms(d: dict): + pos, atomic_numbers, charges = d.pop("positions"), d.pop("atomic_numbers"), d.pop("charges") + at = Atoms(positions=pos, numbers=atomic_numbers, charges=charges, info=d) + return at From 928716b528216837d9d5e31f48029cf63ccca89e Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 30 Oct 2023 09:12:30 -0400 Subject: [PATCH 09/20] e0 new routine --- src/openqdc/datasets/base.py | 9 +- src/openqdc/datasets/molecule3d.py | 4 - src/openqdc/datasets/nabladft.py | 4 - src/openqdc/datasets/orbnet_denali.py 
| 4 - src/openqdc/datasets/qm7x.py | 4 - src/openqdc/datasets/qmugs.py | 4 - src/openqdc/datasets/spice.py | 23 ---- src/openqdc/datasets/tmqm.py | 4 - src/openqdc/utils/atomization_energies.py | 127 ++++++++++++++++++++++ 9 files changed, 133 insertions(+), 50 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 7e7a834..787f308 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -98,7 +98,7 @@ def __init__( else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) self.__isolated_atom_energies__ = ( - [IsolatedAtomEnergyFactory.get(en_method) for en_method in self.__energy_methods__] + [IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__] if self.__energy_methods__ else None ) @@ -265,6 +265,7 @@ def __len__(self): def __getitem__(self, idx: int): # if idx is more than len doesn t throw error + shift = IsolatedAtomEnergyFactory.max_charge p_start, p_end = self.data["position_idx_range"][idx] input = self.data["atomic_inputs"][p_start:p_end] z, c, positions, energies = ( @@ -280,12 +281,14 @@ def __getitem__(self, idx: int): forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) else: forces = None - + isolated_atom_energies = [ + get_conversion("hartree", self.__energy_unit__)(x[z, c + shift]) for x in self.__isolated_atom_energies__ + ] return Bunch( positions=positions, atomic_numbers=z, charges=c, - e0=self.convert_energy(self.atomic_energies[z]), + e0=isolated_atom_energies, energies=energies, name=name, subset=subset, diff --git a/src/openqdc/datasets/molecule3d.py b/src/openqdc/datasets/molecule3d.py index cdeeb89..dc47e53 100644 --- a/src/openqdc/datasets/molecule3d.py +++ b/src/openqdc/datasets/molecule3d.py @@ -9,7 +9,6 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import 
get_atomic_number_and_charge @@ -91,9 +90,6 @@ class Molecule3D(BaseDataset): energy_target_names = ["b3lyp/6-31g*.energy"] - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def read_raw_entries(self): raw = p_join(self.root, "data", "raw") sdf_paths = glob(p_join(raw, "*.sdf")) diff --git a/src/openqdc/datasets/nabladft.py b/src/openqdc/datasets/nabladft.py index 3f6b8fc..e7d9eb8 100644 --- a/src/openqdc/datasets/nabladft.py +++ b/src/openqdc/datasets/nabladft.py @@ -7,7 +7,6 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.package_utils import requires_package @@ -65,9 +64,6 @@ class NablaDFT(BaseDataset): __distance_unit__ = "bohr" __forces_unit__ = "hartree/bohr" - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - @requires_package("nablaDFT") def read_raw_entries(self): from nablaDFT.dataset import HamiltonianDatabase diff --git a/src/openqdc/datasets/orbnet_denali.py b/src/openqdc/datasets/orbnet_denali.py index 3ea36bf..614e252 100644 --- a/src/openqdc/datasets/orbnet_denali.py +++ b/src/openqdc/datasets/orbnet_denali.py @@ -6,7 +6,6 @@ import pandas as pd from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import atom_table @@ -59,9 +58,6 @@ class OrbnetDenali(BaseDataset): __distance_unit__ = "ang" __forces_unit__ = "hartree/ang" - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def read_raw_entries(self): label_path = p_join(self.root, "denali_labels.csv") df = pd.read_csv(label_path, usecols=["sample_id", "mol_id", "subset", "dft_energy", "xtb1_energy"]) diff --git a/src/openqdc/datasets/qm7x.py b/src/openqdc/datasets/qm7x.py index cf22738..eb8b015 100644 --- a/src/openqdc/datasets/qm7x.py +++ 
b/src/openqdc/datasets/qm7x.py @@ -4,7 +4,6 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.io import load_hdf5_file @@ -36,9 +35,6 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names): class QM7X(BaseDataset): __name__ = "qm7x" - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = ["pbe0/mbd", "dft3b"] energy_target_names = ["ePBE0", "eMBD"] diff --git a/src/openqdc/datasets/qmugs.py b/src/openqdc/datasets/qmugs.py index e79f31b..c75f8b5 100644 --- a/src/openqdc/datasets/qmugs.py +++ b/src/openqdc/datasets/qmugs.py @@ -6,7 +6,6 @@ import numpy as np from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import get_atomic_number_and_charge @@ -63,9 +62,6 @@ class QMugs(BaseDataset): "DFT:TOTAL_ENERGY", ] - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - def read_raw_entries(self): raw_path = p_join(self.root, "structures") mol_dirs = [p_join(raw_path, d) for d in os.listdir(raw_path)] diff --git a/src/openqdc/datasets/spice.py b/src/openqdc/datasets/spice.py index 7ff4617..974d45f 100644 --- a/src/openqdc/datasets/spice.py +++ b/src/openqdc/datasets/spice.py @@ -6,7 +6,6 @@ from openqdc.datasets.base import BaseDataset from openqdc.utils import load_hdf5_file -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import get_atomic_number_and_charge @@ -61,28 +60,6 @@ class Spice(BaseDataset): force_target_names = ["dft_total_gradient"] - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - tmp = { - 35: -2574.2451510945853, - 6: -37.91424135791358, - 20: -676.9528465198214, - 17: -460.3350243496703, - 9: -99.91298732343974, - 1: 
-0.5027370838721259, - 53: -297.8813829975981, - 19: -599.8025677513111, - 3: -7.285254714046546, - 12: -199.2688420040449, - 7: -54.62327513368922, - 11: -162.11366478783253, - 8: -75.17101657391741, - 15: -341.3059197024934, - 16: -398.2405387031612, - } - for key in tmp: - atomic_energies[key] = tmp[key] - subset_mapping = { "SPICE Solvated Amino Acids Single Points Dataset v1.1": "Solvated Amino Acids", "SPICE Dipeptides Single Points Dataset v1.2": "Dipeptides", diff --git a/src/openqdc/datasets/tmqm.py b/src/openqdc/datasets/tmqm.py index 5f70ff4..8952aaa 100644 --- a/src/openqdc/datasets/tmqm.py +++ b/src/openqdc/datasets/tmqm.py @@ -6,7 +6,6 @@ from tqdm import tqdm from openqdc.datasets.base import BaseDataset -from openqdc.utils.constants import MAX_ATOMIC_NUMBER from openqdc.utils.molecule import atom_table @@ -48,9 +47,6 @@ def read_xyz(fname, e_map): class TMQM(BaseDataset): __name__ = "tmqm" - # Energy in hartree, all zeros by default - atomic_energies = np.zeros((MAX_ATOMIC_NUMBER,), dtype=np.float32) - __energy_methods__ = ["tpssh/def2-tzvp"] energy_target_names = ["TPSSh/def2TZVP level"] diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 2e1ae3e..ba84908 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -1,12 +1,128 @@ +import numpy as np from loguru import logger +from openqdc.utils.constants import MAX_ATOMIC_NUMBER + ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" # Energy in atomic unit/ Hartree / Ang # didn t calculate for Pd, Pt, Mo, Ni, Fe, Cu, see DESS +atomic_numbers = {} +chemical_symbols = [ + "X", + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + 
"Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + "Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Po", + "At", + "Rn", + "Fr", + "Ra", + "Ac", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Am", + "Cm", + "Bk", + "Cf", + "Es", + "Fm", + "Md", + "No", + "Lr", +] + + +for Z, symbol in enumerate(chemical_symbols): + atomic_numbers[symbol] = Z class IsolatedAtomEnergyFactory: + max_charge = 4 + def __init__(self): pass @@ -30,6 +146,17 @@ def get(level_of_theory: str): return functional_dict return functional_dict.get(basis, ZEROS) + @staticmethod + def get_matrix(level_of_theory: str): + shift = IsolatedAtomEnergyFactory.max_charge + matrix = np.zeros((MAX_ATOMIC_NUMBER, shift * 2 + 1)) + tuple_hashmap = IsolatedAtomEnergyFactory.get(level_of_theory) + if tuple_hashmap is None: + return matrix + for key in tuple_hashmap.keys(): + matrix[atomic_numbers[key[0]], key[1] + shift] = tuple_hashmap[key] + return matrix + ZEROS = { ("Br", -1): 0.0, From ae4f4828f5cad3bbeb54f5a7d7a6804967a0e8ca Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 30 Oct 2023 10:15:46 -0400 Subject: [PATCH 10/20] Increased chem space --- src/openqdc/utils/atomization_energies.py | 186 +++++++++++++++++----- 1 file changed, 144 insertions(+), 42 deletions(-) diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index ba84908..b534fc2 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -196,11 +196,15 @@ def get_matrix(level_of_theory: str): ("C", 0): -37.87264507233593, ("C", 1): -37.45349214963933, ("Ca", 2): -676.9528465198214, + ("Cl", -2): -459.6072967078548, ("Cl", -1): -460.3350243496703, ("Cl", 0): -460.1988762285739, + ("Cl", 
2): -458.7433813454319, ("F", -1): -99.91298732343974, ("F", 0): -99.78611622985483, + ("H", -1): -0.5027370838721212, ("H", 0): -0.4987605100487341, + ("H", 1): 0.0, ("I", -1): -297.8813829975981, ("I", 0): -297.76228914445625, ("K", 1): -599.8025677513111, @@ -220,7 +224,9 @@ def get_matrix(level_of_theory: str): ("S", 1): -397.7746615977658, } GFN1 = { + ("H", -1): -0.5678094489236601, ("H", 0): -0.4014294744618301, + ("H", 1): 0.2350495, ("Li", 1): 0.13691666666666666, ("B", -3): -1.652343221335327, ("B", -1): -1.3514075648859643, @@ -247,8 +253,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -3.761566793286506, ("S", 0): -3.535920743315634, ("S", 1): -2.772567335542398, + ("Cl", -2): -4.177925186599567, ("Cl", -1): -4.527948236258716, ("Cl", 0): -4.166353944016668, + ("Cl", 2): -2.3809951798365505, ("K", 1): 0.08160976666666667, ("Ca", 2): 0.5662308, ("Br", -1): -3.957113536482028, @@ -257,7 +265,9 @@ def get_matrix(level_of_theory: str): ("I", 0): -3.885757275227844, } GFN2 = { + ("H", -1): -0.6107466928548624, ("H", 0): -0.3934827590437188, + ("H", 1): 0.22955216666666667, ("Li", 1): 0.1659637, ("B", -3): 0.4947743711421284, ("B", -1): -0.8833252789733281, @@ -284,7 +294,9 @@ def get_matrix(level_of_theory: str): ("S", -1): -3.4046900452338025, ("S", 0): -3.1482710158768508, ("S", 1): -2.5869831371080387, + ("Cl", -2): -4.249780801412338, ("Cl", -1): -4.785133953760966, + ("Cl", 2): -2.6084223252074965, ("Cl", 0): -4.482525134292114, ("K", 1): 0.19157049999999998, ("Ca", 2): 1.1759288, @@ -294,7 +306,9 @@ def get_matrix(level_of_theory: str): ("I", 0): -3.7796302627467933, } DFTB = { + ("H", -1): -0.267450800, ("H", 0): -0.2386004000, + ("H", 1): 0.2097500000, ("Li", 1): 0.000000000, ("B", -3): 0.1087536003, ("B", -1): -0.8108828001, @@ -321,8 +335,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -2.3857500900, ("S", 0): -2.2921235603, ("S", 1): -1.8696970300, + ("Cl", -2): -3.31200000, ("Cl", -1): -3.2238180000, ("Cl", 0): -3.0908230002, + 
("Cl", 2): -1.7244330000, ("K", 1): 0.0678210000, ("Ca", 2): 0.3528980000, ("Br", -1): -3.0478250000, @@ -331,7 +347,9 @@ def get_matrix(level_of_theory: str): ("I", 0): -2.5796080002, } PM6 = { + ("H", -1): 0.20069130482, ("H", 0): 0.08302988483033709, + ("H", 1): 0.49634827548, ("Li", 1): 0.23429648020984556, ("B", -3): 1.042845967149475, ("B", -1): 0.2915413006028599, @@ -358,7 +376,9 @@ def get_matrix(level_of_theory: str): ("S", -1): 0.00773920374050412, ("S", 0): 0.15340740929612162, ("S", 1): 0.5198027279290017, + ("Cl", -2): 3.87282505908, ("Cl", -1): -0.09598933242391743, + ("Cl", 2): 1.6530454862, ("Cl", 0): 0.04614458119325779, ("K", 1): 0.17382321209735638, ("Ca", 2): 0.6490542924483952, @@ -371,7 +391,9 @@ def get_matrix(level_of_theory: str): # tpssh/def2-tzvp TMQM = { + ("H", -1): -0.5066148831768739, ("H", 0): -0.4998936035891093, + ("H", 1): 0.0, ("Li", 1): -7.285942861425713, ("B", -3): -24.011884397333016, ("B", -1): -24.671478908940745, @@ -398,8 +420,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.19525449701325, ("S", 0): -398.130358877624, ("S", 1): -397.7467993687058, + ("Cl", -2): -459.4908872312368, ("Cl", -1): -460.28412127843484, ("Cl", 0): -460.1641720279233, + ("Cl", 2): -458.485405333257, ("K", 1): -599.7644436257333, ("Ca", 2): -676.9154959968483, ("Br", -1): -2574.1448096288846, @@ -409,7 +433,9 @@ def get_matrix(level_of_theory: str): } # "wb97m-d3bj/def2-TZVPPD" SPICE = { + ("H", -1): -0.5027370838426788, ("H", 0): -0.4987605100487541, + ("H", 1): 0.0, ("Li", 1): -7.285254714046117, ("B", -3): -24.191211616488623, ("B", -1): -24.677421752607636, @@ -436,8 +462,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.24053870171247, ("S", 0): -398.15996366615616, ("S", 1): -397.7746615960709, + ("Cl", -2): -460.08763805127313, ("Cl", -1): -460.33502435018204, ("Cl", 0): -460.1988762286936, + ("Cl", 2): -458.7438528011782, ("K", 1): -599.8025677532396, ("Ca", 2): -676.9528465165403, ("Br", -1): -2574.2451510820465, @@ 
-447,7 +475,9 @@ def get_matrix(level_of_theory: str): } # "revpbe-d3(bj)/def2-tzvp" SolvatedPeptides = { + ("H", -1): -0.4931715827683033, ("H", 0): -0.5041476427597161, + ("H", 1): 0.0, ("Li", 1): -7.280731201437635, ("B", -3): -24.006372610643076, ("B", -1): -24.660992037766704, @@ -474,8 +504,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.10497764958086, ("S", 0): -398.04159371790865, ("S", 1): -397.6599146755941, + ("Cl", -2): -459.3527862471638, ("Cl", -1): -460.1836953722962, ("Cl", 0): -460.0661711540315, + ("Cl", 2): -458.51775405333257, ("K", 1): -599.6472569880391, ("Ca", 2): -676.7916386065199, ("Br", -1): -2574.0081469191155, @@ -485,7 +517,9 @@ def get_matrix(level_of_theory: str): } # "DSD-BLYP-D3BJ/def2-TZVPPD" SN2RXN = { + ("H", -1): -0.4931715827683033, ("H", 0): -0.4990585651127987, + ("H", 1): 0.0, ("Li", 1): -7.2751828330696995, ("B", -3): -24.127790514752746, ("B", -1): -24.62825292497449, @@ -512,8 +546,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.0441756257772, ("S", 0): -397.9705195592595, ("S", 1): -397.5944122508692, + ("Cl", -2): -459.3527862471638, ("Cl", -1): -460.13181548141955, ("Cl", 0): -460.0006937311494, + ("Cl", 2): -458.51775405333257, ("K", 1): -599.4901238823808, ("Ca", 2): -676.6456698988475, ("Br", -1): -2573.604327011817, @@ -523,7 +559,9 @@ def get_matrix(level_of_theory: str): } # "b3lyp/6-31g*" QMUGS_DFT = { + ("H", -1): -0.4618190740256503, ("H", 0): -0.5002733301377901, + ("H", 1): 0.0, ("Li", 1): -7.284546111273075, ("B", -3): -23.577268753399462, ("B", -1): -24.614577395156598, @@ -550,8 +588,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.16568433994024, ("S", 0): -398.1049932797066, ("S", 1): -397.7199808615457, + ("Cl", -2): -459.5066184980746, ("Cl", -1): -460.25223446009306, ("Cl", 0): -460.13624346967765, + ("Cl", 2): -458.6740467177361, ("K", 1): -599.7247062673807, ("Ca", 2): -676.8667395990246, ("Br", -1): -2573.824201570383, @@ -561,7 +601,9 @@ def 
get_matrix(level_of_theory: str): } # "wb97x-d3/def2-tzvp" ORBNET = { + ("H", -1): -0.5051390575292232, ("H", 0): -0.5025865385814652, + ("H", 1): 0.0, ("Li", 1): -7.289728176048259, ("B", -3): -23.984063702375366, ("B", -1): -24.655892805089884, @@ -588,8 +630,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.17523835330115, ("S", 0): -398.1081144325829, ("S", 1): -397.7235371215097, + ("Cl", -2): -459.55571935610567, ("Cl", -1): -460.26962615981756, ("Cl", 0): -460.1472726772528, + ("Cl", 2): -458.68793188715097, ("K", 1): -599.7560426196044, ("Ca", 2): -676.9122500284535, ("Br", -1): -2574.293316484485, @@ -599,7 +643,9 @@ def get_matrix(level_of_theory: str): } # "wb97x-d/def2-svp" NABLADFT = { + ("H", -1): -0.487196574630614, ("H", 0): -0.5024927493280441, + ("H", 1): 0.0, ("Li", 1): -7.289461512680954, ("B", -3): -23.76326340520956, ("B", -1): -24.616565541453497, @@ -626,8 +672,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.0129589348639, ("S", 0): -397.9719510287289, ("S", 1): -397.58695970543334, + ("Cl", -2): -459.17907026002734, ("Cl", -1): -460.0809386171713, ("Cl", 0): -459.9885726673416, + ("Cl", 2): -458.52265869014025, ("K", 1): -599.6772169304438, ("Ca", 2): -676.8244048230532, ("Br", -1): -2573.9600885084546, @@ -637,7 +685,9 @@ def get_matrix(level_of_theory: str): } # "wb97x/6-31g(d)" ANI1 = { + ("H", -1): -0.45658037701866955, ("H", 0): -0.4993457316092281, + ("H", 1): 0.0, ("Li", 1): -7.2856300653219614, ("B", -3): -23.575157416550805, ("B", -1): -24.603134775026213, @@ -664,8 +714,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.14261145000256, ("S", 0): -398.0814606242194, ("S", 1): -397.6998359561112, + ("Cl", -2): -459.479319530353, ("Cl", -1): -460.2341096421279, ("Cl", 0): -460.1166957612669, + ("Cl", 2): -458.6588365149308, ("K", 1): -599.7184666927276, ("Ca", 2): -676.8704088358037, ("Br", -1): -2573.8502718776604, @@ -675,7 +727,9 @@ def get_matrix(level_of_theory: str): } # "WB97X/6-31g*" COMP6_1 = { 
+ ("H", -1): -0.4565803770186695, ("H", 0): -0.4993457316092281, + ("H", 1): 0.0, ("Li", 1): -7.285630065321961, ("B", -3): -23.5751574165508, ("B", -1): -24.603134775026216, @@ -702,8 +756,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.14261145000256, ("S", 0): -398.0814606242193, ("S", 1): -397.6998359561114, + ("Cl", -2): -459.47931953035305, ("Cl", -1): -460.23410964212803, ("Cl", 0): -460.1166957612671, + ("Cl", 2): -458.65883651493084, ("K", 1): -599.7184666927277, ("Ca", 2): -676.8704088358036, ("Br", -1): -2573.8502718776604, @@ -713,7 +769,9 @@ def get_matrix(level_of_theory: str): } # "ccsd/aug-cc-pVDZ" ccsdaug = { + ("H", -1): -0.5240286252725133, ("H", 0): -0.49933431543958506, + ("H", 1): 0.0, ("Li", 1): -7.23623079003172, ("B", -3): -24.135298809957895, ("B", -1): -24.595731151135812, @@ -740,8 +798,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.67826887815926, ("S", 0): -397.6146112492681, ("S", 1): -397.2542253763525, + ("Cl", -2): -459.42201473799554, ("Cl", -1): -459.7398865093852, ("Cl", 0): -459.6156482951034, + ("Cl", 2): -458.1975299396907, ("K", 1): None, # not available with this basis set ("Ca", 2): None, # not available with this basis set ("Br", -1): -2572.6265539931533, @@ -751,7 +811,9 @@ def get_matrix(level_of_theory: str): } # "ccsd(t)/aug-cc-pVDZ" ccsdtaug = { + ("H", -1): -0.489676276755859, ("H", 0): -0.4993343154395853, + ("H", 1): 0.0, ("Li", 1): -7.236230790031718, ("B", -3): -24.14659676027675, ("B", -1): -24.59834841644963, @@ -789,7 +851,9 @@ def get_matrix(level_of_theory: str): } # "mp2/aug-cc-pVDZ" mp2aug = { + ("H", -1): -0.5118536127440081, ("H", 0): -0.4993343154395852, + ("H", 1): 0.0, ("Li", 1): -7.2362434239942885, ("B", -3): -24.11454063530035, ("B", -1): -24.57403291869507, @@ -816,8 +880,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.6614469463811, ("S", 0): -397.5953187556735, ("S", 1): -397.236034450623, + ("Cl", -2): -459.4111711211486, ("Cl", -1): -459.7293671162834, 
("Cl", 0): -459.5986332871817, + ("Cl", 2): -458.16109262813154, ("K", 1): None, # not available with this basis set ("Ca", 2): None, # not available with this basis set ("Br", -1): -2571.9455214335435, @@ -827,7 +893,9 @@ def get_matrix(level_of_theory: str): } # "mp2/def2-TZVP" mp2def2TZVP = { + ("H", -1): -0.48253121006249655, ("H", 0): -0.4998098322318883, + ("H", 1): 0.0, ("Li", 1): -7.26625465274989, ("B", -3): -23.89130329586724, ("B", -1): -24.58967154224317, @@ -854,8 +922,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.7717421040001, ("S", 0): -397.71573728264894, ("S", 1): -397.34975334831165, + ("Cl", -2): -459.09862455580026, ("Cl", -1): -459.84969455647206, ("Cl", 0): -459.7312731162239, + ("Cl", 2): -458.28486559837125, ("K", 1): -599.1623610013563, ("Ca", 2): -676.3191334447123, ("Br", -1): -2572.8329868011315, @@ -863,10 +933,11 @@ def get_matrix(level_of_theory: str): ("I", -1): -297.32915651116025, ("I", 0): -297.2135511448063, } -# "SVWN/def2-TZVP" TODO: RECALCULATE THIS - +# SVWN/def2-TZVP COMP6_7 = { + ("H", -1): -0.5173468733170209, ("H", 0): -0.4961415246858913, + ("H", 1): 0.0, ("Li", 1): -7.182160595407815, ("B", -3): -23.858154175760482, ("B", -1): -24.477102446655582, @@ -893,8 +964,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -396.74391290562517, ("S", 0): -397.0472344910708, ("S", 1): -396.6400428334645, + ("Cl", -2): None, ("Cl", -1): -459.1427217366059, - ("Cl", 0): None, + ("Cl", 0): -457.029433121817, + ("Cl", 2): -457.5432679710133, ("K", 1): -598.3826110301004, ("Ca", 2): -675.4148005786843, ("Br", -1): -2571.43279407191, @@ -904,7 +977,9 @@ def get_matrix(level_of_theory: str): } # "PBE-D3BJ2B/def2-TZVP" COMP6_5 = { + ("H", -1): -0.4984251407077053, ("H", 0): -0.49963874688778964, + ("H", 1): 0.0, ("Li", 1): -7.256644236856915, ("B", -3): -23.965651173919607, ("B", -1): -24.61987718656591, @@ -931,8 +1006,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.00391422389356, ("S", 0): 
-397.93836821335026, ("S", 1): -397.5554184472038, + ("Cl", -2): -459.386408262179, ("Cl", -1): -460.0784728779802, ("Cl", 0): -459.9584144179813, + ("Cl", 2): -458.5661867317756, ("K", 1): -599.5277926006078, ("Ca", 2): -676.665524794864, ("Br", -1): -2573.8415230490864, @@ -942,7 +1019,9 @@ def get_matrix(level_of_theory: str): } # "B3LYP-D3MBJ2B/def2-TZVP" COMP6_2 = { + ("H", -1): -0.5104276111528594, ("H", 0): -0.5021763508982502, + ("H", 1): 0.0, ("Li", 1): -7.28605166725753, ("B", -3): -24.00227248681287, ("B", -1): -24.670150534162623, @@ -969,8 +1048,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.200223492228, ("S", 0): -398.1324076067549, ("S", 1): -397.7448455107872, + ("Cl", -2): -459.58678053070076, ("Cl", -1): -460.2889124003806, ("Cl", 0): -460.16699382696663, + ("Cl", 2): -458.70493083496865, ("K", 1): -599.7602668684151, ("Ca", 2): -676.9064118669689, ("Br", -1): -2574.264312179195, @@ -980,7 +1061,9 @@ def get_matrix(level_of_theory: str): } # "b3lyp/def2-TZVP" COMP6_3 = { + ("H", -1): -0.5104276111528594, ("H", 0): -0.5021763508982502, + ("H", 1): 0.0, ("Li", 1): -7.2860516672575315, ("B", -3): -24.002272486812885, ("B", -1): -24.67015053416263, @@ -1007,8 +1090,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.2002234922283, ("S", 0): -398.1324076067552, ("S", 1): -397.744845510787, + ("Cl", -2): -459.58678053070065, ("Cl", -1): -460.28891240038075, ("Cl", 0): -460.1669938269668, + ("Cl", 2): -458.70493083496893, ("K", 1): -599.7602668684153, ("Ca", 2): -676.9064118669687, ("Br", -1): -2574.264312179194, @@ -1019,7 +1104,9 @@ def get_matrix(level_of_theory: str): # ccsd(t)/cc-pVDZ GDML_2 = { + ("H", -1): -0.489739656382323, ("H", 0): -0.49927840341958285, + ("H", 1): 0.0, ("Li", 1): -7.236223739656382, ("B", -3): -23.61782373835322, ("B", -1): -24.528388906235705, @@ -1057,7 +1144,9 @@ def get_matrix(level_of_theory: str): } # ccsd(t)/cc-pVTZ ANI1CCX_2 = { + ("H", -1): -0.4963122609799637, ("H", 0): -0.49980981130184293, + 
("H", 1): 0.0, ("Li", 1): -7.249353374937752, ("B", -3): -23.793685421585884, ("B", -1): -24.56648780776967, @@ -1084,8 +1173,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.7409199255247, ("S", 0): -397.6361063083311, ("S", 1): -397.2347675440139, + ("Cl", -2): -459.069378694994, ("Cl", -1): -459.8163494320064, ("Cl", 0): -459.70310084056786, + ("Cl", 2): -458.277524056067, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.3176100772968, ("Br", -1): -2572.8167538662433, @@ -1095,7 +1186,9 @@ def get_matrix(level_of_theory: str): } # ccsd/cc-pVDZ GDML_1 = { + ("H", -1): -0.49927840341958285, ("H", 0): -0.49927840341958285, + ("H", 1): 0.0, ("Li", 1): -7.236223739656382, ("B", -3): -23.613877846876942, ("B", -1): -24.52547666267111, @@ -1122,8 +1215,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.631810717651, ("S", 0): -397.54760940641853, ("S", 1): -397.15909131565013, + ("Cl", -2): -458.6471183178738, ("Cl", -1): -459.6933866998589, ("Cl", 0): -459.60268687745884, + ("Cl", 2): -458.1932998145885, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.2265307613668, ("Br", -1): -2572.5834492880094, @@ -1133,7 +1228,9 @@ def get_matrix(level_of_theory: str): } # ccsd/cc-pVTZ CCSD_VTZ = { + ("H", -1): -0.49631226097996367, ("H", 0): -0.49980981130184293, + ("H", 1): 0.0, ("Li", 1): -7.249353374937752, ("B", -3): -23.78682468678494, ("B", -1): -24.56193370904525, @@ -1160,8 +1257,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.73415929387704, ("S", 0): -397.62619555322124, ("S", 1): -397.225460043223, + ("Cl", -2): -459.06087948746443, ("Cl", -1): -459.80856103622415, ("Cl", 0): -459.69693046874454, + ("Cl", 2): -458.26687876975234, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.3160445414744, ("Br", -1): -2572.8073946290465, @@ -1171,7 +1270,9 @@ def get_matrix(level_of_theory: str): } # hf/cc-pVDZ ANI1X_1 = { + ("H", -1): -0.4488383380351602, ("H", 0): -0.4992784034195828, + 
("H", 1): 0.0, ("Li", 1): -7.236120435571012, ("B", -3): -23.517631518350836, ("B", -1): -24.43849458753095, @@ -1198,8 +1299,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.506997287547, ("S", 0): -397.4131194811572, ("S", 1): -397.04821663752654, + ("Cl", -2): -458.49341773983207, ("Cl", -1): -459.54222556583767, ("Cl", 0): -459.4711432886898, + ("Cl", 2): -458.07541032143655, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.1457625057777, ("Br", -1): -2571.766685524917, @@ -1209,7 +1312,9 @@ def get_matrix(level_of_theory: str): } # hf/cc-pVTZ ANI1X_3 = { + ("H", -1): -0.4668418892599132, ("H", 0): -0.49980981130184304, + ("H", 1): 0.0, ("Li", 1): -7.236381928884647, ("B", -3): -23.654030528094694, ("B", -1): -24.45440782122731, @@ -1236,8 +1341,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.5319459632172, ("S", 0): -397.4249161291449, ("S", 1): -397.06067984991046, + ("Cl", -2): -458.80494925757927, ("Cl", -1): -459.5646668064105, ("Cl", 0): -459.4854291853036, + ("Cl", 2): -458.09232019709674, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.1540716436532, ("Br", -1): -2572.528468875192, @@ -1248,7 +1355,9 @@ def get_matrix(level_of_theory: str): # mp2/cc-pVDZ DES1 = { + ("H", -1): -0.46472136044848017, ("H", 0): -0.4992784034195828, + ("H", 1): 0.0, ("Li", 1): -7.236236031279599, ("B", -3): -23.59075634654498, ("B", -1): -24.496049160245956, @@ -1275,8 +1384,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.61602048346754, ("S", 0): -397.5157894668129, ("S", 1): -397.126843359414, + ("Cl", -2): -458.63292301888237, ("Cl", -1): -459.68240407270594, ("Cl", 0): -459.5865928328137, + ("Cl", 2): -458.1568260632668, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.2188060975801, ("Br", -1): -2571.903217203978, @@ -1287,7 +1398,9 @@ def get_matrix(level_of_theory: str): # mp2/cc-pVQZ DES2 = { + ("H", -1): -0.49885469416811784, ("H", 0): -0.4999455685829884, + ("H", 1): 0.0, 
("Li", 1): -7.250250946178424, ("B", -3): -23.881056379140478, ("B", -1): -24.562769033198762, @@ -1314,8 +1427,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.764457176497, ("S", 0): -397.63328479696963, ("S", 1): -397.2291889048987, + ("Cl", -2): -459.276002809114, ("Cl", -1): -459.85575358503627, ("Cl", 0): -459.725756402736, + ("Cl", 2): -458.27234841921444, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.353471955094, ("Br", -1): -2572.9216392833405, @@ -1325,7 +1440,9 @@ def get_matrix(level_of_theory: str): } # pbe/def2-tzvp ISO17 = { + ("H", -1): -0.4984251407077052, ("H", 0): -0.4996387468896132, + ("H", 1): 0.0, ("Li", 1): -7.256644236856955, ("B", -3): -23.935382459402287, ("B", -1): -24.585965866081416, @@ -1352,8 +1469,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.00391422392744, ("S", 0): -397.9053091661701, ("S", 1): -397.5008759502245, + ("Cl", -2): -459.38640826217886, ("Cl", -1): -460.0784728780043, ("Cl", 0): -459.95841441797796, + ("Cl", 2): -458.566186731762, ("K", 1): -599.5277926006352, ("Ca", 2): -676.6655247948639, ("Br", -1): -2573.8415230488945, @@ -1365,7 +1484,9 @@ def get_matrix(level_of_theory: str): # hf/cc-pVQZ ANI1X_2 = { + ("H", -1): -0.47386028485392406, ("H", 0): -0.49994556858298844, + ("H", 1): 0.0, ("Li", 1): -7.236386237851972, ("B", -3): -23.74309031828107, ("B", -1): -24.46286773184739, @@ -1392,8 +1513,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.54055244875906, ("S", 0): -397.42820343953593, ("S", 1): -397.06412575498064, + ("Cl", -2): -458.978571599394, ("Cl", -1): -459.57282279413744, ("Cl", 0): -459.4890928627921, + ("Cl", 2): -458.0963453990511, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.1542980250254, ("Br", -1): -2572.5345236382864, @@ -1405,7 +1528,9 @@ def get_matrix(level_of_theory: str): # mp2/cc-pVTZ DES3 = { + ("H", -1): -0.4891625462679369, ("H", 0): -0.49980981130184304, + ("H", 1): 0.0, ("Li", 1): -7.24726155786237, ("B", 
-3): -23.763643794842856, ("B", -1): -24.53409654753541, @@ -1432,8 +1557,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.7141036332652, ("S", 0): -397.5920220310466, ("S", 1): -397.19206598949114, + ("Cl", -2): -459.0459580553311, ("Cl", -1): -459.79402765207186, ("Cl", 0): -459.67567575694216, + ("Cl", 2): -458.22960655909685, ("K", 1): None, # not available with this basis set ("Ca", 2): -676.3023664599882, ("Br", -1): -2572.801814668155, @@ -1442,48 +1569,11 @@ def get_matrix(level_of_theory: str): ("I", 0): None, # not available with this basis set } -# pbe/def2-tzvp TODO: Recalculate this -# H -1 +1 , S -2 +2 -ISO17 = { - ("H", 0): -0.49963874688961296, - ("Li", 1): -7.256644236856955, - ("B", -3): None, - ("B", -1): None, - ("B", 0): -24.61008450990847, - ("B", 3): -21.981186468974155, - ("C", -1): None, - ("C", 0): None, - ("C", 1): -37.37216480721611, - ("N", -1): None, - ("N", 0): None, - ("N", 1): None, - ("O", -1): -75.04792601076218, - ("O", 0): None, - ("O", 1): None, - ("F", -1): -99.77558183886428, - ("F", 0): -99.66914009406861, - ("Na", 1): -161.96413737182382, - ("Mg", 2): -199.10001096170993, - ("Si", 4): -285.4180171255296, - ("Si", 0): None, - ("Si", -4): -288.02271678330254, - ("P", 0): None, - ("P", 1): None, - ("S", -1): -398.00391422392744, - ("S", 0): None, - ("S", 1): None, - ("Cl", -1): -460.07847287800433, - ("Cl", 0): -459.958414417978, - ("K", 1): -599.5277926006352, - ("Ca", 2): -676.6655247948639, - ("Br", -1): -2573.841523048894, - ("Br", 0): -2573.720729522104, - ("I", -1): -297.78153468631854, - ("I", 0): -297.66553802494457, -} # pbe0/def2-tzvp QM7X_DFT = { + ("H", -1): -0.5000012696776297, ("H", 0): -0.5010619187567116, + ("H", 1): 0.0, ("Li", 1): -7.262402336780465, ("B", -3): -23.93538245940231, ("B", -1): -24.58596586608141, @@ -1510,8 +1600,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.03842612700186, ("S", 0): -397.90530916617007, ("S", 1): -397.5008759502245, + ("Cl", -2): -459.4152688089829, 
("Cl", -1): -460.11739716845636, ("Cl", 0): -459.9974100829532, + ("Cl", 2): -458.6052342125039, ("K", 1): -599.5783201878277, ("Ca", 2): -676.7194481655977, ("Br", -1): -2573.9328383617813, @@ -1522,7 +1614,9 @@ def get_matrix(level_of_theory: str): # LEVEL OF THEORY: WB97M-V/def2-tzvp COMP6_9 = { + ("H", -1): -0.5043034149209957, ("H", 0): -0.4942304316867456, + ("H", 1): 0.0, ("Li", 1): -7.275845986964876, ("B", -3): -23.944386486890433, ("B", -1): -24.620648350767315, @@ -1563,7 +1657,9 @@ def get_matrix(level_of_theory: str): # hf/def2-tzvp HF_DEF2 = { + ("H", -1): -0.4668133747908114, ("H", 0): -0.4998098322318885, + ("H", 1): 0.0, ("Li", 1): -7.236374246714073, ("B", -3): -23.74140302512685, ("B", -1): -24.462195925378662, @@ -1590,8 +1686,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -397.5252097143351, ("S", 0): -397.4176274212401, ("S", 1): -397.0534456500219, + ("Cl", -2): -458.7948759929542, ("Cl", -1): -459.55564984013716, ("Cl", 0): -459.47680800709793, + ("Cl", 2): -458.0838125597828, ("K", 1): -599.0060338509219, ("Ca", 2): -676.1418445564589, ("Br", -1): -2572.4811033491237, @@ -1600,7 +1698,9 @@ def get_matrix(level_of_theory: str): ("I", 0): -296.6585948224954, } ANI1X_8 = { + ("H", -1): -0.5043034149209957, ("H", 0): -0.5013136410415637, + ("H", 1): 0.0, ("Li", 1): -7.286464366413948, ("B", -3): -23.86534129296109, ("B", -1): -24.613473886395223, @@ -1625,8 +1725,10 @@ def get_matrix(level_of_theory: str): ("S", -1): -398.1805976553139, ("S", 0): -398.0529588010547, ("S", 1): -397.69734443410385, + ("Cl", -2): -459.5595393232076, ("Cl", -1): -460.2768559014631, ("Cl", 0): -460.1543938788908, + ("Cl", 2): -458.6962780587144, ("K", 1): None, ("Ca", 2): -676.921587688464, ("Br", -1): -2574.3069571951482, From d2a002a8654835e58c0b071f87aba101429010a2 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 30 Oct 2023 10:37:59 -0400 Subject: [PATCH 11/20] e0 optimization --- src/openqdc/datasets/base.py | 19 ++++++++++--------- 1 file changed, 10 
insertions(+), 9 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index 787f308..d198945 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -97,11 +97,7 @@ def __init__( # self.save_preprocess(res) else: self.read_preprocess(overwrite_local_cache=overwrite_local_cache) - self.__isolated_atom_energies__ = ( - [IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__] - if self.__energy_methods__ - else None - ) + self._set_isolated_atom_energies() @property def energy_unit(self): @@ -160,6 +156,14 @@ def _set_units(self, en, ds): self.__forces_unit__ = self.energy_unit + "/" + self.distance_unit self.__class__.__fn_forces__ = get_conversion(old_en + "/" + old_ds, self.__forces_unit__) + def _set_isolated_atom_energies(self): + if self.__energy_methods__ is None: + logger.error("No energy methods defined for this dataset.") + f = get_conversion("hartree", self.__energy_unit__) + self.__isolated_atom_energies__ = f( + np.array([IsolatedAtomEnergyFactory.get_matrix(en_method) for en_method in self.__energy_methods__]) + ) + def convert_energy(self, x): return self.__class__.__fn_energy__(x) @@ -281,14 +285,11 @@ def __getitem__(self, idx: int): forces = self.convert_forces(np.array(self.data["forces"][p_start:p_end], dtype=np.float32)) else: forces = None - isolated_atom_energies = [ - get_conversion("hartree", self.__energy_unit__)(x[z, c + shift]) for x in self.__isolated_atom_energies__ - ] return Bunch( positions=positions, atomic_numbers=z, charges=c, - e0=isolated_atom_energies, + e0=self.__isolated_atom_energies__[..., z, c + shift], energies=energies, name=name, subset=subset, From 07008cdf7c55d24bb1e9c4d7ace7532fd4099bee Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 30 Oct 2023 11:02:01 -0400 Subject: [PATCH 12/20] indexing by column --- src/openqdc/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openqdc/datasets/base.py 
b/src/openqdc/datasets/base.py index d198945..d0e71ab 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -289,7 +289,7 @@ def __getitem__(self, idx: int): positions=positions, atomic_numbers=z, charges=c, - e0=self.__isolated_atom_energies__[..., z, c + shift], + e0=self.__isolated_atom_energies__[..., z, c + shift].T, energies=energies, name=name, subset=subset, From 0bfc2f1b7980d6e62ca7c742e642d0cb0d4ed615 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Mon, 30 Oct 2023 11:58:40 -0400 Subject: [PATCH 13/20] Ttm2.1 clarity --- src/openqdc/utils/atomization_energies.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index b534fc2..b306e71 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -1736,6 +1736,13 @@ def get_matrix(level_of_theory: str): ("I", -1): None, ("I", 0): None, } +# FF ttm2.1-f, calculated with ttm3-f f90 routine +# For isolated atoms doesn't change as it is always 0 +# Typed down for clarity +TTM2 = { + ("H", 0): 0.0, + ("O", 0): 0.0, +} ISOLATED_ATOM_ENERGIES = { @@ -1806,6 +1813,8 @@ def get_matrix(level_of_theory: str): "gfn1_xtb": GFN1, "dft3b": DFTB, "pm6": PM6, + # FF + "ttm2.1-f": TTM2, } # TODO: Talk with ivan about cbs extrapolation from from av[TQ]z. 
For now this should be ok From bfc5891eda74849bda27e5a99957857f321d0312 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 31 Oct 2023 13:08:20 -0400 Subject: [PATCH 14/20] Doc --- docs/API/isolated_atom_energies.md | 5 + docs/tutorials/usage.ipynb | 395 +++++++++++++++++++++- mkdocs.yml | 1 + src/openqdc/datasets/base.py | 132 +++++++- src/openqdc/utils/atomization_energies.py | 58 +++- src/openqdc/utils/io.py | 24 +- 6 files changed, 595 insertions(+), 20 deletions(-) create mode 100644 docs/API/isolated_atom_energies.md diff --git a/docs/API/isolated_atom_energies.md b/docs/API/isolated_atom_energies.md new file mode 100644 index 0000000..966b6a8 --- /dev/null +++ b/docs/API/isolated_atom_energies.md @@ -0,0 +1,5 @@ +# Isolated atoms energy + +This page contains the isolated atom energies. + +::: openqdc.utils.atomization_energies diff --git a/docs/tutorials/usage.ipynb b/docs/tutorials/usage.ipynb index d813272..b494396 100644 --- a/docs/tutorials/usage.ipynb +++ b/docs/tutorials/usage.ipynb @@ -13,29 +13,410 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/miniconda/base/envs/qdc/lib/python3.11/site-packages/google/auth/_default.py:76: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "\u001b[32m2023-10-31 11:43:09.510\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m236\u001b[0m - \u001b[1mReading preprocessed data\u001b[0m\n", + "\u001b[32m2023-10-31 11:43:09.511\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mopenqdc.datasets.base\u001b[0m:\u001b[36mread_preprocess\u001b[0m:\u001b[36m237\u001b[0m - \u001b[1mspice data with the following units:\n", + " Energy: hartree,\n", + " Distance: bohr,\n", + " Forces: hartree/bohr\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded atomic_inputs with shape (33175288, 5), dtype float32\n", + "Loaded position_idx_range with shape (1110165, 2), dtype int32\n", + "Loaded energies with shape (1110165, 1), dtype float32\n", + "Loaded forces with shape (33175288, 3, 1), dtype float32\n", + "Loaded name_uniques with shape (19155,), dtype Dict[str, np.ndarray]: + """ + Compute the SOAP descriptors for the dataset. + + Parameters + ---------- + n_samples : Optional[int], optional + Number of samples to use for the computation, by default None. If None, all the dataset is used. + return_idxs : bool, optional + Whether to return the indices of the samples used, by default True. + progress : bool, optional + Whether to show a progress bar, by default True. + **soap_kwargs : dict + Keyword arguments to pass to the SOAP descriptor. 
+ By defaut, the following values are used: + - r_cut : 5.0 + - n_max : 8 + - l_max : 6 + - average : "inner" + - periodic : False + - compression : {"mode" : "mu1nu1"} + + Returns + ------- + Dict[str, np.ndarray] + Dictionary containing the following keys: + - soap : np.ndarray of shape (N, M) containing the SOAP descriptors for the dataset + - soap_kwargs : dict containing the keyword arguments used for the SOAP descriptor + - idxs : np.ndarray of shape (N,) containing the indices of the samples used + + """ + import datamol as dm + from dscribe.descriptors import SOAP + + if n_samples is None: + idxs = list(range(len(self))) + else: + idxs = np.random.choice(len(self), size=n_samples, replace=False) + datum = {} + r_cut = soap_kwargs.pop("r_cut", 5.0) + n_max = soap_kwargs.pop("n_max", 8) + l_max = soap_kwargs.pop("l_max", 6) + average = soap_kwargs.pop("average", "inner") + periodic = soap_kwargs.pop("periodic", False) + compression = soap_kwargs.pop("compression", {"mode": "mu1nu1"}) + soap = SOAP( + species=self.chemical_species, + periodic=periodic, + r_cut=r_cut, + n_max=n_max, + l_max=l_max, + average=average, + compression=compression, + ) + datum["soap_kwargs"] = { + "r_cut": r_cut, + "n_max": n_max, + "l_max": l_max, + "average": average, + "compression": compression, + "species": self.chemical_species, + "periodic": periodic, + **soap_kwargs, + } + + def wrapper(idx): + entry = self.get_ase_atoms(idx, ext=True) + soap.create(entry, centers=entry.positions) + return soap.create(entry, centers=entry.positions) + + descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads") + datum["soap"] = np.vstack(descr) + if return_idxs: + datum["idxs"] = idxs + return datum + def __len__(self): return self.data["energies"].shape[0] def __getitem__(self, idx: int): - # if idx is more than len doesn t throw error shift = IsolatedAtomEnergyFactory.max_charge p_start, p_end = self.data["position_idx_range"][idx] input = 
self.data["atomic_inputs"][p_start:p_end] diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index b306e71..04315af 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -1,8 +1,14 @@ +from typing import Dict, Tuple, TypeAlias + import numpy as np from loguru import logger from openqdc.utils.constants import MAX_ATOMIC_NUMBER +__all__ = ["chemical_symbols", "atomic_numbers", "IsolatedAtomEnergyFactory"] + +EF_KEY: TypeAlias = Tuple[str, int] + ATOM_SPECIES = "H", "Li", "B", "C", "N", "O", "F", "Na", "Mg", "Si", "P", "S", "Cl", "K", "Ca", "Br", "I" # Energy in atomic unit/ Hartree / Ang @@ -121,16 +127,43 @@ class IsolatedAtomEnergyFactory: + """ + Factory method to get the isolated atom energies for a given level of theory. + """ + max_charge = 4 def __init__(self): pass def __call__(self, level_of_theory: str): + """ + Wrapper to the get method + + Parameters + ---------- + level_of_theory: str + """ return self.get(level_of_theory=level_of_theory) @staticmethod - def get(level_of_theory: str): + def get(level_of_theory: str) -> Dict[EF_KEY, float]: + """ + Get the dict isolated atom energies for a given level of theory + + Parameters + ---------- + level_of_theory: str + Level of theory in the format "functional/basis" or "functional" if semi empirical + + Returns + ------- + dict[tuple[str, int], float] + Dictionary containing the isolated atom energies for each entry written as a tuple (atom, charge): + + {("H", 1): 0.0, ...} + + """ level_of_theory = level_of_theory.lower() is_dft = True try: @@ -147,7 +180,27 @@ def get(level_of_theory: str): return functional_dict.get(basis, ZEROS) @staticmethod - def get_matrix(level_of_theory: str): + def get_matrix(level_of_theory: str) -> np.ndarray: + """ + Get the matrix of isolated atom energies for a given level of theory + + Parameters + ---------- + level_of_theory: str + Level of theory in the format 
"functional/basis" or "functional" if semi empirical + + Returns + ------- + np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * max_charge + 1) + Matrix containing the isolated atom energies for each atom and charge written in the form: + + | | -2 | -1 | 0 | +1 | +2 | <- charges + |---|----|----|---|----|----| + | 0 | | | | | | + | 1 | | | | | | + | 2 | | | | | | + + """ shift = IsolatedAtomEnergyFactory.max_charge matrix = np.zeros((MAX_ATOMIC_NUMBER, shift * 2 + 1)) tuple_hashmap = IsolatedAtomEnergyFactory.get(level_of_theory) @@ -1737,6 +1790,7 @@ def get_matrix(level_of_theory: str): ("I", 0): None, } # FF ttm2.1-f, calculated with ttm3-f f90 routine +# Link: https://www.pnnl.gov/science/ttm3f.asp # For isolated atoms doesn't change as it is always 0 # Typed down for clarity TTM2 = { diff --git a/src/openqdc/utils/io.py b/src/openqdc/utils/io.py index c77f5e7..6105d93 100644 --- a/src/openqdc/utils/io.py +++ b/src/openqdc/utils/io.py @@ -30,13 +30,19 @@ def set_cache_dir(d): _OPENQDC_CACHE_DIR = os.path.expanduser(d) -def get_local_cache(): +def get_local_cache() -> str: + """ + Returns the local cache directory. It creates it if it does not exist. + + Returns: + str: path to the local cache directory + """ cache_dir = os.path.expanduser(os.path.expandvars(_OPENQDC_CACHE_DIR)) os.makedirs(cache_dir, exist_ok=True) return cache_dir -def get_remote_cache(): +def get_remote_cache() -> str: remote_cache = "gs://opendatasets/openqdc" return remote_cache @@ -168,9 +174,19 @@ def load_xyz(path): return MolFromXYZFile(path) -def dict_to_atoms(d: dict): +def dict_to_atoms(d: dict, ext: bool = False) -> Atoms: + """ + Converts dictionary to ase atoms object + + Args: + d (dict): dictionary containing keys: positions, atomic_numbers, charges + ext (bool, optional): Whether to include all the rest of the dictionary in the atoms object info field. + Defaults to False. 
+ """ pos, atomic_numbers, charges = d.pop("positions"), d.pop("atomic_numbers"), d.pop("charges") - at = Atoms(positions=pos, numbers=atomic_numbers, charges=charges, info=d) + at = Atoms(positions=pos, numbers=atomic_numbers, charges=charges) + if ext: + at.info = d return at From 26970819489038e2b819cb1bef8e54c7e051bbb0 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 31 Oct 2023 13:17:08 -0400 Subject: [PATCH 15/20] fix --- src/openqdc/datasets/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index faa774e..fcbbef0 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -283,7 +283,7 @@ def save_xyz(self, idx: int, path: Optional[str] = None): """ if path is None: path = os.getcwd() - at = self.get_xyz(idx, ext=True) + at = self.get_ase_atoms(idx, ext=True) name = at.info["name"] write_extxyz(p_join(path, f"{name}.xyz"), at) From 422609a09aace1b89fd6d46d3a035a261f3d0b04 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 31 Oct 2023 13:23:42 -0400 Subject: [PATCH 16/20] updated env file + fix --- env.yml | 2 +- src/openqdc/datasets/base.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/env.yml b/env.yml index b5ccda6..eefbd3a 100644 --- a/env.yml +++ b/env.yml @@ -1,6 +1,5 @@ channels: - conda-forge - - pyg # Only for macOS. Remove once https://github.com/conda-forge/pyg-lib-feedstock/pull/14 is merged. 
dependencies: # standard stuff @@ -30,6 +29,7 @@ dependencies: # ML #- einops =0.6.0 - pytorch + - dscribe # other stuffs - h5py >=3.8.0 diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index fcbbef0..a43ff4d 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -374,7 +374,6 @@ def chemical_space( def wrapper(idx): entry = self.get_ase_atoms(idx, ext=True) - soap.create(entry, centers=entry.positions) return soap.create(entry, centers=entry.positions) descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads") From 20f873d2cd9d100564c2832f91c76fe904d0d64b Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 31 Oct 2023 13:33:44 -0400 Subject: [PATCH 17/20] added missing unit conversiation --- src/openqdc/utils/units.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/openqdc/utils/units.py b/src/openqdc/utils/units.py index a810f1f..fb895ce 100644 --- a/src/openqdc/utils/units.py +++ b/src/openqdc/utils/units.py @@ -72,3 +72,4 @@ def get_conversion(in_unit: str, out_unit: str): ) Conversion("hartree/ang", "kcal/mol/ang", lambda x: get_conversion("hartree", "kcal/mol")(x)) Conversion("hartree/ang", "hartree/bohr", lambda x: get_conversion("bohr", "ang")(x)) +Conversion("hartree/bohr", "hartree/ang", lambda x: get_conversion("ang", "bohr")(x)) From 63ae525fc7b8b30d7777c0e72d1fbd349d331c90 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 31 Oct 2023 13:42:18 -0400 Subject: [PATCH 18/20] Duplicated ccsd(t) to tccsd(t) --- src/openqdc/datasets/ani.py | 2 +- src/openqdc/utils/atomization_energies.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/openqdc/datasets/ani.py b/src/openqdc/datasets/ani.py index 77da353..913fb8a 100644 --- a/src/openqdc/datasets/ani.py +++ b/src/openqdc/datasets/ani.py @@ -76,7 +76,7 @@ class ANI1CCX(ANI1): "ccsd(t)/cbs", "ccsd(t)/cc-pvdz", "ccsd(t)/cc-pvtz", - "ccsd(t)/cc-pvdz", + "tccsd(t)/cc-pvdz", ] energy_target_names = [ diff --git 
a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index 04315af..afcb1b2 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -1846,6 +1846,10 @@ def get_matrix(level_of_theory: str) -> np.ndarray: "cc-pvdz": GDML_1, "cc-pvtz": CCSD_VTZ, }, + "tccsd(t)": { + "cc-pvdz", + ANI1CCX_2, + }, "ccsd(t)": { "cc-pvdz": GDML_2, "cc-pvtz": ANI1CCX_2, From 9be6fdfcc8ba0019d25adc4c5208ad004fe8f8c3 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Tue, 31 Oct 2023 13:44:49 -0400 Subject: [PATCH 19/20] So many silly mistakes --- src/openqdc/utils/atomization_energies.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/openqdc/utils/atomization_energies.py b/src/openqdc/utils/atomization_energies.py index afcb1b2..40d0d13 100644 --- a/src/openqdc/utils/atomization_energies.py +++ b/src/openqdc/utils/atomization_energies.py @@ -1847,8 +1847,7 @@ def get_matrix(level_of_theory: str) -> np.ndarray: "cc-pvtz": CCSD_VTZ, }, "tccsd(t)": { - "cc-pvdz", - ANI1CCX_2, + "cc-pvdz": ANI1CCX_2, }, "ccsd(t)": { "cc-pvdz": GDML_2, From fff91e49573856dee1761929b058a11a57cfdae3 Mon Sep 17 00:00:00 2001 From: FNTwin Date: Thu, 2 Nov 2023 13:04:32 -0400 Subject: [PATCH 20/20] Docs + list of idx for soap --- src/openqdc/datasets/base.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/openqdc/datasets/base.py b/src/openqdc/datasets/base.py index a43ff4d..1de6ff1 100644 --- a/src/openqdc/datasets/base.py +++ b/src/openqdc/datasets/base.py @@ -1,6 +1,6 @@ import os from os.path import join as p_join -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import numpy as np import pandas as pd @@ -306,15 +306,20 @@ def get_ase_atoms(self, idx: int, ext=True): @requires_package("dscribe") @requires_package("datamol") def chemical_space( - self, n_samples: Optional[int] = None, return_idxs: bool = True, progress: bool = True, 
**soap_kwargs + self, + n_samples: Optional[Union[List[int], int]] = None, + return_idxs: bool = True, + progress: bool = True, + **soap_kwargs, ) -> Dict[str, np.ndarray]: """ Compute the SOAP descriptors for the dataset. Parameters ---------- - n_samples : Optional[int], optional + n_samples : Optional[Union[List[int],int]], optional Number of samples to use for the computation, by default None. If None, all the dataset is used. + If a list of integers is provided, the descriptors are computed for each of the specified idx of samples. return_idxs : bool, optional Whether to return the indices of the samples used, by default True. progress : bool, optional @@ -343,8 +348,10 @@ def chemical_space( if n_samples is None: idxs = list(range(len(self))) - else: + elif isinstance(n_samples, int): idxs = np.random.choice(len(self), size=n_samples, replace=False) + elif isinstance(n_samples, list): + idxs = n_samples datum = {} r_cut = soap_kwargs.pop("r_cut", 5.0) n_max = soap_kwargs.pop("n_max", 8) @@ -373,7 +380,7 @@ def chemical_space( } def wrapper(idx): - entry = self.get_ase_atoms(idx, ext=True) + entry = self.get_ase_atoms(idx, ext=False) return soap.create(entry, centers=entry.positions) descr = dm.parallelized(wrapper, idxs, progress=progress, scheduler="threads")