diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9194d2ce..1abf329f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,8 +16,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - os: ["ubuntu-latest"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + os: ["ubuntu-latest", "macos-latest"] #,"windows-latest" runs-on: ${{ matrix.os }} timeout-minutes: 30 @@ -53,5 +53,5 @@ jobs: - name: Run tests run: python -m pytest - #- name: Test building the doc - # run: mkdocs build + - name: Test building the doc + run: mkdocs build diff --git a/.gitignore b/.gitignore index 7a6dd93f..ffd7edf6 100644 --- a/.gitignore +++ b/.gitignore @@ -149,3 +149,6 @@ cookie.txt *.txt *.sh .DS_Store +*.zarr/ +scripts/ +notebooks/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..94a3b7e2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,352 @@ +Creative Commons Attribution-NonCommercial 4.0 International + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright and +certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + +- Considerations for licensors: Our public licenses are intended for + use by those authorized to give the public permission to use + material in ways otherwise restricted by copyright and certain other + rights. Our licenses are irrevocable. Licensors should read and + understand the terms and conditions of the license they choose + before applying it. Licensors should also secure all rights + necessary before applying our licenses so that the public can reuse + the material as expected. Licensors should clearly mark any material + not subject to the license. This includes other CC-licensed + material, or material used under an exception or limitation to + copyright. More considerations for licensors : + wiki.creativecommons.org/Considerations\_for\_licensors + +- Considerations for the public: By using one of our public licenses, + a licensor grants the public permission to use the licensed material + under specified terms and conditions. If the licensor's permission + is not necessary for any reason–for example, because of any + applicable exception or limitation to copyright–then that use is not + regulated by the license. Our licenses grant only permissions under + copyright and certain other rights that a licensor has authority to + grant. Use of the licensed material may still be restricted for + other reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, such + as asking that all changes be marked or described. 
Although not + required by our licenses, you are encouraged to respect those + requests where reasonable. More considerations for the public : + wiki.creativecommons.org/Considerations\_for\_licensees + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and conditions. + +- Section 1 – Definitions. + + - a. Adapted Material means material subject to Copyright and + Similar Rights that is derived from or based upon the Licensed + Material and in which the Licensed Material is translated, + altered, arranged, transformed, or otherwise modified in a + manner requiring permission under the Copyright and Similar + Rights held by the Licensor. For purposes of this Public + License, where the Licensed Material is a musical work, + performance, or sound recording, Adapted Material is always + produced where the Licensed Material is synched in timed + relation with a moving image. + - b. Adapter's License means the license You apply to Your + Copyright and Similar Rights in Your contributions to Adapted + Material in accordance with the terms and conditions of this + Public License. + - c. Copyright and Similar Rights means copyright and/or similar + rights closely related to copyright including, without + limitation, performance, broadcast, sound recording, and Sui + Generis Database Rights, without regard to how the rights are + labeled or categorized. For purposes of this Public License, the + rights specified in Section 2(b)(1)-(2) are not Copyright and + Similar Rights. + - d. Effective Technological Measures means those measures that, + in the absence of proper authority, may not be circumvented + under laws fulfilling obligations under Article 11 of the WIPO + Copyright Treaty adopted on December 20, 1996, and/or similar + international agreements. + - e. Exceptions and Limitations means fair use, fair dealing, + and/or any other exception or limitation to Copyright and + Similar Rights that applies to Your use of the Licensed + Material. + - f. Licensed Material means the artistic or literary work, + database, or other material to which the Licensor applied this + Public License. + - g. Licensed Rights means the rights granted to You subject to + the terms and conditions of this Public License, which are + limited to all Copyright and Similar Rights that apply to Your + use of the Licensed Material and that the Licensor has authority + to license. + - h. Licensor means the individual(s) or entity(ies) granting + rights under this Public License. + - i. NonCommercial means not primarily intended for or directed + towards commercial advantage or monetary compensation. For + purposes of this Public License, the exchange of the Licensed + Material for other material subject to Copyright and Similar + Rights by digital file-sharing or similar means is NonCommercial + provided there is no payment of monetary compensation in + connection with the exchange. + - j. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, + distribution, dissemination, communication, or importation, and + to make material available to the public including in ways that + members of the public may access the material from a place and + at a time individually chosen by them. + - k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and + of the Council of 11 March 1996 on the legal protection of + databases, as amended and/or succeeded, as well as other + essentially equivalent rights anywhere in the world. + - l. You means the individual or entity exercising the Licensed + Rights under this Public License. Your has a corresponding + meaning. + +- Section 2 – Scope. + + - a. License grant. + - 1. Subject to the terms and conditions of this Public + License, the Licensor hereby grants You a worldwide, + royalty-free, non-sublicensable, non-exclusive, irrevocable + license to exercise the Licensed Rights in the Licensed + Material to: + - A. reproduce and Share the Licensed Material, in whole + or in part, for NonCommercial purposes only; and + - B. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + - 2. Exceptions and Limitations. For the avoidance of doubt, + where Exceptions and Limitations apply to Your use, this + Public License does not apply, and You do not need to comply + with its terms and conditions. + - 3. Term. The term of this Public License is specified in + Section 6(a). + - 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter + created, and to make technical modifications necessary to do + so. The Licensor waives and/or agrees not to assert any + right or authority to forbid You from making technical + modifications necessary to exercise the Licensed Rights, + including technical modifications necessary to circumvent + Effective Technological Measures. For purposes of this + Public License, simply making modifications authorized by + this Section 2(a)(4) never produces Adapted Material. + - 5. Downstream recipients. + - A. Offer from the Licensor – Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + - B. No downstream restrictions. You may not offer or + impose any additional or different terms or conditions + on, or apply any Effective Technological Measures to, + the Licensed Material if doing so restricts exercise of + the Licensed Rights by any recipient of the Licensed + Material. + - 6. No endorsement. Nothing in this Public License + constitutes or may be construed as permission to assert or + imply that You are, or that Your use of the Licensed + Material is, connected with, or sponsored, endorsed, or + granted official status by, the Licensor or others + designated to receive attribution as provided in Section + 3(a)(1)(A)(i). + - b. Other rights. + - 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, + to the extent possible, the Licensor waives and/or agrees + not to assert any such rights held by the Licensor to the + limited extent necessary to allow You to exercise the + Licensed Rights, but not otherwise. + - 2. Patent and trademark rights are not licensed under this + Public License. + - 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +- Section 3 – License Conditions. + + Your exercise of the Licensed Rights is expressly made subject to + the following conditions. + + - a. Attribution. + - 1. If You Share the Licensed Material (including in modified + form), You must: + - A. retain the following if it is supplied by the + Licensor with the Licensed Material: + - i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if designated); + - ii. a copyright notice; + - iii. a notice that refers to this Public License; + - iv. a notice that refers to the disclaimer of + warranties; + - v. a URI or hyperlink to the Licensed Material to + the extent reasonably practicable; + - B. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + - C. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + - 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may + be reasonable to satisfy the conditions by providing a URI + or hyperlink to a resource that includes the required + information. + - 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + - 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +- Section 4 – Sui Generis Database Rights. + + Where the Licensed Rights include Sui Generis Database Rights that + apply to Your use of the Licensed Material: + + - a. for the avoidance of doubt, Section 2(a)(1) grants You the + right to extract, reuse, reproduce, and Share all or a + substantial portion of the contents of the database for + NonCommercial purposes only; + - b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; + and + - c. You must comply with the conditions in Section 3(a) if You + Share all or a substantial portion of the contents of the + database. 
+ + For the avoidance of doubt, this Section 4 supplements and does not + replace Your obligations under this Public License where the + Licensed Rights include other Copyright and Similar Rights. + +- Section 5 – Disclaimer of Warranties and Limitation of Liability. + + - a. Unless otherwise separately undertaken by the Licensor, to + the extent possible, the Licensor offers the Licensed Material + as-is and as-available, and makes no representations or + warranties of any kind concerning the Licensed Material, whether + express, implied, statutory, or other. This includes, without + limitation, warranties of title, merchantability, fitness for a + particular purpose, non-infringement, absence of latent or other + defects, accuracy, or the presence or absence of errors, whether + or not known or discoverable. Where disclaimers of warranties + are not allowed in full or in part, this disclaimer may not + apply to You. + - b. To the extent possible, in no event will the Licensor be + liable to You on any legal theory (including, without + limitation, negligence) or otherwise for any direct, special, + indirect, incidental, consequential, punitive, exemplary, or + other losses, costs, expenses, or damages arising out of this + Public License or use of the Licensed Material, even if the + Licensor has been advised of the possibility of such losses, + costs, expenses, or damages. Where a limitation of liability is + not allowed in full or in part, this limitation may not apply to + You. + - c. The disclaimer of warranties and limitation of liability + provided above shall be interpreted in a manner that, to the + extent possible, most closely approximates an absolute + disclaimer and waiver of all liability. + +- Section 6 – Term and Termination. + + - a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply + with this Public License, then Your rights under this Public + License terminate automatically. + - b. Where Your right to use the Licensed Material has terminated + under Section 6(a), it reinstates: + + - 1. automatically as of the date the violation is cured, + provided it is cured within 30 days of Your discovery of the + violation; or + - 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect + any right the Licensor may have to seek remedies for Your + violations of this Public License. + + - c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing + so will not terminate this Public License. + - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +- Section 7 – Other Terms and Conditions. + + - a. The Licensor shall not be bound by any additional or + different terms or conditions communicated by You unless + expressly agreed. + - b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +- Section 8 – Interpretation. + + - a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could + lawfully be made without permission under this Public License. + - b. 
To the extent possible, if any provision of this Public + License is deemed unenforceable, it shall be automatically + reformed to the minimum extent necessary to make it enforceable. + If the provision cannot be reformed, it shall be severed from + this Public License without affecting the enforceability of the + remaining terms and conditions. + - c. No term or condition of this Public License will be waived + and no failure to comply consented to unless expressly agreed to + by the Licensor. + - d. Nothing in this Public License constitutes or may be + interpreted as a limitation upon, or waiver of, any privileges + and immunities that apply to the Licensor or You, including from + the legal processes of any jurisdiction or authority. + +Creative Commons is not a party to its public licenses. Notwithstanding, +Creative Commons may elect to apply one of its public licenses to +material it publishes and in those instances will be considered the +"Licensor." The text of the Creative Commons public licenses is +dedicated to the public domain under the CC0 Public Domain Dedication. +Except for the limited purpose of indicating that material is shared +under a Creative Commons public license or as otherwise permitted by the +Creative Commons policies published at creativecommons.org/policies, +Creative Commons does not authorize the use of the trademark "Creative +Commons" or any other trademark or logo of Creative Commons without its +prior written consent including, without limitation, in connection with +any unauthorized modifications to any of its public licenses or any +other arrangements, understandings, or agreements concerning use of +licensed material. For the avoidance of doubt, this paragraph does not +form part of the public licenses. + +Creative Commons may be contacted at creativecommons.org. 
diff --git a/docs/API/available_datasets.md b/docs/API/available_datasets.md deleted file mode 100644 index fa630b8a..00000000 --- a/docs/API/available_datasets.md +++ /dev/null @@ -1,3 +0,0 @@ -# Available Datasets - -::: openqdc.datasets diff --git a/docs/API/basedataset.md b/docs/API/basedataset.md new file mode 100644 index 00000000..cdaeee77 --- /dev/null +++ b/docs/API/basedataset.md @@ -0,0 +1 @@ +::: openqdc.datasets.base diff --git a/docs/API/datasets/alchemy.md b/docs/API/datasets/alchemy.md new file mode 100644 index 00000000..096774c3 --- /dev/null +++ b/docs/API/datasets/alchemy.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.alchemy diff --git a/docs/API/datasets/ani.md b/docs/API/datasets/ani.md new file mode 100644 index 00000000..4f79f587 --- /dev/null +++ b/docs/API/datasets/ani.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.ani diff --git a/docs/API/datasets/comp6.md b/docs/API/datasets/comp6.md new file mode 100644 index 00000000..e473e211 --- /dev/null +++ b/docs/API/datasets/comp6.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.comp6 diff --git a/docs/API/datasets/des.md b/docs/API/datasets/des.md new file mode 100644 index 00000000..dbff5035 --- /dev/null +++ b/docs/API/datasets/des.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.des diff --git a/docs/API/datasets/gdml.md b/docs/API/datasets/gdml.md new file mode 100644 index 00000000..a91cf993 --- /dev/null +++ b/docs/API/datasets/gdml.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.gdml diff --git a/docs/API/datasets/geom.md b/docs/API/datasets/geom.md new file mode 100644 index 00000000..f290eb93 --- /dev/null +++ b/docs/API/datasets/geom.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.geom.GEOM diff --git a/docs/API/datasets/iso_17.md b/docs/API/datasets/iso_17.md new file mode 100644 index 00000000..01a04e67 --- /dev/null +++ b/docs/API/datasets/iso_17.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.iso_17 diff --git a/docs/API/datasets/l7.md b/docs/API/datasets/l7.md new file mode 100644 index 00000000..512e7f37 --- /dev/null +++ b/docs/API/datasets/l7.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.l7 diff --git a/docs/API/datasets/md22.md b/docs/API/datasets/md22.md new file mode 100644 index 00000000..d793b5cf --- /dev/null +++ b/docs/API/datasets/md22.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.md22 diff --git a/docs/API/datasets/metcalf.md b/docs/API/datasets/metcalf.md new file mode 100644 index 00000000..58566b02 --- /dev/null +++ b/docs/API/datasets/metcalf.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.metcalf diff --git a/docs/API/datasets/molecule3d.md b/docs/API/datasets/molecule3d.md new file mode 100644 index 00000000..d7b6a5a4 --- /dev/null +++ b/docs/API/datasets/molecule3d.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.molecule3d diff --git a/docs/API/datasets/multixcqm9.md b/docs/API/datasets/multixcqm9.md new file mode 100644 index 00000000..55993cd7 --- /dev/null +++ b/docs/API/datasets/multixcqm9.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.multixcqm9 diff --git a/docs/API/datasets/nabladft.md b/docs/API/datasets/nabladft.md new file mode 100644 index 00000000..a69d68d7 --- /dev/null +++ b/docs/API/datasets/nabladft.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.nabladft diff --git a/docs/API/datasets/orbnet_denali.md b/docs/API/datasets/orbnet_denali.md new file mode 100644 index 00000000..1b4ee6a7 --- /dev/null +++ b/docs/API/datasets/orbnet_denali.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.orbnet_denali diff --git a/docs/API/datasets/pcqm.md 
b/docs/API/datasets/pcqm.md new file mode 100644 index 00000000..6cd1b92b --- /dev/null +++ b/docs/API/datasets/pcqm.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.pcqm diff --git a/docs/API/datasets/proteinfragments.md b/docs/API/datasets/proteinfragments.md new file mode 100644 index 00000000..d5aa28a5 --- /dev/null +++ b/docs/API/datasets/proteinfragments.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.proteinfragments diff --git a/docs/API/datasets/qm1b.md b/docs/API/datasets/qm1b.md new file mode 100644 index 00000000..b92dfff4 --- /dev/null +++ b/docs/API/datasets/qm1b.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qm1b diff --git a/docs/API/datasets/qm7x.md b/docs/API/datasets/qm7x.md new file mode 100644 index 00000000..d649b40d --- /dev/null +++ b/docs/API/datasets/qm7x.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qm7x diff --git a/docs/API/datasets/qmugs.md b/docs/API/datasets/qmugs.md new file mode 100644 index 00000000..06773b68 --- /dev/null +++ b/docs/API/datasets/qmugs.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qmugs diff --git a/docs/API/datasets/qmx.md b/docs/API/datasets/qmx.md new file mode 100644 index 00000000..b7343767 --- /dev/null +++ b/docs/API/datasets/qmx.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.qmx diff --git a/docs/API/datasets/revmd17.md b/docs/API/datasets/revmd17.md new file mode 100644 index 00000000..e63ba031 --- /dev/null +++ b/docs/API/datasets/revmd17.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.revmd17 diff --git a/docs/API/datasets/sn2_rxn.md b/docs/API/datasets/sn2_rxn.md new file mode 100644 index 00000000..9095532c --- /dev/null +++ b/docs/API/datasets/sn2_rxn.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.sn2_rxn diff --git a/docs/API/datasets/solvated_peptides.md b/docs/API/datasets/solvated_peptides.md new file mode 100644 index 00000000..a6139c12 --- /dev/null +++ b/docs/API/datasets/solvated_peptides.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.solvated_peptides diff --git a/docs/API/datasets/spice.md b/docs/API/datasets/spice.md new file mode 100644 index 00000000..c0e95b79 --- /dev/null +++ b/docs/API/datasets/spice.md @@ -0,0 +1,2 @@ + +::: openqdc.datasets.potential.spice diff --git a/docs/API/datasets/splinter.md b/docs/API/datasets/splinter.md new file mode 100644 index 00000000..00789cfa --- /dev/null +++ b/docs/API/datasets/splinter.md @@ -0,0 +1 @@ +::: openqdc.datasets.interaction.splinter diff --git a/docs/API/datasets/tmqm.md b/docs/API/datasets/tmqm.md new file mode 100644 index 00000000..70b56781 --- /dev/null +++ b/docs/API/datasets/tmqm.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.tmqm diff --git a/docs/API/datasets/transition1x.md b/docs/API/datasets/transition1x.md new file mode 100644 index 00000000..63eceaa3 --- /dev/null +++ b/docs/API/datasets/transition1x.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.transition1x diff --git a/docs/API/datasets/vqm24.md b/docs/API/datasets/vqm24.md new file mode 100644 index 00000000..ed117b9f --- /dev/null +++ b/docs/API/datasets/vqm24.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.vqm24 diff --git a/docs/API/datasets/waterclusters.md b/docs/API/datasets/waterclusters.md new file mode 100644 index 00000000..f1f90883 --- /dev/null +++ b/docs/API/datasets/waterclusters.md @@ -0,0 +1 @@ +::: openqdc.datasets.potential.waterclusters diff --git a/docs/API/datasets/waterclusters3_30.md b/docs/API/datasets/waterclusters3_30.md new file mode 100644 index 00000000..3f0ccf7f --- /dev/null +++ b/docs/API/datasets/waterclusters3_30.md @@ -0,0 +1 @@ +::: 
openqdc.datasets.potential.waterclusters3_30
diff --git a/docs/API/datasets/x40.md b/docs/API/datasets/x40.md
new file mode 100644
index 00000000..799738c5
--- /dev/null
+++ b/docs/API/datasets/x40.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.x40
diff --git a/docs/API/formats.md b/docs/API/formats.md
new file mode 100644
index 00000000..fab98169
--- /dev/null
+++ b/docs/API/formats.md
@@ -0,0 +1 @@
+::: openqdc.datasets.structure
diff --git a/docs/API/methods.md b/docs/API/methods.md
index 7814334e..ce1d94a4 100644
--- a/docs/API/methods.md
+++ b/docs/API/methods.md
@@ -1,3 +1,7 @@
 # QM Methods
-::: openqdc.methods
+::: openqdc.methods.enums
+
+# Isolated Atom Energies
+
+::: openqdc.methods.atom_energies
diff --git a/docs/API/regressor.md b/docs/API/regressor.md
new file mode 100644
index 00000000..dff0ad98
--- /dev/null
+++ b/docs/API/regressor.md
@@ -0,0 +1 @@
+::: openqdc.utils.regressor
diff --git a/docs/API/units.md b/docs/API/units.md
new file mode 100644
index 00000000..0401bdc4
--- /dev/null
+++ b/docs/API/units.md
@@ -0,0 +1,3 @@
+# UNITS
+
+::: openqdc.utils.units
diff --git a/docs/API/utils.md b/docs/API/utils.md
new file mode 100644
index 00000000..35fae5c8
--- /dev/null
+++ b/docs/API/utils.md
@@ -0,0 +1 @@
+::: openqdc.utils
diff --git a/docs/_overrides/main.html b/docs/_overrides/main.html
deleted file mode 100644
index 2eafd76b..00000000
--- a/docs/_overrides/main.html
+++ /dev/null
@@ -1,46 +0,0 @@
-{% extends "base.html" %}
-
-{% block content %}
-{{ super() }}
-
-
-{% endblock content %}
diff --git a/docs/assets/StorageView.png b/docs/assets/StorageView.png
new file mode 100644
index 00000000..8d398926
Binary files /dev/null and b/docs/assets/StorageView.png differ
diff --git a/docs/assets/qdc_logo.png b/docs/assets/qdc_logo.png
new file mode 100644
index 00000000..a8138dcc
Binary files /dev/null and b/docs/assets/qdc_logo.png differ
diff --git a/docs/cli.md b/docs/cli.md
new file mode 100644
index 00000000..8e2cd53b
--- /dev/null
+++ b/docs/cli.md
@@ -0,0 +1,113 @@
+# CLI for dataset downloading and uploading
+You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).
+
+## Datasets
+Print a formatted table of the available openQDC datasets and some information.
+
+Usage:
+
+    openqdc datasets [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+
+## Cache
+Get the current local cache path of openQDC.
+
+Usage:
+
+    openqdc cache [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+
+
+## Download
+Download preprocessed ml-ready datasets from the main openQDC hub.
+
+Usage:
+
+    openqdc download DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]
+    --cache-dir    Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]
+    --as-zarr    Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]
+    --gs    Which source to use for downloading. If True, Google Storage will be used. Otherwise, AWS S3 will be used. [default: no-gs]
+
+Example:
+
+    openqdc download Spice
+
+## Fetch
+Download the raw dataset files from the main openQDC hub.
+
+Note:
+
+    Special case: if the dataset is "all", "potential" or "interaction", all datasets of that group will be fetched.
+
+Usage:
+
+    openqdc fetch DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]
+    --cache-dir    Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]
+
+Example:
+
+    openqdc fetch Spice
+
+## Preprocess
+Preprocess a raw dataset (previously fetched) into an openqdc dataset and optionally push it to remote.
+
+Usage:
+
+    openqdc preprocess DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite the current cached datasets. [default: overwrite]
+    --upload    Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]
+    --as-zarr    Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]
+
+Example:
+
+    openqdc preprocess Spice QMugs
+
+## Upload
+Upload a preprocessed dataset to the remote storage.
+
+Usage:
+
+    openqdc upload DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite the remote files if they are present. [default: overwrite]
+    --as-zarr    Whether to upload the zarr files if available. [default: no-as-zarr]
+
+Example:
+
+    openqdc upload Spice --overwrite
+
+## Convert
+Convert a preprocessed dataset from a memmap dataset to a zarr dataset.
+
+Usage:
+
+    openqdc convert DATASETS... [OPTIONS]
+
+Options:
+
+    --help    Show this message and exit.
+    --overwrite    Whether to overwrite the current zarr cached datasets. [default: no-overwrite]
+    --download    Whether to force the re-download of the memmap datasets. [default: no-download]
diff --git a/docs/contribute.md b/docs/contribute.md
new file mode 100644
index 00000000..e0e22721
--- /dev/null
+++ b/docs/contribute.md
@@ -0,0 +1,59 @@
+# Contribute
+
+The following documents the development lifecycle of OpenQDC.
+
+## Set up a dev environment
+
+```bash
+mamba env create -n openqdc -f env.yml
+mamba activate openqdc
+pip install -e .
+```
+
+## Pre-commit installation
+
+```bash
+pre-commit install
+pre-commit run --all-files
+```
+
+## Continuous Integration
+
+OpenQDC uses Github Actions to:
+
+- **Build and test** `openQDC`.
+  - Multiple combinations of OS and Python versions are tested.
+- **Check** the code:
+  - Formatting with `black`.
+  - Static type check with `mypy`.
+  - Modules import formatting with `isort`.
+  - Pre-commit hooks.
+- **Documentation**:
+  - Google docstring format.
+  - Build and deploy the documentation on `main` and for every new git tag.
+
+
+## Run tests
+
+```bash
+pytest
+```
+
+## Build the documentation
+
+You can build and serve the documentation locally with:
+
+```bash
+# Build and serve the doc
+mike serve
+```
+
+or with
+
+```bash
+mkdocs serve
+```
+
+### Multi-versioning
+
+The doc is built for each push on `main` and for every git tag using [mike](https://github.com/jimporter/mike). Everything is automated using Github Actions. Please refer to the official mike documentation for the details.
diff --git a/docs/css/custom-openqdc.css b/docs/css/custom-openqdc.css
new file mode 100644
index 00000000..a1d97cf7
--- /dev/null
+++ b/docs/css/custom-openqdc.css
@@ -0,0 +1,92 @@
+:root {
+    --openqdc-primary: #201342;
+    --openqdc-secondary: #4A1E7E;
+
+    /* Primary color shades */
+    --md-primary-fg-color: var(--openqdc-primary);
+    --md-primary-fg-color--light: var(--openqdc-primary);
+    --md-primary-fg-color--dark: var(--openqdc-primary);
+    --md-primary-bg-color: var(--openqdc-secondary);
+    --md-primary-bg-color--light: var(--openqdc-secondary);
+    --md-text-link-color: var(--openqdc-secondary);
+
+    /* Accent color shades */
+    --md-accent-fg-color: var(--openqdc-secondary);
+    --md-accent-fg-color--transparent: var(--openqdc-secondary);
+    --md-accent-bg-color: var(--openqdc-secondary);
+    --md-accent-bg-color--light: var(--openqdc-secondary);
+  }
+
+  :root>* {
+    /* Code block color shades */
+    --md-code-bg-color: hsla(0, 0%, 96%, 1);
+    --md-code-fg-color: hsla(200, 18%, 26%, 1);
+
+    /* Footer */
+    --md-footer-bg-color: var(--openqdc-primary);
+    /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */
+    --md-footer-fg-color: var(--openqdc-secondary);
+    --md-footer-fg-color--light: var(--openqdc-secondary);
+    --md-footer-fg-color--lighter: var(--openqdc-secondary);
+
+  }
+
+  .md-header {
+    background-image: linear-gradient(to right, #131036, #4A1E7E);
+  }
+
+  .md-footer {
+    background-image: linear-gradient(to right, #131036, #4A1E7E);
+  }
+
+  .md-tabs {
+    background-image: linear-gradient(to right, #F4F6F9, #b39bce);
+  }
+
+  .md-header__topic {
+    color: rgb(255, 255, 255);
+  }
+
+  .md-source__repository,
+  .md-source__icon,
+  .md-search__input,
+  .md-search__input::placeholder,
+  .md-search__input~.md-search__icon,
+  .md-footer__inner.md-grid,
+  .md-copyright__highlight,
+  .md-copyright,
+  .md-footer-meta.md-typeset a,
+  .md-version {
+    color: rgb(255, 255, 255) !important;
+  }
+
+  .md-search__form {
+    background-color: rgba(255, 255, 255, 0.2);
+  }
+
+  .md-search__input {
+    color: #222222 !important;
+  }
+
+  .md-header__topic {
+    color: rgb(255, 255, 255);
+    font-size: 1.4em;
+  }
+
+  /* Increase the size of the logo */
+  .md-header__button.md-logo img,
+  .md-header__button.md-logo svg {
+    height: 2rem !important;
+  }
+
+  /* Reduce the margin around the logo */
+  .md-header__button.md-logo {
+    margin: 0.4em;
+    padding: 0.4em;
+  }
+
+  /* Remove the `In` and `Out` block in rendered Jupyter notebooks */
+  .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt,
+  .md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt {
+    display: none !important;
+  }
diff --git a/docs/data_storage.md b/docs/data_storage.md
new file mode 100644
index 00000000..b24bec3b
--- /dev/null
+++ b/docs/data_storage.md
@@ -0,0 +1,33 @@
+## Dataset structure
+
+For a dataset with N geometries, M atoms across all geometries, ne energy labels,
+and nf force labels, we use zarr or memory-mapped arrays of various sizes:
+
+- (M, 5) for the atomic numbers (1), charges (1), and positions (3) of individual geometries;
+
+- (N, 2) for the beginning and end indices of each geometry in the previous array;
+
+- (N, ne) for the energy labels of each geometry, extendable to store other geometry-level QM properties such as the HOMO-LUMO gap;
+
+- (M, nf, 3) for the force labels of each geometry, extendable to store other atom-level QM properties.
+
+
+The memory-mapped files efficiently access data stored on disk or in the cloud without reading
+them into memory, enabling training on machines with smaller RAM than the dataset size and
+accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing,
+batching and iteration.
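+
+As an illustration of the layout above, here is a minimal sketch of how one geometry can be
+sliced out of the flat arrays. The array names and shapes are illustrative assumptions for
+this sketch, not the exact files openQDC writes:
+
+```python
+import numpy as np
+
+# Illustrative toy shapes: M atoms in total, N geometries, ne energy labels.
+M, N, ne = 7, 2, 1
+
+# In practice these would be np.memmap / zarr arrays backed by the cached files;
+# plain arrays keep the sketch self-contained.
+atomic_inputs = np.zeros((M, 5), dtype=np.float32)  # atomic number, charge, x, y, z
+position_idx_range = np.array([[0, 3], [3, 7]], dtype=np.int32)  # begin/end row per geometry
+energies = np.zeros((N, ne), dtype=np.float64)  # energy labels per geometry
+
+
+def get_geometry(i):
+    """Recover the i-th geometry as a no-copy view into the flat atom table."""
+    begin, end = position_idx_range[i]
+    return atomic_inputs[begin:end], energies[i]
+
+
+atoms, labels = get_geometry(1)  # (4, 5) view and its energy labels
+```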
+ + +The memory-mapped files efficiently access data stored on disk or in the cloud without reading +them into memory, enabling training on machines with smaller RAM than the dataset size and +accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing, +batching and iteration. + +![](assets/StorageView.png) + + +## Formats + +We currently support the following formats: + +1) Zarr : https://zarr.readthedocs.io/en/stable/index.html + +2) Memmap : https://numpy.org/doc/stable/index.html diff --git a/docs/dataset_upload.md b/docs/dataset_upload.md new file mode 100644 index 00000000..e4740f75 --- /dev/null +++ b/docs/dataset_upload.md @@ -0,0 +1,69 @@ +# How to Add a Dataset to OpenQDC + +Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC? +If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways: + +1. Opening a PR to add a new dataset +2. Request a new dataset through Google Form + +## OpenQDC PR Guidelines + +Implement your dataset in the OpenQDC repository by following the guidelines below: + +### Dataset class + +- The dataset class should be implemented in the `openqdc/datasets` directory. +- The dataset class should inherit from the `openqdc.datasets.base.BaseDataset` class. +- Add your `dataset.py` file to the `openqdc/datasets/potential` or `openqdc/datasets/interaction/` directory based on the type of energy. +- Implement the following for your dataset: + - Add the metadata of the dataset: + - Docstrings for the dataset class. Docstrings should report links and references to the dataset. A small description and if possible, the sampling strategy used to generate the dataset. + - `__links__`: Dictionary of name and link to download the dataset. + - `__name__`: Name of the dataset. This will create a folder with the name of the dataset in the cache directory. + - The original units for the dataset `__energy_unit__` and `__distance_unit__`. + - `__force_mask__`: Boolean to indicate if the dataset has forces. Or if multiple forces are present. A list of booleans. + - `__energy_methods__`: List of the `QmMethod` methods present in the dataset. + - `read_raw_entries(self)` -> `List[Dict[str, Any]]`: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format. Look at data storage. This data should have the following keys: + - `atomic_inputs` : Atomic inputs of the molecule. numpy.Float32. + - `name`: Atomic numbers of the atoms in the molecule. numpy.Object. + - `subset`: Positions of the atoms in the molecule. numpy.Object. + - `energies`: Energies of the molecule. numpy.Float64. + - `n_atoms`: Number of atoms in the molecule. numpy.Int32 + - `forces`: Forces of the molecule. [Optional] numpy.Float32. + - Add the dataset import to the `openqdc/datasets//__init__.py` file and to `openqdc/__init__.py`. + +### Test the dataset + +Try to run the openQDC CLI pipeline with the dataset you implemented. + +Run the following command to download the dataset: + +- Fetch the dataset files +```bash +openqdc fetch DATASET_NAME +``` + +- Preprocess the dataset +```bash +openqdc preprocess DATASET_NAME +``` + +- Load it on python and check if the dataset is correctly loaded. +```python +from openqdc import DATASET_NAME +ds=DATASET_NAME() +``` + +If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC. + +- Select for your PR the `dataset` label. 
+
+### Test the dataset
+
+Try to run the openQDC CLI pipeline with the dataset you implemented.
+
+Run the following commands to test your implementation:
+
+- Fetch the dataset files
+```bash
+openqdc fetch DATASET_NAME
+```
+
+- Preprocess the dataset
+```bash
+openqdc preprocess DATASET_NAME
+```
+
+- Load it in Python and check if the dataset is correctly loaded.
+```python
+from openqdc import DATASET_NAME
+ds = DATASET_NAME()
+```
+
+If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.
+
+- Select the `dataset` label for your PR.
+
+Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to the OpenQDC remote storage.
+
+## OpenQDC Google Form
+
+Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you.
+You can fill out the Google Form [here](https://docs.google.com/forms/d/e/1FAIpQLSeh0YHRn-OoqPpUbrL7G-EOu3LtZC24rtQWwbjJaZ-2V8P2vQ/viewform?usp=sf_link).
+
+The openQDC team strives to provide high-quality curation and uploads, so please be patient while the team reviews the dataset and carries out the necessary steps to ensure it is uploaded correctly.
diff --git a/docs/index.md b/docs/index.md
index 264211f1..db497b10 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,30 +1,65 @@
-# openQDC
+# Overview
 
-Open Quantum Data Commons
+OpenQDC is a Python library to work with quantum datasets. It's a package aimed at providing a simple and efficient way to download, load and utilize various datasets, and it provides a way to standardize the data for easy use in machine learning models.
 
-## Setup Datasets
+- 🐍 Simple pythonic API
+- 🕹️ ML-Ready: all you manipulate are `torch.Tensor`, `jax.Array` or `numpy.Array` objects.
+- ⚛️ Quantum Ready: the quantum methods are checked and standardized to provide additional values.
+- ✅ Standardized: the datasets are written in standard and performant formats with annotated metadata like units and labels.
+- 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc).
+- 📈 Data: have access to 1.5+ billion datapoints.
 
-Use the scripts in `setup/` to download the datasets. For more information, see the [README](setup/README.md) in the `setup/` directory.
+Visit our website at TOFILL.
+
+## Installation
+
+Use mamba:
 
-# Install the library in dev mode
 ```bash
-# Install the deps
-mamba env create -n qdc -f env.yml
+mamba install -c conda-forge openqdc
+```
 
-# Activate the environment
-mamba activate qdc
+_**Tips:** You can replace `mamba` with `conda`._
 
-# Install the qdc library in dev mode
-pip install -e .
+_**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._
 
-```
+## Quick API Tour
 
-## Development lifecycle
+```python
+from openqdc import Spice
 
-### Tests
+# Load the original dataset
+dataset = Spice()
 
-You can run tests locally with:
+# Load the dataset with different units
+dataset = Spice(
+    energy_unit = "kcal/mol",
+    distance_unit = "ang",
+    energy_type = "formation",
+    array_format = "torch"
+)
 
-```bash
-pytest .
+# Access the data
+data = dataset[0]
+
+# Get relevant statistics
+dataset.get_statistics()
+
+# Get dataset metadata
+dataset.average_n_atoms
+dataset.chemical_species
+dataset.charges
+
+# Compute physical descriptors
+dataset.calculate_descriptors(
+    descriptor_name = "soap"
+)
 ```
+
+## How to cite
+
+Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link).
+
+## Compatibilities
+
+OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.
diff --git a/docs/licensing.md b/docs/licensing.md
new file mode 100644
index 00000000..ec5a3857
--- /dev/null
+++ b/docs/licensing.md
@@ -0,0 +1,3 @@
+```
+{!LICENSE!}
+```
diff --git a/docs/normalization_e0s.md b/docs/normalization_e0s.md
new file mode 100644
index 00000000..426e7d0d
--- /dev/null
+++ b/docs/normalization_e0s.md
@@ -0,0 +1,38 @@
+# Overview of QM Methods and Normalization
+
+OpenQDC supports 250+ QM methods and provides a way to standardize and categorize
+the different levels of theory used for Quantum Mechanics Single Point Calculations,
+to add value and information to the datasets.
+
+## Level of Theory
+
+To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums
+consisting of a functional, a basis set, and a correction method.
+OpenQDC covers more than 106 functionals, 20 basis sets, and 11
+correction methods.
+OpenQDC also provides the computed isolated atom energies `e0` for each QM method.
+
+
+## Normalization
+
+We support "physical" and "regression" normalization of energies in order to conserve the
+size extensivity of chemical systems.
+Through this normalization, OpenQDC can transform the potential energy into atomization energy
+by subtracting the isolated atom energies `e0`, a physically interpretable and
+extensivity-conserving normalization method. Alternatively, we precompute the average
+contribution of each atom species to the potential energy via linear or ridge regression,
+centering the distribution at 0 and providing uncertainty estimation for the computed
+values. Predicted atomic energies can also be scaled to approximate a standard normal distribution.
+
+### Physical Normalization
+
+`e0` energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from
+the potential energy to obtain the atomization energy. This normalization method is physically interpretable and
+only removes the atom energy contribution from the potential energy.
+
+
+### Regression Normalization
+
+`e0` energies are calculated for each atom in the dataset by fitting a regression model to the potential energy.
+The `e0` energies are then subtracted from the potential energy to obtain the atomization energy. This normalization
+provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy.
+The resulting formation energy is centered at 0.
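+
+As a usage sketch, both normalization schemes can be selected when loading a dataset
+(assuming the Spice dataset is available locally); the `energy_type` and `regressor_kwargs`
+options follow the `BaseDataset` constructor:
+
+```python
+from openqdc import Spice
+
+# Physical normalization: subtract the precomputed isolated atom energies `e0`.
+ds_formation = Spice(energy_type="formation")
+
+# Regression normalization: `e0` contributions fitted per atom species
+# with a linear or ridge solver.
+ds_regression = Spice(
+    energy_type="regression",
+    regressor_kwargs={"solver_type": "ridge", "sub_sample": None, "stride": 1},
+)
+```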
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 00000000..af62f453
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,42 @@
+# Usage
+
+## How to use
+
+OpenQDC has been designed to be used with a single import:
+
+```python
+import openqdc as qdc
+dataset = qdc.QM9()
+```
+
+All `openQDC` functions are available under `qdc`.
+Or if you want to directly import a specific dataset:
+
+```python
+from openqdc import Spice
+# Spice dataset with distance unit in angstrom instead of bohr
+dataset = Spice(distance_unit="ang",
+    array_format = "jax"
+)
+dataset[0] # dict of jax arrays
+```
+
+Or if you prefer handling `ase.Atoms` objects:
+
+```python
+dataset.get_ase_atoms(0)
+```
+
+## Iterators
+
+OpenQDC provides a simple way to get the data as iterators:
+
+```python
+for data in dataset.as_iter(atoms=True):
+    print(data)  # Atoms object
+    break
+```
+
+## Lazy loading
+
+OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`.
diff --git a/env.yml b/env.yml
index 16ccc3c2..87a9ccac 100644
--- a/env.yml
+++ b/env.yml
@@ -11,10 +11,15 @@ dependencies:
   - gcsfs
   - typer
   - prettytable
+  - s3fs
+  - pydantic
+  - python-dotenv
 
   # Scientific
   - pandas
   - numpy
+  - zarr
 
   # Chem
   - datamol #==0.9.0
@@ -36,6 +41,7 @@ dependencies:
   - ruff
 
   # Doc
+  - mike
   - mkdocs
   - mkdocs-material
   - mkdocs-material-extensions
diff --git a/mkdocs.yml b/mkdocs.yml
index caac43c9..fdb8856a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,39 +1,91 @@
-site_name: "Open Quantum Data Commons (openQDC)"
+site_name: "OpenQDC"
 site_description: "I don't know... Something about data and Quantum stuff I guess :D"
-site_url: "https://github.com/OpenDrugDiscovery/openQDC"
 repo_url: "https://github.com/OpenDrugDiscovery/openQDC"
 repo_name: "openQDC"
 
 copyright: Copyright 2023 Valence Labs
+site_url: "https://github.com/OpenDrugDiscovery/openQDC"
 remote_branch: "privpage"
 use_directory_urls: false
 docs_dir: "docs"
 
+# Fail on warnings to detect issues with types and docstrings
+strict: true
+
 nav:
   - Overview: index.md
+  - Usage:
+    - Base usage : usage.md
+    - CLI: cli.md
   - Available Datasets: datasets.md
-  #- Tutorials:
-  #  #- Really hard example: tutorials/usage.ipynb
+  - QM methods: normalization_e0s.md
+  - Data structure: data_storage.md
+  - Tutorials:
+    - Really hard example: tutorials/usage.ipynb
   - API:
-    - Datasets: API/available_datasets.md
-    - Isolated Atoms Energies: API/isolated_atom_energies.md
+    - QM methods: API/methods.md
+    - Normalization regressor: API/regressor.md
+    - Main class: API/basedataset.md
+    - Format loading: API/formats.md
+    - Datasets:
+      - Potential Energy:
+        - Alchemy : API/datasets/alchemy.md
+        - ANI : API/datasets/ani.md
+        - Spice : API/datasets/spice.md
+        - GEOM : API/datasets/geom.md
+        - Qmugs : API/datasets/qmugs.md
+        - ISO_17 : API/datasets/iso_17.md
+        - Comp6 : API/datasets/comp6.md
+        - GDML : API/datasets/gdml.md
+        - Molecule3D : API/datasets/molecule3d.md
+        - Orbnet Denali : API/datasets/orbnet_denali.md
+        - SN2 RXN : API/datasets/sn2_rxn.md
+        - QM7X : API/datasets/qm7x.md
+        - QM1B : API/datasets/qm1b.md
+        - NablaDFT : API/datasets/nabladft.md
+        - Solvated Peptides : API/datasets/solvated_peptides.md
+        - Waterclusters3_30 : API/datasets/waterclusters3_30.md
+        - SCAN Waterclusters : API/datasets/waterclusters.md
+        - TMQM : API/datasets/tmqm.md
+        - PCQM : API/datasets/pcqm.md
+        - RevMD17 : API/datasets/revmd17.md
+        - MD22 : API/datasets/md22.md
+        - Transition1X : API/datasets/transition1x.md
+        - MultixcQM9 : API/datasets/multixcqm9.md
+        - QMX : API/datasets/qmx.md
+        - Protein Fragments : API/datasets/proteinfragments.md
+        - VQM24 : API/datasets/vqm24.md
+      - Interaction Energy:
+        - DES : API/datasets/des.md
+        - L7 : API/datasets/l7.md
+        - X40 : API/datasets/x40.md
+        - Metcalf : API/datasets/metcalf.md
+        - Splinter : API/datasets/splinter.md
+    - Units: API/units.md
+    - Utils: API/utils.md
+  - Contribute:
+    - Maintaining: contribute.md
+    - Add a dataset: dataset_upload.md
+  - License: licensing.md
+
 
 theme:
   name: material
-  custom_dir: docs/_overrides
-  palette:
-    primary: teal
-    accent: purple
+  #custom_dir: docs/_overrides
   features:
     - navigation.tabs
-    - navigation.expand
+    #- navigation.expand
+  #favicon: assets/qdc_logo.png
+  logo: assets/qdc_logo.png
 
 extra_css:
   - css/custom.css
+  - css/custom-openqdc.css
 
 extra_javascript:
   - javascripts/config.js
  - 
https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js + #- https://unpkg.com/mermaid@10.9.0/dist/mermaid.min.js markdown_extensions: - admonition @@ -53,11 +105,14 @@ markdown_extensions: - toc: permalink: true +watch: + - openqdc/ + plugins: - search - mkdocstrings: - watch: - - openqdc/ + #watch: + # - openqdc/ handlers: python: setup_commands: @@ -69,7 +124,11 @@ plugins: rendering: show_root_heading: yes heading_level: 3 - show_if_no_docstring: true + show_if_no_docstring: false - mkdocs-jupyter: execute: False # kernel_name: python3 + +extra: + version: + provider: mike diff --git a/openqdc/__init__.py b/openqdc/__init__.py index 7e2eb2cf..c6be72d4 100644 --- a/openqdc/__init__.py +++ b/openqdc/__init__.py @@ -15,6 +15,7 @@ def get_project_root(): "__version__": "openqdc._version", "BaseDataset": "openqdc.datasets.base", # POTENTIAL + "Alchemy": "openqdc.datasets.potential.alchemy", "ANI1": "openqdc.datasets.potential.ani", "ANI1CCX": "openqdc.datasets.potential.ani", "ANI1CCX_V2": "openqdc.datasets.potential.ani", @@ -39,6 +40,7 @@ def get_project_root(): "NablaDFT": "openqdc.datasets.potential.nabladft", "SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides", "WaterClusters": "openqdc.datasets.potential.waterclusters3_30", + "SCANWaterClusters": "openqdc.datasets.potential.waterclusters", "TMQM": "openqdc.datasets.potential.tmqm", "PCQM_B3LYP": "openqdc.datasets.potential.pcqm", "PCQM_PM6": "openqdc.datasets.potential.pcqm", @@ -47,6 +49,13 @@ def get_project_root(): "Transition1X": "openqdc.datasets.potential.transition1x", "MultixcQM9": "openqdc.datasets.potential.multixcqm9", "MultixcQM9_V2": "openqdc.datasets.potential.multixcqm9", + "QM7": "openqdc.datasets.potential.qmx", + "QM7b": "openqdc.datasets.potential.qmx", + "QM8": "openqdc.datasets.potential.qmx", + "QM9": "openqdc.datasets.potential.qmx", + "ProteinFragments": "openqdc.datasets.potential.proteinfragments", + "MDDataset": "openqdc.datasets.potential.proteinfragments", + "VQM24": "openqdc.datasets.potential.vqm24", # INTERACTION "DES5M": "openqdc.datasets.interaction.des", "DES370K": "openqdc.datasets.interaction.des", @@ -58,6 +67,7 @@ def get_project_root(): "Splinter": "openqdc.datasets.interaction.splinter", # DEBUG "Dummy": "openqdc.datasets.potential.dummy", + "PredefinedDataset": "openqdc.datasets.potential.dummy", # ALL "AVAILABLE_DATASETS": "openqdc.datasets", "AVAILABLE_POTENTIAL_DATASETS": "openqdc.datasets.potential", @@ -105,9 +115,10 @@ def __dir__(): from .datasets.interaction.x40 import X40 # POTENTIAL + from .datasets.potential.alchemy import Alchemy from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X from .datasets.potential.comp6 import COMP6 - from .datasets.potential.dummy import Dummy + from .datasets.potential.dummy import Dummy, PredefinedDataset from .datasets.potential.gdml import GDML from .datasets.potential.geom import GEOM from .datasets.potential.iso_17 import ISO17 @@ -117,13 +128,17 @@ def __dir__(): from .datasets.potential.nabladft import NablaDFT from .datasets.potential.orbnet_denali import OrbnetDenali from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6 + from .datasets.potential.proteinfragments import MDDataset, ProteinFragments from .datasets.potential.qm1b import QM1B, QM1B_SMALL from .datasets.potential.qm7x import QM7X, QM7X_V2 from .datasets.potential.qmugs import QMugs, QMugs_V2 + from .datasets.potential.qmx import QM7, QM8, QM9, QM7b from .datasets.potential.revmd17 import RevMD17 from .datasets.potential.sn2_rxn import 
SN2RXN
     from .datasets.potential.solvated_peptides import SolvatedPeptides
     from .datasets.potential.spice import Spice, SpiceV2, SpiceVL2
     from .datasets.potential.tmqm import TMQM
     from .datasets.potential.transition1x import Transition1X
+    from .datasets.potential.vqm24 import VQM24
+    from .datasets.potential.waterclusters import SCANWaterClusters
     from .datasets.potential.waterclusters3_30 import WaterClusters
diff --git a/openqdc/cli.py b/openqdc/cli.py
index 1d985090..7b32c9ae 100644
--- a/openqdc/cli.py
+++ b/openqdc/cli.py
@@ -1,3 +1,4 @@
+import os
 from typing import List, Optional
 
 import typer
@@ -12,27 +13,40 @@
     AVAILABLE_INTERACTION_DATASETS,
     AVAILABLE_POTENTIAL_DATASETS,
 )
+from openqdc.utils.io import get_local_cache
 
 app = typer.Typer(help="OpenQDC CLI")
 
 
 def sanitize(dictionary):
+    """
+    Sanitize dataset names to be used in the CLI.
+    """
     return {k.lower().replace("_", "").replace("-", ""): v for k, v in dictionary.items()}
 
 
SANITIZED_AVAILABLE_DATASETS = sanitize(AVAILABLE_DATASETS)
 
 
-def exist_dataset(dataset):
+def exist_dataset(dataset) -> bool:
+    """
+    Check if dataset is available in the openQDC datasets.
+    """
     if dataset not in sanitize(AVAILABLE_DATASETS):
         logger.error(f"{dataset} is not available. Please open an issue on Github for the team to look into it.")
         return False
     return True
 
 
-def format_entry(empty_dataset):
+def format_entry(empty_dataset, max_num_to_display: int = 6):
+    """
+    Format the entry for the table.
+
+    max_num_to_display:
+        Maximum number of energy methods to display, to keep the table format
+        readable for datasets with many energy methods (e.g. MultixcQM9).
+    """
     energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
-    max_num_to_display = 6
+
+    if len(energy_methods) > max_num_to_display:
         entry = ",".join(energy_methods[:max_num_to_display]) + "..."
     else:
@@ -46,7 +60,7 @@ def download(
     overwrite: Annotated[
         bool,
         typer.Option(
-            help="Whether to overwrite or force the re-download of the datasets.",
+            help="Whether to force the re-download of the datasets and overwrite the current cached dataset.",
         ),
     ] = False,
     cache_dir: Annotated[
@@ -55,6 +69,19 @@
         typer.Option(
             help="Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used.",
         ),
     ] = None,
+    as_zarr: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to use a zarr format for the datasets instead of memmap.",
+        ),
+    ] = False,
+    gs: Annotated[
+        bool,
+        typer.Option(
+            help="Which source to use for downloading. If True, Google Storage will be used. "
+            + "Otherwise, AWS S3 will be used.",
+        ),
+    ] = False,
 ):
     """
     Download preprocessed ml-ready datasets from the main openQDC hub.
 
     Example:
         openqdc download Spice QMugs
     """
+    if gs:
+        os.environ["OPENQDC_DOWNLOAD_API"] = "gs"
+
     for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
         if exist_dataset(dataset):
-            if SANITIZED_AVAILABLE_DATASETS[dataset].no_init().is_cached() and not overwrite:
+            ds = SANITIZED_AVAILABLE_DATASETS[dataset].no_init()
+            ds.read_as_zarr = as_zarr
+            if ds.is_cached() and not overwrite:
                 logger.info(f"{dataset} is already cached. Skipping download")
             else:
-                SANITIZED_AVAILABLE_DATASETS[dataset](overwrite_local_cache=True, cache_dir=cache_dir)
+                SANITIZED_AVAILABLE_DATASETS[dataset](
+                    overwrite_local_cache=True, cache_dir=cache_dir, read_as_zarr=as_zarr, skip_statistics=True
+                )
 
 
 @app.command()
 def datasets():
     """
-    Print a table of the available openQDC datasets and some informations.
+    Print a formatted table of the available openQDC datasets and some information.
     """
     table = PrettyTable(["Name", "Type of Energy", "Forces", "Level of theory"])
     for dataset in AVAILABLE_DATASETS:
@@ -98,7 +132,7 @@ def fetch(
     overwrite: Annotated[
         bool,
         typer.Option(
-            help="Whether to overwrite or force the re-download of the files.",
+            help="Whether to overwrite or force the re-download of the raw files.",
         ),
     ] = False,
     cache_dir: Annotated[
@@ -109,17 +143,14 @@
         ),
     ] = None,
 ):
     """
-    Download the raw datasets files from the main openQDC hub.
-    overwrite: bool = False,
-        If True, the files will be re-downloaded and overwritten.
-    cache_dir: Optional[str] = None,
-        Path to the cache. If not provided, the default cache directory will be used.
-    Special case: if the dataset is "all", "potential", "interaction".
-        all: all available datasets will be downloaded.
-        potential: all the potential datasets will be downloaded
-        interaction: all the interaction datasets will be downloaded
-    Example:
-        openqdc fetch Spice
+    Download the raw dataset files from the main openQDC hub.\n
+    Special case: if the dataset is "all", "potential", "interaction".\n
+    all: all available datasets will be downloaded.\n
+    potential: all the potential datasets will be downloaded\n
+    interaction: all the interaction datasets will be downloaded\n\n
+
+    Example:\n
+        openqdc fetch Spice
     """
     if datasets[0].lower() == "all":
         dataset_names = list(sanitize(AVAILABLE_DATASETS).keys())
@@ -143,18 +174,27 @@ def preprocess(
     overwrite: Annotated[
         bool,
         typer.Option(
-            help="Whether to overwrite or force the re-download of the datasets.",
+            help="Whether to overwrite the current cached datasets.",
         ),
     ] = True,
     upload: Annotated[
         bool,
         typer.Option(
-            help="Whether to try the upload to the remote storage.",
+            help="Whether to attempt the upload to the remote storage. Must have write permissions.",
+        ),
+    ] = False,
+    as_zarr: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to preprocess as a zarr format or a memmap format.",
         ),
     ] = False,
 ):
     """
     Preprocess a raw dataset (previously fetched) into a openqdc dataset and optionally push it to remote.
+
+    Example:
+        openqdc preprocess Spice QMugs
     """
     for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
         if exist_dataset(dataset):
@@ -166,5 +206,137 @@
             raise e
 
 
+@app.command()
+def upload(
+    datasets: List[str],
+    overwrite: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to overwrite the remote files if they are present.",
+        ),
+    ] = True,
+    as_zarr: Annotated[
+        bool,
+        typer.Option(
+            help="Whether to upload the zarr files if available.",
+        ),
+    ] = False,
+):
+    """
+    Upload a preprocessed dataset to the remote storage.
+
+    Example:
+        openqdc upload Spice --overwrite
+    """
+    for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
+        if exist_dataset(dataset):
+            logger.info(f"Uploading {SANITIZED_AVAILABLE_DATASETS[dataset].__name__}")
+            try:
+                SANITIZED_AVAILABLE_DATASETS[dataset](skip_statistics=True).upload(overwrite=overwrite, as_zarr=as_zarr)
+            except Exception as e:
+                logger.error(f"Error while uploading {dataset}. {e}. 
Did you preprocess the dataset first?") + raise e + + +@app.command() +def convert( + datasets: List[str], + overwrite: Annotated[ + bool, + typer.Option( + help="Whether to overwrite the current zarr cached datasets.", + ), + ] = False, + download: Annotated[ + bool, + typer.Option( + help="Whether to force the re-download of the memmap datasets.", + ), + ] = False, +): + """ + Convert a preprocessed dataset from a memmap dataset to a zarr dataset. + """ + import os + from os.path import join as p_join + + import numpy as np + import zarr + + from openqdc.utils.io import load_pkl + + def silent_remove(filename): + """ + Zarr zip files are currently not overwritable. This function is used to remove the file if it exists. + """ + try: + os.remove(filename) + except OSError: + pass + + for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)): + if exist_dataset(dataset): + logger.info(f"Converting {SANITIZED_AVAILABLE_DATASETS[dataset].__name__}") + try: + ds = SANITIZED_AVAILABLE_DATASETS[dataset](overwrite_local_cache=download, skip_statistics=True) + # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True) + + pkl = load_pkl(p_join(ds.preprocess_path, "props.pkl")) + metadata = p_join(ds.preprocess_path, "metadata.zip") + if overwrite: + silent_remove(metadata) + group = zarr.group(zarr.storage.ZipStore(metadata)) + for key, value in pkl.items(): + # sub=group.create_group(key) + if key in ["name", "subset"]: + data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype) + data[:] = value[0][:] + data2 = group.create_dataset(key + "_ptr", shape=value[1].shape, dtype=np.int32) + data2[:] = value[1][:] + else: + data = group.create_dataset(key, shape=value.shape, dtype=value.dtype) + data[:] = value[:] + + force_attrs = { + "unit": str(ds.force_unit), + "level_of_theory": ds.force_methods, + } + + energy_attrs = {"unit": str(ds.energy_unit), "level_of_theory": ds.energy_methods} + + atomic_inputs_attrs = { + "unit": str(ds.distance_unit), + } + attrs = {"forces": force_attrs, "energies": energy_attrs, "atomic_inputs": atomic_inputs_attrs} + + # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True) + for key, value in ds.data.items(): + if key not in ds.data_keys: + continue + print(key, value.shape) + + zarr_path = p_join(ds.preprocess_path, key + ".zip") # ds.__name__, + if overwrite: + silent_remove(zarr_path) + z = zarr.open( + zarr.storage.ZipStore(zarr_path), "w", zarr_version=2, shape=value.shape, dtype=value.dtype + ) + z[:] = value[:] + if key in attrs: + z.attrs.update(attrs[key]) + + except Exception as e: + logger.error(f"Error while converting {dataset}. {e}. 
Did you preprocess the dataset first?") + raise e + + +@app.command() +def cache(): + """ + Get the current local cache path of openQDC + """ + print(f"openQDC local cache:\n {get_local_cache()}") + + if __name__ == "__main__": app() diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py index 026bfd75..8a480125 100644 --- a/openqdc/datasets/base.py +++ b/openqdc/datasets/base.py @@ -1,13 +1,18 @@ """The BaseDataset defining shared functionality between all datasets.""" import os -import pickle as pkl + +try: + from collections.abc import Iterable +except ImportError: + from collections import Iterable from functools import partial from itertools import compress from os.path import join as p_join from typing import Callable, Dict, List, Optional, Union import numpy as np +from ase import Atoms from ase.io.extxyz import write_extxyz from loguru import logger from sklearn.utils import Bunch @@ -22,6 +27,7 @@ StatisticManager, TotalEnergyStats, ) +from openqdc.datasets.structure import MemMapDataset, ZarrDataset from openqdc.utils.constants import MAX_CHARGE, NB_ATOMIC_FEATURES from openqdc.utils.descriptors import get_descriptor from openqdc.utils.exceptions import ( @@ -32,7 +38,6 @@ copy_exists, dict_to_atoms, get_local_cache, - pull_locally, push_remote, set_cache_dir, ) @@ -76,6 +81,7 @@ class BaseDataset(DatasetPropertyMixIn): energy_target_names = [] force_target_names = [] + read_as_zarr = False __energy_methods__ = [] __force_mask__ = [] __isolated_atom_energies__ = [] @@ -99,7 +105,9 @@ def __init__( cache_dir: Optional[str] = None, recompute_statistics: bool = False, transform: Optional[Callable] = None, - regressor_kwargs={ + skip_statistics: bool = False, + read_as_zarr: bool = False, + regressor_kwargs: Dict = { "solver_type": "linear", "sub_sample": None, "stride": 1, @@ -107,29 +115,28 @@ def __init__( ) -> None: """ - Parameters - ---------- - energy_unit - Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"] - distance_unit - Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"] - array_format - Format to return arrays in. Supported formats: ["numpy", "torch", "jax"] - energy_type - Type of isolated atom energy to use for the dataset. Default: "formation" - Supported types: ["formation", "regression", "null", None] - overwrite_local_cache - Whether to overwrite the locally cached dataset. - cache_dir - Cache directory location. Defaults to "~/.cache/openqdc" - recompute_statistics - Whether to recompute the statistics of the dataset. - transform, optional - transformation to apply to the __getitem__ calls - regressor_kwargs - Dictionary of keyword arguments to pass to the regressor. - Default: {"solver_type": "linear", "sub_sample": None, "stride": 1} - solver_type can be one of ["linear", "ridge"] + Parameters: + energy_unit: + Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"] + distance_unit: + Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"] + array_format: + Format to return arrays in. Supported formats: ["numpy", "torch", "jax"] + energy_type: + Type of isolated atom energy to use for the dataset. Default: "formation" + Supported types: ["formation", "regression", "null", None] + overwrite_local_cache: + Whether to overwrite the locally cached dataset. + cache_dir: + Cache directory location. Defaults to "~/.cache/openqdc" + recompute_statistics: + Whether to recompute the statistics of the dataset. 
+ transform: + transformation to apply to the __getitem__ calls + regressor_kwargs: + Dictionary of keyword arguments to pass to the regressor. + Default: {"solver_type": "linear", "sub_sample": None, "stride": 1} + solver_type can be one of ["linear", "ridge"] """ set_cache_dir(cache_dir) # self._init_lambda_fn() @@ -138,8 +145,10 @@ def __init__( self.recompute_statistics = recompute_statistics self.regressor_kwargs = regressor_kwargs self.transform = transform + self.read_as_zarr = read_as_zarr self.energy_type = energy_type if energy_type is not None else "null" self.refit_e0s = recompute_statistics or overwrite_local_cache + self.skip_statistics = skip_statistics if not self.is_preprocessed(): raise DatasetNotAvailableError(self.__name__) else: @@ -152,6 +161,12 @@ def _init_lambda_fn(self): self._fn_distance = lambda x: x self._fn_forces = lambda x: x + @property + def dataset_wrapper(self): + if not hasattr(self, "_dataset_wrapper"): + self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset() + return self._dataset_wrapper + @property def config(self): assert len(self.__links__) > 0, "No links provided for fetching" @@ -171,7 +186,8 @@ def _post_init( ) -> None: self._set_units(None, None) self._set_isolated_atom_energies() - self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) + if not self.skip_statistics: + self._precompute_statistics(overwrite_local_cache=overwrite_local_cache) self._set_units(energy_unit, distance_unit) self._convert_data() self._set_isolated_atom_energies() @@ -331,6 +347,10 @@ def convert_forces(self, x): def set_energy_unit(self, value: str): """ Set a new energy unit for the dataset. + + Parameters: + value: + New energy unit to set. """ # old_unit = self.energy_unit # self.__energy_unit__ = value @@ -340,6 +360,10 @@ def set_energy_unit(self, value: str): def set_distance_unit(self, value: str): """ Set a new distance unit for the dataset. + + Parameters: + value: + New distance unit to set. """ # old_unit = self.distance_unit # self.__distance_unit__ = value @@ -351,9 +375,22 @@ def set_array_format(self, format: str): self.array_format = format def read_raw_entries(self): + """ + Preprocess the raw (aka from the fetched source) into a list of dictionaries. + """ raise NotImplementedError - def collate_list(self, list_entries): + def collate_list(self, list_entries: List[Dict]) -> Dict: + """ + Collate a list of entries into a single dictionary. + + Parameters: + list_entries: + List of dictionaries containing the entries to collate. + + Returns: + Dictionary containing the collated entries. + """ # concatenate entries res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} @@ -364,54 +401,29 @@ def collate_list(self, list_entries): return res - def save_preprocess(self, data_dict, upload=False, overwrite=True): + def save_preprocess( + self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False + ): """ Save the preprocessed data to the cache directory and optionally upload it to the remote storage. - data_dict : dict - Dictionary containing the preprocessed data. - upload : bool, Defult: False - Whether to upload the preprocessed data to the remote storage or only saving it locally. - overwrite : bool, Default: False - Whether to overwrite the preprocessed data if it already exists. - Only used if upload is True. Cache is always overwritten locally. 
+
+        Parameters:
+            data_dict:
+                Dictionary containing the preprocessed data.
+            upload:
+                Whether to upload the preprocessed data to the remote storage or only save it locally.
+            overwrite:
+                Whether to overwrite the preprocessed data if it already exists.
+                Only used if upload is True. Cache is always overwritten locally.
+            as_zarr:
+                Whether to save the data in zarr format instead of memmap.
         """
         # save memmaps
         logger.info("Preprocessing data and saving it to cache.")
-        for key in self.data_keys:
-            local_path = p_join(self.preprocess_path, f"{key}.mmap")
-            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
-            out[:] = data_dict.pop(key)[:]
-            out.flush()
-            if upload:
-                push_remote(local_path, overwrite=overwrite)
-
-        # save smiles and subset
-        local_path = p_join(self.preprocess_path, "props.pkl")
-
-        # assert that (required) pkl keys are present in data_dict
-        assert all([key in data_dict.keys() for key in self.pkl_data_keys])
-
-        # store unique and inverse indices for str-based pkl keys
-        for key in self.pkl_data_keys:
-            if self.pkl_data_types[key] == str:
-                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
-
-        with open(local_path, "wb") as f:
-            pkl.dump(data_dict, f)
+        paths = self.dataset_wrapper.save_preprocess(
+            self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types
+        )
         if upload:
-            push_remote(local_path, overwrite=overwrite)
-
-    def _convert_on_loading(self, x, key):
-        if key == "energies":
-            return self.convert_energy(x)
-        elif key == "forces":
-            return self.convert_forces(x)
-        elif key == "atomic_inputs":
-            x = np.array(x, dtype=np.float32)
-            x[:, -3:] = self.convert_distance(x[:, -3:])
-            return x
-        else:
-            return x
+            for local_path in paths:
+                push_remote(local_path, overwrite=overwrite)  # make it async?
 
     def read_preprocess(self, overwrite_local_cache=False):
         logger.info("Reading preprocessed data.")
@@ -421,62 +433,106 @@
             Distance: {self.distance_unit},\n\
             Forces: {self.force_unit if self.force_methods else 'None'}"
         )
-        self.data = {}
-        for key in self.data_keys:
-            filename = p_join(self.preprocess_path, f"{key}.mmap")
-            pull_locally(filename, overwrite=overwrite_local_cache)
-            self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(*self.data_shapes[key])
-
-        filename = p_join(self.preprocess_path, "props.pkl")
-        pull_locally(filename, overwrite=overwrite_local_cache)
-        with open(filename, "rb") as f:
-            tmp = pkl.load(f)
-            all_pkl_keys = set(tmp.keys()) - set(self.data_keys)
-            # assert required pkl_keys are present in all_pkl_keys
-            assert all([key in all_pkl_keys for key in self.pkl_data_keys])
-            for key in all_pkl_keys:
-                x = tmp.pop(key)
-                if len(x) == 2:
-                    self.data[key] = x[0][x[1]]
-                else:
-                    self.data[key] = x
+        self.data = self.dataset_wrapper.load_data(
+            self.preprocess_path,
+            self.data_keys,
+            self.data_types,
+            self.data_shapes,
+            self.pkl_data_keys,
+            overwrite_local_cache,
+        )  # this should be async if possible
         for key in self.data:
             logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
 
-    def is_preprocessed(self):
+    def _convert_on_loading(self, x, key):
+        if key == "energies":
+            return self.convert_energy(x)
+        elif key == "forces":
+            return self.convert_forces(x)
+        elif key == "atomic_inputs":
+            x = np.array(x, dtype=np.float32)
+            x[:, -3:] = self.convert_distance(x[:, -3:])
+            return x
+        else:
+            return x
+
+    def is_preprocessed(self) -> bool:
         """
         Check if the dataset is preprocessed and available online or locally.
+
+        Returns:
+            True if the dataset is available remotely or locally, False otherwise.
         """
-        predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
+        predicats = [
+            copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f"{key}")))
+            for key in self.data_keys
+        ]
+        predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]
         return all(predicats)
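The inline memmap handling removed above now lives behind the `MemMapDataset`/`ZarrDataset` wrappers. For readers new to the on-disk format, a minimal sketch of the same memmap round-trip, with an illustrative file name and shape:

```python
import numpy as np

# Write: flatten to disk exactly like the removed save_preprocess code did.
energies = np.arange(6, dtype=np.float64).reshape(3, 2)
out = np.memmap("energies.mmap", mode="w+", dtype=energies.dtype, shape=energies.shape)
out[:] = energies[:]
out.flush()

# Read: open read-only and reshape from the recorded shape/dtype metadata,
# mirroring what read_preprocess now delegates to the wrapper.
loaded = np.memmap("energies.mmap", mode="r", dtype=np.float64).reshape(3, 2)
assert np.allclose(energies, loaded)
```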
-    def is_cached(self):
+    def is_cached(self) -> bool:
         """
         Check if the dataset is cached locally.
+
+        Returns:
+            True if the dataset is cached locally, False otherwise.
         """
-        predicats = [os.path.exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
-        predicats += [os.path.exists(p_join(self.preprocess_path, "props.pkl"))]
+        predicats = [
+            os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f"{key}")))
+            for key in self.data_keys
+        ]
+        predicats += [os.path.exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]
         return all(predicats)
 
-    def preprocess(self, upload: bool = False, overwrite: bool = True):
+    def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):
         """
         Preprocess the dataset and save it.
-        upload : bool, Defult: False
-            Whether to upload the preprocessed data to the remote storage or only saving it locally.
-        overwrite : bool, Default: False
-            Whether to overwrite the preprocessed data if it already exists.
-            Only used if upload is True. Cache is always overwritten locally.
+
+        Parameters:
+            upload:
+                Whether to upload the preprocessed data to the remote storage or only save it locally.
+            overwrite:
+                Whether to overwrite the preprocessed data if it already exists.
+                Only used if upload is True. Cache is always overwritten locally.
+            as_zarr:
+                Whether to save the data as zarr files
         """
         if overwrite or not self.is_preprocessed():
             entries = self.read_raw_entries()
             res = self.collate_list(entries)
-            self.save_preprocess(res, upload, overwrite)
+            self.save_preprocess(res, upload, overwrite, as_zarr)
+
+    def upload(self, overwrite: bool = False, as_zarr: bool = False):
+        """
+        Upload the preprocessed data to the remote storage. Must be called after preprocess and
+        requires write privileges.
+
+        Parameters:
+            overwrite:
+                Whether to overwrite the remote data if it already exists
+            as_zarr:
+                Whether to upload the data as zarr files
+        """
+        for key in self.data_keys:
+            local_path = p_join(self.preprocess_path, f"{key}.mmap" if not as_zarr else f"{key}.zip")
+            push_remote(local_path, overwrite=overwrite)
+        local_path = p_join(self.preprocess_path, "props.pkl" if not as_zarr else "metadata.zip")
+        push_remote(local_path, overwrite=overwrite)
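`upload` assumes `preprocess` already produced the local files. The intended sequence is roughly the following sketch (assuming the raw data was fetched beforehand and you have write access to the remote storage; `no_init` is used as in the CLI commands above):

```python
from openqdc.datasets import Spice  # any registered dataset behaves the same

ds = Spice.no_init()  # skip __init__ checks, as the CLI does before preprocessing
ds.preprocess(upload=False, overwrite=True, as_zarr=True)  # write zarr zips locally
ds.upload(overwrite=True, as_zarr=True)  # push {key}.zip and metadata.zip to remote
```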
-    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext=True):
+    def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):
         """
-        Save the entry at index idx as an extxyz file.
+        Save a single entry at index idx as an extxyz file.
+
+        Parameters:
+            idx:
+                Index of the entry
+            energy_method:
+                Index of the energy method to use
+            path:
+                Path to save the xyz file. If None, the current working directory is used.
+            ext:
+                Whether to include additional information like forces and other metadata (extxyz format)
         """
         if path is None:
             path = os.getcwd()
@@ -486,6 +542,12 @@
     def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):
         """
         Save dataset as single xyz file (extended xyz format).
+
+        Parameters:
+            energy_method:
+                Index of the energy method to use
+            path:
+                Path to save the xyz file
         """
         with open(p_join(path if path else os.getcwd(), f"{self.__name__}.xyz"), "w") as f:
             for atoms in tqdm(
@@ -495,16 +557,20 @@
             ):
                 write_extxyz(f, atoms, append=True)
 
-    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext=True):
+    def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:
         """
         Get the ASE atoms object for the entry at index idx.
 
-        Parameters
-        ----------
-        idx : int
-            Index of the entry.
-        ext : bool, optional
-            Whether to include additional informations
+        Parameters:
+            idx:
+                Index of the entry.
+            energy_method:
+                Index of the energy method to use
+            ext:
+                Whether to include additional information
+
+        Returns:
+            ASE atoms object
         """
         entry = self[idx]
         at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)
@@ -537,24 +603,23 @@
         """
         Compute the descriptors for the dataset.
 
-        Parameters
-        ----------
-        descriptor_name : str
-            Name of the descriptor to use. Supported descriptors are ["soap"]
-        chemical_species : Optional[List[str]], optional
-            List of chemical species to use for the descriptor computation, by default None.
-            If None, the chemical species of the dataset are used.
-        n_samples : Optional[Union[List[int],int, float]], optional
-            Number of samples to use for the computation, by default None. If None, all the dataset is used.
-            If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.
-        progress : bool, optional
-            Whether to show a progress bar, by default True.
-        **descriptor_kwargs : dict
-            Keyword arguments to pass to the descriptor instantiation of the model.
-
-        Returns
-        -------
-        Dict[str, np.ndarray]
+        Parameters:
+            descriptor_name:
+                Name of the descriptor to use. Supported descriptors are ["soap"]
+            chemical_species:
+                List of chemical species to use for the descriptor computation, by default None.
+                If None, the chemical species of the dataset are used.
+            n_samples:
+                Number of samples to use for the computation, by default None.
+                If None, all the dataset is used.
+                If a list of integers is provided, the descriptors are computed for
+                each of the specified idx of samples.
+            progress:
+                Whether to show a progress bar, by default True.
+            **descriptor_kwargs:
+                Keyword arguments to pass to the descriptor instantiation of the model.
+
+        Returns:
             Dictionary containing the following keys:
                 - values : np.ndarray of shape (N, M) containing the descriptors for the dataset
                 - idxs : np.ndarray of shape (N,) containing the indices of the samples used
@@ -577,14 +642,18 @@
             datum["idxs"] = idxs
         return datum
 
-    def as_iter(self, atoms: bool = False, energy_method: int = 0):
+    def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:
         """
         Return the dataset as an iterator.
- Parameters - ---------- - atoms : bool, optional - Whether to return the items as ASE atoms object, by default False + Parameters: + atoms: + Whether to return the items as ASE atoms object, by default False + energy_method: + Index of the energy method to use + + Returns: + Iterator of the dataset """ func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__ @@ -592,12 +661,21 @@ def as_iter(self, atoms: bool = False, energy_method: int = 0): for i in range(len(self)): yield func(i) - def get_statistics(self, return_none: bool = True): + def __iter__(self): + for idxs in range(len(self)): + yield self[idxs] + + def get_statistics(self, return_none: bool = True) -> Dict: """ Get the converted statistics of the dataset. - return_none : bool, optional - Whether to return None if the statistics for the forces are not available, by default True - Otherwise, the statistics for the forces are set to 0.0 + + Parameters: + return_none : + Whether to return None if the statistics for the forces are not available, by default True + Otherwise, the statistics for the forces are set to 0.0 + + Returns: + Dictionary containing the statistics of the dataset """ selected_stats = self.statistics.get_results() if len(selected_stats) == 0: diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py index d90be07e..6788c41c 100644 --- a/openqdc/datasets/interaction/des.py +++ b/openqdc/datasets/interaction/des.py @@ -74,13 +74,19 @@ def _create_subsets(self, **kwargs): class DES370K(BaseInteractionDataset, IDES): """ - DE Shaw Research interaction energy of over 370K - small molecule dimers as described in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x + DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries with interaction energies + computed at CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral molecules + and ions) including water and functional groups found in proteins. Dimer geometries are generated using + QM-based optimization with DF-LMP2/aVDZ level of theory and MD-based from condensed phase MD simulations. + + Usage: + ```python + from openqdc.datasets import DES370K + dataset = DES370K() + ``` + + Reference: + https://www.nature.com/articles/s41597-021-00833-x """ __name__ = "des370k_interaction" @@ -173,13 +179,18 @@ def read_raw_entries(self) -> List[Dict]: class DES5M(DES370K): """ - DE Shaw Research interaction energy calculations for - over 5M small molecule dimers as described in the paper: - - Quantum chemical benchmark databases of gold-standard dimer interaction energies. - Donchev, A.G., Taube, A.G., Decolvenaere, E. et al. - Sci Data 8, 55 (2021). - https://doi.org/10.1038/s41597-021-00833-x + DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies + computed using SNS-MP2, a machine learning approach. The unique geometries are generated similar to DES370K using + QM based optimization and MD simulations. 
+
+    Usage:
+    ```python
+    from openqdc.datasets import DES5M
+    dataset = DES5M()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/s41597-021-00833-x
     """
 
     __name__ = "des5m_interaction"
@@ -242,18 +253,19 @@ class DES5M(DES370K):
 
 
 class DESS66(DES370K):
     """
-    DE Shaw Research interaction energy
-    estimates of all 66 conformers from
-    the original S66 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-    https://zenodo.org/records/5676284
+    DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS
+    dimer interaction energies with 1 equilibrium geometry each, giving 66 conformers in total.
+    The protocol for estimating energies is based on the DES370K paper.
+
+    Usage:
+    ```python
+    from openqdc.datasets import DESS66
+    dataset = DESS66()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/s41597-021-00833-x\n
+        S66: https://pubs.acs.org/doi/10.1021/ct2002946
     """
 
     __name__ = "des_s66"
@@ -266,19 +278,18 @@ def _create_subsets(self, **kwargs):
 
 
 class DESS66x8(DESS66):
     """
-    DE Shaw Research interaction energy
-    estimates of all 528 conformers from
-    the original S66x8 dataset as described
-    in the paper:
-
-    Quantum chemical benchmark databases of gold-standard dimer interaction energies.
-    Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
-    Sci Data 8, 55 (2021).
-    https://doi.org/10.1038/s41597-021-00833-x
-
-    Data was downloaded from Zenodo:
-
-    https://zenodo.org/records/5676284
+    DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS
+    dimer interaction energies, with 8 geometries along the dissociation curve for each complex
+    (including the equilibrium geometry), giving 528 conformers in total. The protocol for estimating
+    energies is based on the DES370K paper.
+
+    Usage:
+    ```python
+    from openqdc.datasets import DESS66x8
+    dataset = DESS66x8()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/s41597-021-00833-x
     """
 
     __name__ = "des_s66x8"
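To make the rewritten docstrings concrete, loading one of the DES datasets and inspecting a single entry looks roughly like this (a sketch; `get_ase_atoms` is documented further up in this diff):

```python
from openqdc.datasets import DES370K

ds = DES370K(energy_unit="kcal/mol")
atoms = ds.get_ase_atoms(0)  # first dimer as an ase.Atoms object
print(atoms.get_chemical_formula())
```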
diff --git a/openqdc/datasets/interaction/l7.py b/openqdc/datasets/interaction/l7.py
index 75a63cd5..7307638c 100644
--- a/openqdc/datasets/interaction/l7.py
+++ b/openqdc/datasets/interaction/l7.py
@@ -7,15 +7,18 @@
 
 class L7(YamlDataset):
     """
-    The L7 interaction energy dataset as described in:
-
-    Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
-    Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
-    DOI: 10.1021/ct400036b
-
-    Data was downloaded and extracted from:
-    http://cuby4.molecular.cz/dataset_l7.html
+    The L7 interaction energy dataset consists of 7 dispersion stabilized non-covalent complexes with
+    energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are
+    taken from crystal X-ray data and optimized with a DFT method specific to the complex.
+
+    Usage:
+    ```python
+    from openqdc.datasets import L7
+    dataset = L7()
+    ```
+
+    Reference:
+        https://pubs.acs.org/doi/10.1021/ct400036b
     """
 
     __name__ = "l7"
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index faf5324f..889370e8 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -84,20 +84,19 @@ def read_xyz(fname, subset):
 
 class Metcalf(BaseInteractionDataset):
     """
-    Hydrogen-bonded dimers of NMA with 126 molecules as described in:
-
-    Approaches for machine learning intermolecular interaction energies and
-    application to energy components from symmetry adapted perturbation theory.
-    Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus,
-    Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill;
-    J. Chem. Phys. 21 February 2020; 152 (7): 074103.
-    https://doi.org/10.1063/1.5142636
-
-    Further details:
-    "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules
-    (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries
-    for the 126 individual monomers were obtained and paired with NMA in broad
-    arrays of spatial configurations to generate thousands of complexes for training.
+    Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to
+    156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and
+    the cc-pVTZ basis set. SAPT(0) calculations are performed to compute interaction energies and their various
+    components.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Metcalf
+    dataset = Metcalf()
+    ```
+
+    Reference:
+        https://doi.org/10.1063/1.5142636
     """
 
     __name__ = "metcalf"
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index bda10129..6ba3b4d5 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -12,13 +12,18 @@
 
 class Splinter(BaseInteractionDataset):
     """
-    A dataset of over 1.7 million protein-ligand
-    interactions as described in the paper:
+    Splinter consists of 30,416 dimer pairs with over 1.5 million geometries. The geometries are generated
+    by quantum mechanical optimization with the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies
+    and their various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.
 
-    A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions.
-    Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al.
-    Sci Data 10, 619 (2023).
-    https://doi.org/10.1038/s41597-023-02443-1
+    Usage:
+    ```python
+    from openqdc.datasets import Splinter
+    dataset = Splinter()
+    ```
+
+    Reference:
+        https://doi.org/10.1038/s41597-023-02443-1
     """
 
     __energy_unit__ = "kcal/mol"
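Interaction datasets can also be streamed entry by entry; a hedged sketch using `as_iter`, which this PR documents on `BaseDataset`:

```python
from openqdc.datasets import Splinter

ds = Splinter()
it = ds.as_iter(atoms=True, energy_method=0)  # generator of ase.Atoms objects
first = next(it)
print(first.info)  # extxyz-style metadata attached when building the Atoms object
```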
diff --git a/openqdc/datasets/interaction/x40.py b/openqdc/datasets/interaction/x40.py
index 64da5d87..d56a976d 100644
--- a/openqdc/datasets/interaction/x40.py
+++ b/openqdc/datasets/interaction/x40.py
@@ -8,16 +8,21 @@
 
 class X40(YamlDataset):
     """
-    X40 interaction dataset of 40 dimer pairs as
-    introduced in the following paper:
-
-    Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
-    Jan Řezáč, Kevin E. Riley, and Pavel Hobza
-    Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
-    DOI: 10.1021/ct300647k
-
-    Dataset retrieved and processed from:
-    http://cuby4.molecular.cz/dataset_x40.html
+    X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules
+    where the halogens participate in various interaction types such as electrostatic interactions, London
+    dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic
+    molecules. For each complex 10 geometries are generated, resulting in 400 geometries in the dataset. The
+    geometries are optimized using the MP2 level of theory with the cc-pVTZ basis set, whereas the interaction
+    energies are computed at the CCSD(T)/CBS level of theory.
+
+    Usage:
+    ```python
+    from openqdc.datasets import X40
+    dataset = X40()
+    ```
+
+    Reference:
+        https://pubs.acs.org/doi/10.1021/ct300647k
     """
 
     __name__ = "x40"
diff --git a/openqdc/datasets/io.py b/openqdc/datasets/io.py
index 1e621f72..7316768b 100644
--- a/openqdc/datasets/io.py
+++ b/openqdc/datasets/io.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable, List, Optional
+from typing import Callable, Dict, List, Optional
 
 import datamol as dm
 import numpy as np
@@ -17,6 +17,8 @@ def try_retrieve(obj, callable, default=None):
 
 
 class FromFileDataset(BaseDataset, ABC):
+    """Abstract class for datasets that read from a common format file like xyz, netcdf, gro, hdf5, etc."""
+
     def __init__(
         self,
         path: List[str],
@@ -28,6 +30,7 @@ def __init__(
         array_format: Optional[str] = "numpy",
         level_of_theory: Optional[QmMethod] = None,
         transform: Optional[Callable] = None,
+        skip_statistics: bool = False,
         regressor_kwargs={
             "solver_type": "linear",
             "sub_sample": None,
@@ -35,18 +38,37 @@ def __init__(
         },
     ):
         """
-        Create a dataset from a xyz file.
+        Create a dataset from a list of files.
 
         Parameters
         ----------
         path : List[str]
             The path to the file or a list of paths.
+        dataset_name : Optional[str], optional
+            The name of the dataset, by default None.
+        energy_type : Optional[str], optional
+            The type of isolated atom energy, by default "regression".
+            Supported types: ["formation", "regression", "null", None]
+        energy_unit
+            Energy unit of the dataset. Default is "hartree".
+        distance_unit
+            Distance unit of the dataset. Default is "ang".
+        level_of_theory: Optional[QmMethod, str]
+            The level of theory of the dataset.
+            Used if energy_type is "formation" to fetch the correct isolated atom energies.
+        transform, optional
+            transformation to apply to the __getitem__ calls
+        skip_statistics : bool, optional
+            Whether to skip the computation of the dataset statistics, by default False.
+        regressor_kwargs
+            Dictionary of keyword arguments to pass to the regressor.
+            Default: {"solver_type": "linear", "sub_sample": None, "stride": 1}
+            solver_type can be one of ["linear", "ridge"]
         """
         self.path = [path] if isinstance(path, str) else path
         self.__name__ = self.__class__.__name__ if dataset_name is None else dataset_name
         self.recompute_statistics = True
         self.refit_e0s = True
         self.energy_type = energy_type
+        self.skip_statistics = skip_statistics
         self.__energy_unit__ = energy_unit
         self._original_unit = self.energy_unit
         self.__distance_unit__ = distance_unit
@@ -62,29 +84,19 @@ def __init__(
         self.set_array_format(array_format)
         self._post_init(True, energy_unit, distance_unit)
 
-    def __str__(self):
-        return self.__name__.lower()
-
-    def __repr__(self):
-        return str(self)
-
     @abstractmethod
     def read_as_atoms(self, path: str) -> List[Atoms]:
         """
-        Method that reads a path and return a list of Atoms objects.
+ Method that reads a file and return a list of Atoms objects. + path : str + The path to the file. """ raise NotImplementedError - def collate_list(self, list_entries): - res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]} - csum = np.cumsum(res.get("n_atoms")) - x = np.zeros((csum.shape[0], 2), dtype=np.int32) - x[1:, 0], x[:, 1] = csum[:-1], csum - res["position_idx_range"] = x - - return res - - def read_raw_entries(self): + def read_raw_entries(self) -> List[Dict]: + """ + Process the files and return a list of data objects. + """ entries_list = [] for path in self.path: for entry in self.read_as_atoms(path): @@ -96,6 +108,11 @@ def _read_and_preprocess(self): self.data = self.collate_list(entries_list) def _convert_to_record(self, obj: Atoms): + """ + Convert an Atoms object to a record for the openQDC dataset processing. + obj : Atoms + The ase.Atoms object to convert + """ name = obj.info.get("name", None) subset = obj.info.get("subset", str(self)) positions = obj.positions @@ -116,8 +133,18 @@ def _convert_to_record(self, obj: Atoms): n_atoms=np.array([len(positions)], dtype=np.int32), ) + def __str__(self): + return self.__name__.lower() + + def __repr__(self): + return str(self) + class XYZDataset(FromFileDataset): + """ + Baseclass to read datasets from xyz and extxyz files. + """ + def read_as_atoms(self, path): from ase.io import iread diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py index b59fcad7..35721dde 100644 --- a/openqdc/datasets/potential/__init__.py +++ b/openqdc/datasets/potential/__init__.py @@ -1,6 +1,7 @@ +from .alchemy import Alchemy from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X from .comp6 import COMP6 -from .dummy import Dummy +from .dummy import Dummy, PredefinedDataset from .gdml import GDML from .geom import GEOM from .iso_17 import ISO17 @@ -10,18 +11,23 @@ from .nabladft import NablaDFT from .orbnet_denali import OrbnetDenali from .pcqm import PCQM_B3LYP, PCQM_PM6 +from .proteinfragments import MDDataset, ProteinFragments from .qm1b import QM1B, QM1B_SMALL from .qm7x import QM7X, QM7X_V2 from .qmugs import QMugs, QMugs_V2 +from .qmx import QM7, QM8, QM9, QM7b from .revmd17 import RevMD17 from .sn2_rxn import SN2RXN from .solvated_peptides import SolvatedPeptides from .spice import Spice, SpiceV2, SpiceVL2 from .tmqm import TMQM from .transition1x import Transition1X +from .vqm24 import VQM24 +from .waterclusters import SCANWaterClusters from .waterclusters3_30 import WaterClusters AVAILABLE_POTENTIAL_DATASETS = { + "Alchemy": Alchemy, "ANI1": ANI1, "ANI1CCX": ANI1CCX, "ANI1CCX_V2": ANI1CCX_V2, @@ -42,6 +48,10 @@ "QMugs_V2": QMugs_V2, "QM1B": QM1B, "QM1B_SMALL": QM1B_SMALL, + "QM7": QM7, + "QM7b": QM7b, + "QM8": QM8, + "QM9": QM9, "SN2RXN": SN2RXN, "SolvatedPeptides": SolvatedPeptides, "Spice": Spice, @@ -50,8 +60,12 @@ "TMQM": TMQM, "Transition1X": Transition1X, "WaterClusters": WaterClusters, + "SCANWaterClusters": SCANWaterClusters, "MultixcQM9": MultixcQM9, "MultixcQM9_V2": MultixcQM9_V2, "RevMD17": RevMD17, "MD22": MD22, + "VQM24": VQM24, + "ProteinFragments": ProteinFragments, + "MDDataset": MDDataset, } diff --git a/openqdc/datasets/potential/alchemy.py b/openqdc/datasets/potential/alchemy.py new file mode 100644 index 00000000..24c17cd9 --- /dev/null +++ b/openqdc/datasets/potential/alchemy.py @@ -0,0 +1,95 @@ +from os.path import join as p_join + +import datamol as dm +import numpy as np +import pandas as pd +from tqdm import tqdm + 
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+# ['gdb_idx', 'atom number', 'zpve\n(Ha, zero point vibrational energy)',
+# 'Cv\n(cal/molK, heat capacity at 298.15 K)', 'gap\n(Ha, LUMO-HOMO)',
+# 'G\n(Ha, Free energy at 298.15 K)', 'HOMO\n(Ha, energy of HOMO)',
+# 'U\n(Ha, internal energy at 298.15 K)', 'alpha\n(a_0^3, Isotropic polarizability)',
+# 'U0\n(Ha, internal energy at 0 K)', 'H\n(Ha, enthalpy at 298.15 K)',
+# 'LUMO\n(Ha, energy of LUMO)', 'mu\n(D, dipole moment)',
+# 'R2\n(a_0^2, electronic spatial extent)']
+
+
+def read_mol(file, energy):
+    try:
+        mol = dm.read_sdf(file, remove_hs=False)[0]
+        positions = mol.GetConformer().GetPositions()
+        x = get_atomic_number_and_charge(mol)
+        n_atoms = positions.shape[0]
+        res = dict(
+            atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+            name=np.array([dm.to_smiles(mol)]),
+            energies=np.array([energy], dtype=np.float64)[:, None],
+            n_atoms=np.array([n_atoms], dtype=np.int32),
+            subset=np.array([f"atoms_{n_atoms}"]),
+        )
+
+    except Exception as e:
+        print(f"Skipping due to {e}")
+        res = None
+
+    return res
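`read_mol` above packs each molecule into the openQDC record schema; in particular `atomic_inputs` is a flat (n_atoms, 5) float32 array. An illustrative sketch of that layout (the values are made up):

```python
import numpy as np

# Columns: [atomic_number, formal_charge, x, y, z], positions in angstrom.
z_and_charge = np.array([[6, 0], [8, 0]], dtype=np.float32)  # a C and an O atom
positions = np.array([[0.0, 0.0, 0.0], [1.2, 0.0, 0.0]], dtype=np.float32)
atomic_inputs = np.concatenate((z_and_charge, positions), axis=-1).reshape(-1, 5)
assert atomic_inputs.shape == (2, 5)
```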
+# B3LYP/6-31G(2df,p) model with the density fitting
+# approximation for electron repulsion integrals. The auxiliary basis is cc-pVDZ-jkfit.
+
+
+class Alchemy(BaseDataset):
+    """
+    Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.
+    Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level
+    with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used
+    to parse the SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second,
+    HF/STO3G is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the
+    B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The
+    auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange
+    matrix.
+
+    Usage:
+    ```python
+    from openqdc.datasets import Alchemy
+    dataset = Alchemy()
+    ```
+
+    Reference:
+        https://arxiv.org/abs/1906.09427
+        https://alchemy.tencent.com/
+    """
+
+    __name__ = "alchemy"
+
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __links__ = {"alchemy.zip": "https://alchemy.tencent.com/data/alchemy-v20191129.zip"}
+
+    def read_raw_entries(self):
+        dir_path = p_join(self.root, "Alchemy-v20191129")
+        full_csv = pd.read_csv(p_join(dir_path, "final_version.csv"))
+        energies = full_csv["U0\n(Ha, internal energy at 0 K)"].tolist()
+        atom_folder = full_csv["atom number"]
+        gdb_idx = full_csv["gdb_idx"]
+        idxs = full_csv.index.tolist()
+        samples = []
+        for i in tqdm(idxs):
+            sdf_file = p_join(dir_path, f"atom_{atom_folder[i]}", f"{gdb_idx[i]}.sdf")
+            energy = energies[i]
+            samples.append(read_mol(sdf_file, energy))
+        return samples
diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
index bcff384f..aac35635 100644
--- a/openqdc/datasets/potential/ani.py
+++ b/openqdc/datasets/potential/ani.py
@@ -39,19 +39,22 @@ def extract_ani2_entries(properties):
 
 class ANI1(BaseDataset):
     """
-    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
-    organic molecules with energy labels calculated using DFT. The molecules
-    contain 4 distinct atoms, C, N, O and H.
-
-    Usage
+    The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic
+    molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the
+    wB97x density functional and the 6-31G(d) basis set. To generate structures, SMILES strings for the molecules
+    are converted to 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary
+    point using the MMFF94 force field. Finally, geometries are optimized to energy minima using the chosen DFT
+    level.
+
+    Usage:
     ```python
     from openqdc.datasets import ANI1
     dataset = ANI1()
     ```
 
     References:
-    - ANI-1: https://www.nature.com/articles/sdata2017193
-    - Github: https://github.com/aiqm/ANI1x_datasets
+        https://www.nature.com/articles/sdata2017193\n
+        https://github.com/aiqm/ANI1x_datasets
     """
 
     __name__ = "ani1"
@@ -79,9 +82,6 @@ def config(self):
         return dict(dataset_name="ani", links=self.__links__)
 
     def __smiles_converter__(self, x):
-        """util function to convert string to smiles: useful if the smiles is
-        encoded in a different format than its display format
-        """
         return "-".join(x.decode("ascii").split("-")[:-1])
 
     @property
@@ -96,64 +96,23 @@ def read_raw_entries(self):
         return samples
 
 
-class ANI1CCX(ANI1):
-    """
-    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset. The selected
-    conformations are then labelled using a high accuracy CCSD(T)*/CBS method.
-
-    Usage
-    ```python
-    from openqdc.datasets import ANI1CCX
-    dataset = ANI1CCX()
-    ```
-
-    References:
-    - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
-    - Github: https://github.com/aiqm/ANI1x_datasets
-    """
-
-    __name__ = "ani1ccx"
-    __energy_unit__ = "hartree"
-    __distance_unit__ = "ang"
-    __forces_unit__ = "hartree/ang"
-
-    __energy_methods__ = [
-        PotentialMethod.NONE,  # "ccsd(t)/cbs",
-        PotentialMethod.NONE,  # "ccsd(t)/cc-pvdz",
-        PotentialMethod.NONE,  # "ccsd(t)/cc-pvtz",
-        PotentialMethod.NONE,  # "tccsd(t)/cc-pvdz",
-    ]
-
-    energy_target_names = [
-        "CCSD(T)*:CBS Total Energy",
-        "NPNO-CCSD(T):cc-pVDZ Correlation Energy",
-        "NPNO-CCSD(T):cc-pVTZ Correlation Energy",
-        "TPNO-CCSD(T):cc-pVDZ Correlation Energy",
-    ]
-    force_target_names = []
-    __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}
-
-    def __smiles_converter__(self, x):
-        """util function to convert string to smiles: useful if the smiles is
-        encoded in a different format than its display format
-        """
-        return x
-
-
 class ANI1X(ANI1):
     """
     The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to
-    a total of 5,496,771 conformers with 63,865 unique molecules.
+    a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,
+    generated amino acids and 2-amino acid peptides are used for sampling new molecules. One of four techniques
+    is used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and
+    (4) torsion sampling.
 
-    Usage
+    Usage:
     ```python
     from openqdc.datasets import ANI1X
     dataset = ANI1X()
     ```
 
     References:
-    - ANI-1x: https://doi.org/10.1063/1.5023802
-    - Github: https://github.com/aiqm/ANI1x_datasets
+        https://doi.org/10.1063/1.5023802\n
+        https://github.com/aiqm/ANI1x_datasets
     """
 
     __name__ = "ani1x"
@@ -162,14 +121,14 @@ class ANI1X(ANI1):
     __forces_unit__ = "hartree/ang"
 
     __energy_methods__ = [
-        "hf/cc-pvdz",
-        "hf/cc-pvqz",
-        "hf/cc-pvtz",
-        "mp2/cc-pvdz",
-        "mp2/cc-pvqz",
-        "mp2/cc-pvtz",
-        "wb97x/6-31g(d)",
-        "wb97x/cc-pvtz",
+        PotentialMethod.NONE,  # "hf/cc-pvdz",
+        PotentialMethod.NONE,  # "hf/cc-pvqz",
+        PotentialMethod.NONE,  # "hf/cc-pvtz",
+        PotentialMethod.NONE,  # "mp2/cc-pvdz",
+        PotentialMethod.NONE,  # "mp2/cc-pvqz",
+        PotentialMethod.NONE,  # "mp2/cc-pvtz",
+        PotentialMethod.NONE,  # "wb97x/6-31g(d)",
+        PotentialMethod.NONE,  # "wb97x/cc-pvtz",
     ]
 
     energy_target_names = [
@@ -194,6 +153,47 @@ class ANI1X(ANI1):
     def convert_forces(self, x):
         return super().convert_forces(x) * 0.529177249  # correct the Dataset error
 
+    def __smiles_converter__(self, x):
+        return x
+
+
+class ANI1CCX(ANI1):
+    """
+    ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of the ANI-1X dataset using active
+    learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.
+ + Usage: + ```python + from openqdc.datasets import ANI1CCX + dataset = ANI1CCX() + ``` + + References: + https://doi.org/10.1038/s41467-019-10827-4\n + https://github.com/aiqm/ANI1x_datasets + """ + + __name__ = "ani1ccx" + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + + __energy_methods__ = [ + PotentialMethod.NONE, # "ccsd(t)/cbs", + PotentialMethod.NONE, # "ccsd(t)/cc-pvdz", + PotentialMethod.NONE, # "ccsd(t)/cc-pvtz", + PotentialMethod.NONE, # "tccsd(t)/cc-pvdz", + ] + + energy_target_names = [ + "CCSD(T)*:CBS Total Energy", + "NPNO-CCSD(T):cc-pVDZ Correlation Energy", + "NPNO-CCSD(T):cc-pVTZ Correlation Energy", + "TPNO-CCSD(T):cc-pVDZ Correlation Energy", + ] + force_target_names = [] + __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"} + def __smiles_converter__(self, x): """util function to convert string to smiles: useful if the smiles is encoded in a different format than its display format @@ -202,6 +202,21 @@ def __smiles_converter__(self, x): class ANI1CCX_V2(ANI1CCX): + """ + ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels + for each conformation. + + Usage: + ```python + from openqdc.datasets import ANI1CCX_V2 + dataset = ANI1CCX_V2() + ``` + + References: + https://doi.org/10.1038/s41467-019-10827-4\n + https://github.com/aiqm/ANI1x_datasets + """ + __name__ = "ani1ccx_v2" __energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB] @@ -211,19 +226,20 @@ class ANI1CCX_V2(ANI1CCX): class ANI2X(ANI1): """ - The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, - and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k - chemical isomers, optimized using the LBFGS algorithm and labeled with ωB97X/6-31G*. + The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8. + It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized + using the LBFGS algorithm and labeled with ωB97X/6-31G*. The same sampling techniques as done in ANI-1X are + used for generating geometries. - Usage + Usage: ```python - from openqdc.datasets import ANI@X + from openqdc.datasets import ANI2X dataset = ANI2X() ``` References: - - ANI-2x: https://doi.org/10.1021/acs.jctc.0c00121 - - Github: https://github.com/aiqm/ANI1x_datasets + https://doi.org/10.1021/acs.jctc.0c00121 + https://github.com/aiqm/ANI1x_datasets """ __name__ = "ani2x" @@ -258,9 +274,6 @@ class ANI2X(ANI1): } def __smiles_converter__(self, x): - """util function to convert string to smiles: useful if the smiles is - encoded in a different format than its display format - """ return x def read_raw_entries(self): diff --git a/openqdc/datasets/potential/comp6.py b/openqdc/datasets/potential/comp6.py index d5998e0a..fe24825c 100644 --- a/openqdc/datasets/potential/comp6.py +++ b/openqdc/datasets/potential/comp6.py @@ -7,19 +7,43 @@ class COMP6(BaseDataset): """ - COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space - developed for testing the ANI-1x potential. It is curated from 6 benchmark sets: - S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides. + COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the + ANI-1x potential. 
It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and
+    Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using
+    the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfeld charges and
+    molecular dipoles.
 
-    Usage
+    Details of the benchmark sets are as follows:
+        S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and
+        mixed influence interactions.\n
+        ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular
+        dynamics with a 0.25fs time step at 300K using the Langevin thermostat for 14 well-known drug molecules and
+        2 small proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT
+        single point calculations are performed to calculate energies and forces.\n
+        GDB7to9: Consists of 1500 molecules, 500 each with 7, 8 and 9 heavy atoms, subsampled from the GDB-11
+        dataset. The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight
+        convergence criteria. Normal modes/force constants are computed using the reference DFT model. Finally,
+        diverse normal mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\n
+        GDB10to13: Consists of 3000 molecules, 500 each with 10 and 11 heavy atoms subsampled from GDB-11
+        and 1000 each with 12 and 13 heavy atoms subsampled from GDB-13. Non-equilibrium conformations are
+        generated via DNMS.\n
+        Tripeptides: Consists of 248 random tripeptides. Structures are optimized similar to GDB7to9.\n
+        DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.
+        Structures are optimized similar to GDB7to9.
+
+    Usage:
     ```python
     from openqdc.datasets import COMP6
     dataset = COMP6()
     ```
 
     References:
-    - https://aip.scitation.org/doi/abs/10.1063/1.5023802
-    - Github: https://github.com/isayev/COMP6
+        https://aip.scitation.org/doi/abs/10.1063/1.5023802\n
+        https://github.com/isayev/COMP6\n
+        S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\n
+        GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\n
+        GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\n
+        DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h
     """
 
     __name__ = "comp6"
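Since COMP6 is aimed at benchmarking, a typical entry point is the converted statistics and the per-entry energy labels; a hedged sketch (the exact statistics keys depend on the configured estimators):

```python
from openqdc.datasets import COMP6

ds = COMP6(energy_unit="ev", distance_unit="ang")
stats = ds.get_statistics()  # converted dataset statistics
print(list(stats.keys()))
print(ds[0]["energies"])     # per-method energy labels of the first entry
```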
diff --git a/openqdc/datasets/potential/gdml.py b/openqdc/datasets/potential/gdml.py
index 24f283e6..24c74754 100644
--- a/openqdc/datasets/potential/gdml.py
+++ b/openqdc/datasets/potential/gdml.py
@@ -8,25 +8,32 @@
 class GDML(BaseDataset):
     """
     Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio
-    molecular dynamics (AIMD) trajectories. The dataset consists of,
-    - Benzene: 627000 samples
-    - Uracil: 133000 samples
-    - Naptalene: 326000 samples
-    - Aspirin: 211000 samples
-    - Salicylic Acid: 320000 samples
-    - Malonaldehyde: 993000 samples
-    - Ethanol: 555000 samples
-    - Toluene: 100000 samples
+    molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. Energy and force labels for
+    each conformation are computed using the PBE + vdW-TS electronic structure method.
 
-    Usage
+    The dataset consists of the following trajectories:
+        Benzene: 627000 samples\n
+        Uracil: 133000 samples\n
+        Naphthalene: 326000 samples\n
+        Aspirin: 211000 samples\n
+        Salicylic Acid: 320000 samples\n
+        Malonaldehyde: 993000 samples\n
+        Ethanol: 555000 samples\n
+        Toluene: 100000 samples\n
+
+    Usage:
     ```python
     from openqdc.datasets import GDML
     dataset = GDML()
     ```
 
     References:
-    - https://www.science.org/doi/10.1126/sciadv.1603015
-    - http://www.sgdml.org/#datasets
+        https://www.science.org/doi/10.1126/sciadv.1603015\n
+        http://www.sgdml.org/#datasets
     """
 
     __name__ = "gdml"
diff --git a/openqdc/datasets/potential/geom.py b/openqdc/datasets/potential/geom.py
index d07a3d93..7c86b1e3 100644
--- a/openqdc/datasets/potential/geom.py
+++ b/openqdc/datasets/potential/geom.py
@@ -61,9 +61,11 @@ def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str,
 
 class GEOM(BaseDataset):
     """
-    The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
-    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology,
-    and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method.
+    Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
+    from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.
+    For each molecule, the initial structure is generated with RDKit, optimized with the GFN2-xTB energy method, and
+    the lowest energy conformer is fed to the CREST software. CREST uses metadynamics to explore the
+    conformational space of each molecule. Energies in the dataset are computed using the semi-empirical GFN2-xTB method.
 
     Usage:
     ```python
@@ -72,8 +74,9 @@ class GEOM(BaseDataset):
     ```
 
     References:
-    - https://www.nature.com/articles/s41597-022-01288-4
-    - https://github.com/learningmatter-mit/geom
+        https://www.nature.com/articles/s41597-022-01288-4\n
+        https://github.com/learningmatter-mit/geom\n
+        CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d
     """
 
     __name__ = "geom"
diff --git a/openqdc/datasets/potential/iso_17.py b/openqdc/datasets/potential/iso_17.py
index fe6aab5c..5672650b 100644
--- a/openqdc/datasets/potential/iso_17.py
+++ b/openqdc/datasets/potential/iso_17.py
@@ -7,11 +7,12 @@
 
 class ISO17(BaseDataset):
     """
-    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed
-    composition of atoms (C7O2H10) arranged in different chemically valid structures. It consist
-    of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution
-    of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the
-    Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.
+    ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed composition of
+    atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing
+    5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics
+    trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient
+    approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der
+    Waals correction method.
Usage: ```python @@ -20,7 +21,11 @@ class ISO17(BaseDataset): ``` References: - - https://paperswithcode.com/dataset/iso17 + https://arxiv.org/abs/1706.08566\n + https://arxiv.org/abs/1609.08259\n + https://www.nature.com/articles/sdata201422\n + https://pubmed.ncbi.nlm.nih.gov/10062328/\n + https://pubmed.ncbi.nlm.nih.gov/19257665/ """ __name__ = "iso_17" diff --git a/openqdc/datasets/potential/md22.py b/openqdc/datasets/potential/md22.py index b9976426..0eb4a72c 100644 --- a/openqdc/datasets/potential/md22.py +++ b/openqdc/datasets/potential/md22.py @@ -40,6 +40,22 @@ def create_path(filename, root): class MD22(RevMD17): + """ + MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules, + ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories + are sampled at 400K and 500K with a resolution of 1fs. Potential energy and forces are computed using the PBE+MBD + level of theory. + + Usage: + ```python + from openqdc.datasets import MD22 + dataset = MD22() + ``` + + Reference: + https://arxiv.org/abs/2209.14865 + """ + __name__ = "md22" __links__ = { f"{x}.npz": f"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz" diff --git a/openqdc/datasets/potential/molecule3d.py b/openqdc/datasets/potential/molecule3d.py index ec1dbd00..fa4f4683 100644 --- a/openqdc/datasets/potential/molecule3d.py +++ b/openqdc/datasets/potential/molecule3d.py @@ -67,9 +67,10 @@ def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray] class Molecule3D(BaseDataset): """ - Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies - calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the - PubChem database and cleaned by removing invalid molecule files. + Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the + B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing + molecules with invalid molecule files, with SMILES conversion error, RDKIT warnings, sanitization problems, + or with damaged log files. Usage: ```python @@ -78,8 +79,8 @@ class Molecule3D(BaseDataset): ``` References: - - https://arxiv.org/abs/2110.01717 - - https://github.com/divelab/MoleculeX + https://arxiv.org/abs/2110.01717\n + https://github.com/divelab/MoleculeX """ __name__ = "molecule3d" diff --git a/openqdc/datasets/potential/multixcqm9.py b/openqdc/datasets/potential/multixcqm9.py index 70dab1ea..41d7a4dc 100644 --- a/openqdc/datasets/potential/multixcqm9.py +++ b/openqdc/datasets/potential/multixcqm9.py @@ -37,20 +37,21 @@ def read_xyz_files(folder_path): class MultixcQM9(BaseDataset): """ - MultixcQM9 is a dataset of molecular and reaction energies from - multi-level quantum chemical methods consisting of 133 K QM9 molecules - calculated with 76 different DFT functionals and three different basis sets - (228 energy numbers for each molecule) + 1 GFN2-XTB calculation. + MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods consisting + of 133K QM9 molecules geometries calculated with 76 different DFT functionals and three different basis sets + resulting in 228 energy values for each molecule along with semi-empirical method GFN2-xTB. Geometries for the + molecules are used directly from Kim et al. which uses G4MP2 method. 
     Usage:
     ```python
-    from openqdc.datasets import NablaDFT
-    dataset = NablaDFT()
+    from openqdc.datasets import MultixcQM9
+    dataset = MultixcQM9()
     ```

     References:
-    - https://www.nature.com/articles/s41597-023-02690-2
-    - https://github.com/chemsurajit/largeDFTdata
+        https://www.nature.com/articles/s41597-023-02690-2\n
+        https://github.com/chemsurajit/largeDFTdata\n
+        https://www.nature.com/articles/s41597-019-0121-7
     """

     __name__ = "multixcqm9"
diff --git a/openqdc/datasets/potential/nabladft.py b/openqdc/datasets/potential/nabladft.py
index 4700ade5..f83f1c00 100644
--- a/openqdc/datasets/potential/nabladft.py
+++ b/openqdc/datasets/potential/nabladft.py
@@ -52,7 +52,11 @@ class NablaDFT(BaseDataset):
     """
     NablaDFT is a dataset constructed from a subset of the
     [Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules
-    with 5,340,152 unique conformations generated using ωB97X-D/def2-SVP level of theory.
+    with 5,340,152 unique conformations. Conformations for each molecule are generated in two steps. First, a set of
+    conformations is generated with RDKit. Second, Butina clustering is applied to these conformations, clusters
+    covering 95% of the conformations are selected, and their centroids form the final set. This results in 1-62
+    conformations per molecule. Quantum properties are then computed with the Kohn-Sham method at the
+    ωB97X-D/def2-SVP level of theory.

     Usage:
     ```python
@@ -61,8 +65,8 @@ class NablaDFT(BaseDataset):
     ```

     References:
-    - https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D
-    - https://github.com/AIRI-Institute/nablaDFT
+        https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\n
+        https://github.com/AIRI-Institute/nablaDFT
     """

     __name__ = "nabladft"
@@ -76,6 +80,15 @@ class NablaDFT(BaseDataset):
     __forces_unit__ = "hartree/bohr"
     __links__ = {"nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"}

+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
     @requires_package("nablaDFT")
     def read_raw_entries(self):
         from nablaDFT.dataset import HamiltonianDatabase
diff --git a/openqdc/datasets/potential/orbnet_denali.py b/openqdc/datasets/potential/orbnet_denali.py
index 6a7c3f47..1dd70468 100644
--- a/openqdc/datasets/potential/orbnet_denali.py
+++ b/openqdc/datasets/potential/orbnet_denali.py
@@ -36,10 +36,14 @@ def read_archive(mol_id, conf_dict, base_path, energy_target_names: List[str]) -

 class OrbnetDenali(BaseDataset):
     """
-    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. It performs
-    DFT (ωB97X-D3/def2-TZVP) calculations on molecules and geometries consisting of organic molecules
-    and chemistries, with protonation and tautomeric states, non-covalent interactions, common salts,
-    and counterions, spanning the most common elements in bio and organic chemistry.
+    Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range
+    of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and
+    counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in two
+    steps. First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE
+    conformer generator.
+    Second, using the four energy-minimized conformers, non-equilibrium geometries are generated using
+    normal mode sampling at 300K or ab initio molecular dynamics (AIMD) for 200 fs at 500K, both at the GFN1-xTB
+    level of theory. Energies are calculated with the DFT method ωB97X-D3/def2-TZVP and the semi-empirical method
+    GFN1-xTB.

     Usage:
     ```python
@@ -48,8 +52,8 @@ class OrbnetDenali(BaseDataset):
     ```

     References:
-    - https://arxiv.org/pdf/2107.00299.pdf
-    - https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
+        https://arxiv.org/abs/2107.00299\n
+        https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
     """

     __name__ = "orbnet_denali"
@@ -74,13 +78,6 @@ def read_raw_entries(self):
             for mol_id, group in df.groupby("mol_id")
         }

-        # print(df.head())
-        # tmp = df.to_dict('index')
-        # for i, k in enumerate(tmp):
-        #     print(k, tmp[k])
-        #     if i > 10:
-        #         break
-        # exit()
         fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)
         res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
         samples = sum(res, [])
diff --git a/openqdc/datasets/potential/pcqm.py b/openqdc/datasets/potential/pcqm.py
index 535b90dc..cd32b838 100644
--- a/openqdc/datasets/potential/pcqm.py
+++ b/openqdc/datasets/potential/pcqm.py
@@ -66,6 +66,23 @@ def read_preprocessed_archive(path):

 class PCQM_PM6(BaseDataset):
+    """
+    PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized
+    molecular geometries and electronic properties. To generate the dataset, only molecules with a molecular weight
+    below 1000 g/mol are considered from the PubChem FTP site. The initial structure is generated using OpenBabel
+    and then optimized with the semi-empirical method PM6. The energies are also computed using the PM6 method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import PCQM_PM6
+    dataset = PCQM_PM6()
+    ```
+
+    References:
+        https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740
+    """
+
     __name__ = "pubchemqc_pm6"

     __energy_methods__ = [PotentialMethod.PM6]
@@ -93,6 +110,15 @@ def collate_list(self, list_entries):
             res = None
         return res

+    @property
+    def data_types(self):
+        return {
+            "atomic_inputs": np.float32,
+            "position_idx_range": np.int32,
+            "energies": np.float32,
+            "forces": np.float32,
+        }
+
     def read_raw_entries(self):
         arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
         f = lambda x: self.collate_list(read_preprocessed_archive(x))
@@ -150,6 +176,21 @@ def collate_and_save_list(self, list_entries):

 class PCQM_B3LYP(PCQM_PM6):
+    """
+    PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to
+    biomolecules. The geometries for the molecules are optimized using PM6. Using the optimized geometry,
+    the electronic structure and properties are calculated using the B3LYP/6-31G* method.
+
+    Usage:
+    ```python
+    from openqdc.datasets import PCQM_B3LYP
+    dataset = PCQM_B3LYP()
+    ```
+
+    References:
+        https://arxiv.org/abs/2305.18454
+    """
+
     __name__ = "pubchemqc_b3lyp"
     __energy_methods__ = ["b3lyp/6-31g*"]
     energy_target_names = ["b3lyp"]
diff --git a/openqdc/datasets/potential/proteinfragments.py b/openqdc/datasets/potential/proteinfragments.py
new file mode 100644
index 00000000..d6289750
--- /dev/null
+++ b/openqdc/datasets/potential/proteinfragments.py
@@ -0,0 +1,192 @@
+import os
+from os.path import join as p_join
+
+import numpy as np
+from tqdm import tqdm
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.package_utils import requires_package
+
+
+def convert_entries(r, e, f, z, subset):
+    coordinates = r
+    species = z
+    forces = f
+    energies = e
+    n_atoms = coordinates.shape[0]
+    flattened_coordinates = coordinates[:].reshape((-1, 3))
+    xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+    res = dict(
+        name=np.array([subset]),
+        subset=np.array([subset]),
+        energies=energies[:].reshape((-1, 1)).astype(np.float64),
+        atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+        n_atoms=np.array([n_atoms], dtype=np.int32),
+        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+    )
+    return res
+
+
+@requires_package("apsw")
+def read_db(path):
+    # Convert every record of the SQLite database into an openQDC entry.
+    database = Database(path)
+    subset = os.path.basename(path).split(".")[0]
+    n = len(database)
+    entries = []
+    for entry in tqdm(range(n)):
+        q, s, z, r, e, f, d = database[entry]
+        entries.append(convert_entries(r, e, f, z, subset))
+    return entries
+
+
+class Database:
+    @requires_package("apsw")
+    def __init__(self, filename):
+        import apsw
+
+        self.cursor = apsw.Connection(filename, flags=apsw.SQLITE_OPEN_READONLY).cursor()
+
+    def __len__(self):
+        return self.cursor.execute("""SELECT * FROM metadata WHERE id=1""").fetchone()[-1]
+
+    def __getitem__(self, idx):
+        data = self.cursor.execute("""SELECT * FROM data WHERE id=""" + str(idx)).fetchone()
+        return self._unpack_data_tuple(data)
+
+    def _deblob(self, buffer, dtype, shape=None):
+        array = np.frombuffer(buffer, dtype)
+        if not np.little_endian:
+            array = array.byteswap()
+        array.shape = shape
+        return np.copy(array)
+
+    def _unpack_data_tuple(self, data):
+        n = len(data[3]) // 4  # A single int32 is 4 bytes long.
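+        # Row layout of `data`, as used below and documented by the upstream
+        # database: data[1]=total charge, data[2]=number of unpaired electrons,
+        # data[3]=atomic numbers (int32 blob), data[4]=positions [Å],
+        # data[5]=energy [eV], data[6]=forces [eV/Å], data[7]=dipole [e*Å].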
+        q = np.asarray([0.0 if data[1] is None else data[1]], dtype=np.float32)
+        s = np.asarray([0.0 if data[2] is None else data[2]], dtype=np.float32)
+        z = self._deblob(data[3], dtype=np.int32, shape=(n,))
+        r = self._deblob(data[4], dtype=np.float32, shape=(n, 3))
+        e = np.asarray([0.0 if data[5] is None else data[5]], dtype=np.float32)
+        f = self._deblob(data[6], dtype=np.float32, shape=(n, 3))
+        d = self._deblob(data[7], dtype=np.float32, shape=(1, 3))
+        return q, s, z, r, e, f, d
+
+
+class ProteinFragments(BaseDataset):
+    """
+    ProteinFragments is a dataset of protein fragment geometries;
+    the data was generated with a top-down and a bottom-up approach:
+
+    Top-down:
+        Fragments are generated by cutting out a spherical
+        region around an atom (including solvent molecules)
+        and saturating all dangling bonds.
+        Sampling was done with molecular dynamics (MD)
+        using a conventional force field at room temperature.
+
+    Bottom-up:
+        Fragments are generated by constructing chemical graphs
+        of one to eight non-hydrogen atoms.
+        Sampling of multiple conformers per fragment was done with
+        MD simulations at high temperatures or normal mode sampling.
+
+    Usage:
+    ```python
+    from openqdc.datasets import ProteinFragments
+    dataset = ProteinFragments()
+    ```
+
+    References:
+        https://www.science.org/doi/10.1126/sciadv.adn4397
+    """
+
+    __name__ = "proteinfragments"
+    # PBE0/def2-TZVPP+MBD
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+    # PBE0/def2-TZVPP+MBD
+    __energy_unit__ = "ev"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "ev/ang"
+    __links__ = {
+        f"{name}.db": f"https://zenodo.org/records/10720941/files/{name}.db?download=1"
+        for name in ["general_protein_fragments"]
+    }
+
+    @property
+    def root(self):
+        return p_join(get_local_cache(), "proteinfragments")
+
+    @property
+    def config(self):
+        assert len(self.__links__) > 0, "No links provided for fetching"
+        return dict(dataset_name="proteinfragments", links=self.__links__)
+
+    @property
+    def preprocess_path(self):
+        path = p_join(self.root, "preprocessed", self.__name__)
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    def read_raw_entries(self):
+        samples = []
+        for name in self.__links__:
+            raw_path = p_join(self.root, f"{name}")
+            samples.extend(read_db(raw_path))
+        return samples
+
+
+class MDDataset(ProteinFragments):
+    """
+    MDDataset is a subset of the ProteinFragments dataset that was
+    generated from molecular dynamics with their model.
+    The sampling was done with molecular dynamics
+    at room temperature (300K) in various solvent phases:
+
+    Subsets:
+        Polyalanine:
+            All the polyalanine peptides are sampled in the gas phase.
+            AceAla15Lys is a polyalanine peptide capped with an N-terminal acetyl group
+            and a protonated lysine residue at the C-terminus;
+            AceAla15Nme is a polyalanine peptide capped with an N-terminal acetyl group
+            and a C-terminal N-methyl amide group\n
+        Crambin: the 46-residue protein crambin in aqueous solution (25,257 atoms)
+
+    Usage:
+    ```python
+    from openqdc.datasets import MDDataset
+    dataset = MDDataset()
+    ```
+
+    References:
+        https://www.science.org/doi/10.1126/sciadv.adn4397
+    """
+
+    __name__ = "mddataset"
+
+    __links__ = {
+        f"{name}.db": f"https://zenodo.org/records/10720941/files/{name}.db?download=1"
+        for name in ["acala15nme_folding_clusters", "crambin", "minimahopping_acala15lysh", "minimahopping_acala15nme"]
+    }
diff --git a/openqdc/datasets/potential/qm1b.py b/openqdc/datasets/potential/qm1b.py
index 5e10ed23..edccae0d 100644
--- a/openqdc/datasets/potential/qm1b.py
+++ b/openqdc/datasets/potential/qm1b.py
@@ -78,11 +78,11 @@ def extract_from_row(row, file_idx=None):

 class QM1B(BaseDataset):
     """
-    QM1B is a low-resolution DFT dataset generated using PySCF IPU.
-    It is composed of one billion training examples containing 9-11 heavy atoms.
-    It was created by taking 1.09M SMILES strings from the GDB-11 database and
-    computing molecular properties (e.g. HOMO-LUMO gap) for a set of up to 1000
-    conformers per molecule at the B3LYP/STO-3G level of theory.
+    QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom
+    PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are
+    subsampled from the GDB-11 database. For each molecule, up to 1000 geometries are generated using RDKit.
+    Electronic properties for each conformation are then calculated using the density functional B3LYP
+    and the basis set STO-3G.

     Usage:
     ```python
@@ -91,8 +91,8 @@ class QM1B(BaseDataset):
     ```

     References:
-    - https://arxiv.org/pdf/2311.01135
-    - https://github.com/graphcore-research/qm1b-dataset/
+        https://arxiv.org/pdf/2311.01135\n
+        https://github.com/graphcore-research/qm1b-dataset/
     """

     __name__ = "qm1b"
@@ -144,8 +144,7 @@ def extract_parallel(df, i):

 class QM1B_SMALL(QM1B):
     """
-    QM1B_SMALL is a subset of the QM1B dataset containing a
-    maximum of 15 random conformers per molecule.
+    QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.

     Usage:
     ```python
diff --git a/openqdc/datasets/potential/qm7x.py b/openqdc/datasets/potential/qm7x.py
index 7bf1323c..351162c0 100644
--- a/openqdc/datasets/potential/qm7x.py
+++ b/openqdc/datasets/potential/qm7x.py
@@ -35,12 +35,15 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names):

 class QM7X(BaseDataset):
     """
-    QM7X is a collection of almost 4.2 million conformers from 6,950 unique molecules. It contains DFT
-    energy and force labels at the PBE0+MBD level of theory. It consists of structures for molecules with
-    up to seven heavy (C, N, O, S, Cl) atoms from the GDB13 database. For each molecule, (meta-)stable
-    equilibrium structures including constitutional/structural isomers and stereoisomers are
-    searched using density-functional tight binding (DFTB). Then, for each (meta-)stable structure, 100
-    off-equilibrium structures are obtained and labeled with PBE0+MBD.
+    QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. The molecules with
+    up to seven heavy (C, N, O, S, Cl) atoms are considered from the GDB13 database.
+    For generating conformations, OpenBabel is utilized to get an initial structure using the MMFF94 force field.
+    Using the initial structure, meta-stable conformational isomers are generated using the Confab tool along with
+    the MMFF94 force field. The structure is then re-optimized with density-functional tight binding (DFTB)
+    supplemented with many-body dispersion (MBD) interactions. The lowest-energy structure is then taken as the
+    final equilibrium conformer. Additionally, non-equilibrium conformations are generated by displacing the
+    equilibrium geometry along a linear combination of normal mode coordinates computed at the DFTB3+MBD level
+    within the harmonic approximation. The dataset provides energy values for each geometry computed with both the
+    PBE0+MBD and DFTB3+MBD methods.

     Usage:
     ```python
@@ -49,8 +52,8 @@ class QM7X(BaseDataset):
     ```

     References:
-    - https://arxiv.org/abs/2006.15139
-    - https://zenodo.org/records/4288677
+        https://arxiv.org/abs/2006.15139\n
+        https://zenodo.org/records/4288677
     """

     __name__ = "qm7x"
@@ -59,9 +62,9 @@ class QM7X(BaseDataset):

     energy_target_names = ["ePBE0+MBD", "eDFTB+MBD"]

-    __force_mask__ = [True, True]
+    __force_mask__ = [True, False]

-    force_target_names = ["pbe0FOR", "vdwFOR"]
+    force_target_names = ["pbe0FOR"]

     __energy_unit__ = "ev"
     __distance_unit__ = "ang"
@@ -81,6 +84,16 @@ def read_raw_entries(self):

 class QM7X_V2(QM7X):
+    """
+    QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM7X_V2
+    dataset = QM7X_V2()
+    ```
+    """
+
     __name__ = "qm7x_v2"
     __energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]
     __force_mask__ = QM7X.__force_mask__ + [False]
diff --git a/openqdc/datasets/potential/qmugs.py b/openqdc/datasets/potential/qmugs.py
index 6cc38900..b819b214 100644
--- a/openqdc/datasets/potential/qmugs.py
+++ b/openqdc/datasets/potential/qmugs.py
@@ -38,8 +38,9 @@ def read_mol(mol_dir):

 class QMugs(BaseDataset):
     """
     The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules
-    extracted from the ChEMBL database. The atomic and molecular properties are calculated using both,
-    semi-empirical methods (GFN2-xTB) and DFT method (ωB97X-D/def2-SVP).
+    extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB
+    method. Using the optimized geometries, the atomic and molecular properties are calculated with both the
+    semi-empirical method GFN2-xTB and the DFT method ωB97X-D/def2-SVP.

     Usage:
     ```python
@@ -48,8 +49,9 @@ class QMugs(BaseDataset):
     ```

     References:
-    - https://www.nature.com/articles/s41597-022-01390-7#ethics
-    - https://www.research-collection.ethz.ch/handle/20.500.11850/482129
+        https://arxiv.org/abs/2107.00367\n
+        https://www.nature.com/articles/s41597-022-01390-7#ethics\n
+        https://www.research-collection.ethz.ch/handle/20.500.11850/482129
     """

     __name__ = "qmugs"
@@ -76,6 +78,16 @@ def read_raw_entries(self):

 class QMugs_V2(QMugs):
+    """
+    QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QMugs_V2
+    dataset = QMugs_V2()
+    ```
+    """
+
     __name__ = "qmugs_v2"
     __energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]
     energy_target_names = QMugs.energy_target_names + ["PM6"]
diff --git a/openqdc/datasets/potential/qmx.py b/openqdc/datasets/potential/qmx.py
new file mode 100644
index 00000000..2dfb8443
--- /dev/null
+++ b/openqdc/datasets/potential/qmx.py
@@ -0,0 +1,402 @@
+import os
+from abc import ABC
+from os.path import join as p_join
+
+import datamol as dm
+import numpy as np
+import pandas as pd
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils import read_qc_archive_h5
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+
+def extract_ani2_entries(properties):
+    coordinates = properties["coordinates"]
+    species = properties["species"]
+    forces = properties["forces"]
+    energies = properties["energies"]
+    n_atoms = coordinates.shape[1]
+    n_entries = coordinates.shape[0]
+    flattened_coordinates = coordinates[:].reshape((-1, 3))
+    xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+    res = dict(
+        name=np.array(["ANI2"] * n_entries),
+        subset=np.array([str(n_atoms)] * n_entries),
+        energies=energies[:].reshape((-1, 1)).astype(np.float64),
+        atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+        n_atoms=np.array([n_atoms] * n_entries, dtype=np.int32),
+        forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+    )
+    return res
+
+
+class QMX(ABC, BaseDataset):
+    """
+    QMX dataset base abstract class
+    """
+
+    __name__ = "qm9"
+
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "bohr"
+    __forces_unit__ = "hartree/bohr"
+    __links__ = {}
+
+    @property
+    def root(self):
+        return p_join(get_local_cache(), "qmx")
+
+    @property
+    def preprocess_path(self):
+        path = p_join(self.root, "preprocessed", self.__name__)
+        os.makedirs(path, exist_ok=True)
+        return path
+
+    @property
+    def config(self):
+        assert len(self.__links__) > 0, "No links provided for fetching"
+        return dict(dataset_name="qmx", links=self.__links__)
+
+    def read_raw_entries(self):
+        raw_path = p_join(self.root, f"{self.__name__}.h5.gz")
+        samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)
+        return samples
+
+
+# Columns of the QM8 csv file:
+# ['smiles', 'E1-CC2', 'E2-CC2', 'f1-CC2', 'f2-CC2', 'E1-PBE0', 'E2-PBE0', 'f1-PBE0', 'f2-PBE0',
+# 'E1-PBE0.1', 'E2-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1', 'E1-CAM', 'E2-CAM', 'f1-CAM', 'f2-CAM']
+class QM7(QMX):
+    """
+    QM7 is a dataset constructed from subsets of the GDB-13 database
+    (stable and synthetically accessible organic molecules)
+    containing up to seven “heavy” atoms.
+    The molecular conformations are optimized using DFT at the
+    PBE0/def2-TZVP level of theory.
+
+    Chemical species:
+        [C, N, O, S, H]
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM7
+    dataset = QM7()
+    ```
+
+    References:
+        https://arxiv.org/pdf/1703.00564
+    """
+
+    __links__ = {"qm7.hdf5.gz": "https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1"}
+    __name__ = "qm7"
+
+    energy_target_names = [
+        "B2PLYP-D3(BJ):aug-cc-pvdz",
+        "B2PLYP-D3(BJ):aug-cc-pvtz",
+        "B2PLYP-D3(BJ):def2-svp",
+        "B2PLYP-D3(BJ):def2-tzvp",
+        "B2PLYP-D3(BJ):sto-3g",
+        "B2PLYP-D3:aug-cc-pvdz",
+        "B2PLYP-D3:aug-cc-pvtz",
+        "B2PLYP-D3:def2-svp",
+        "B2PLYP-D3:def2-tzvp",
+        "B2PLYP-D3:sto-3g",
+        "B2PLYP-D3M(BJ):aug-cc-pvdz",
+        "B2PLYP-D3M(BJ):aug-cc-pvtz",
+        "B2PLYP-D3M(BJ):def2-svp",
+        "B2PLYP-D3M(BJ):def2-tzvp",
+        "B2PLYP-D3M(BJ):sto-3g",
+        "B2PLYP-D3M:aug-cc-pvdz",
+        "B2PLYP-D3M:aug-cc-pvtz",
+        "B2PLYP-D3M:def2-svp",
+        "B2PLYP-D3M:def2-tzvp",
+        "B2PLYP-D3M:sto-3g",
+        "B2PLYP:aug-cc-pvdz",
+        "B2PLYP:aug-cc-pvtz",
+        "B2PLYP:def2-svp",
+        "B2PLYP:def2-tzvp",
+        "B2PLYP:sto-3g",
+        "B3LYP-D3(BJ):aug-cc-pvdz",
+        "B3LYP-D3(BJ):aug-cc-pvtz",
+        "B3LYP-D3(BJ):def2-svp",
+        "B3LYP-D3(BJ):def2-tzvp",
+        "B3LYP-D3(BJ):sto-3g",
+        "B3LYP-D3:aug-cc-pvdz",
+        "B3LYP-D3:aug-cc-pvtz",
+        "B3LYP-D3:def2-svp",
+        "B3LYP-D3:def2-tzvp",
+        "B3LYP-D3:sto-3g",
+        "B3LYP-D3M(BJ):aug-cc-pvdz",
+        "B3LYP-D3M(BJ):aug-cc-pvtz",
+        "B3LYP-D3M(BJ):def2-svp",
+        "B3LYP-D3M(BJ):def2-tzvp",
+        "B3LYP-D3M(BJ):sto-3g",
+        "B3LYP-D3M:aug-cc-pvdz",
+        "B3LYP-D3M:aug-cc-pvtz",
+        "B3LYP-D3M:def2-svp",
+        "B3LYP-D3M:def2-tzvp",
+        "B3LYP-D3M:sto-3g",
+        "B3LYP:aug-cc-pvdz",
+        "B3LYP:aug-cc-pvtz",
+        "B3LYP:def2-svp",
+        "B3LYP:def2-tzvp",
+        "B3LYP:sto-3g",
+        "HF:aug-cc-pvdz",
+        "HF:aug-cc-pvtz",
+        "HF:def2-svp",
+        "HF:def2-tzvp",
+        "HF:sto-3g",
+        "MP2:aug-cc-pvdz",
+        "MP2:aug-cc-pvtz",
+        "MP2:def2-svp",
+        "MP2:def2-tzvp",
+        "MP2:sto-3g",
+        "PBE0:aug-cc-pvdz",
+        "PBE0:aug-cc-pvtz",
+        "PBE0:def2-svp",
+        "PBE0:def2-tzvp",
+        "PBE0:sto-3g",
+        "PBE:aug-cc-pvdz",
+        "PBE:aug-cc-pvtz",
+        "PBE:def2-svp",
+        "PBE:def2-tzvp",
+        "PBE:sto-3g",
+        "WB97M-V:aug-cc-pvdz",
+        "WB97M-V:aug-cc-pvtz",
+        "WB97M-V:def2-svp",
+        "WB97M-V:def2-tzvp",
+        "WB97M-V:sto-3g",
+        "WB97X-D:aug-cc-pvdz",
+        "WB97X-D:aug-cc-pvtz",
+        "WB97X-D:def2-svp",
+        "WB97X-D:def2-tzvp",
+        "WB97X-D:sto-3g",
+    ]
+
+    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # "wb97x/6-31g(d)"
+
+
+class QM7b(QMX):
+    """
+    QM7b is a dataset constructed from subsets of the GDB-13 database
+    (stable and synthetically accessible organic molecules)
+    containing up to seven “heavy” atoms.
+    The molecular conformations are optimized using DFT at the
+    PBE0/def2-TZVP level of theory.
+
+    Chemical species:
+        [C, N, O, S, Cl, H]
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM7b
+    dataset = QM7b()
+    ```
+
+    References:
+        https://arxiv.org/pdf/1703.00564
+    """
+
+    __links__ = {"qm7b.hdf5.gz": "https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1"}
+    __name__ = "qm7b"
+    energy_target_names = [
+        "CCSD(T0):cc-pVDZ",
+        "HF:cc-pVDZ",
+        "HF:cc-pVTZ",
+        "MP2:cc-pVTZ",
+        "B2PLYP-D3:aug-cc-pvdz",
+        "B2PLYP-D3:aug-cc-pvtz",
+        "B2PLYP-D3:def2-svp",
+        "B2PLYP-D3:def2-tzvp",
+        "B2PLYP-D3:sto-3g",
+        "B2PLYP-D3M(BJ):aug-cc-pvdz",
+        "B2PLYP-D3M(BJ):aug-cc-pvtz",
+        "B2PLYP-D3M(BJ):def2-svp",
+        "B2PLYP-D3M(BJ):def2-tzvp",
+        "B2PLYP-D3M(BJ):sto-3g",
+        "B2PLYP-D3M:aug-cc-pvdz",
+        "B2PLYP-D3M:aug-cc-pvtz",
+        "B2PLYP-D3M:def2-svp",
+        "B2PLYP-D3M:def2-tzvp",
+        "B2PLYP-D3M:sto-3g",
+        "B2PLYP:aug-cc-pvdz",
+        "B2PLYP:aug-cc-pvtz",
+        "B2PLYP:def2-svp",
+        "B2PLYP:def2-tzvp",
+        "B2PLYP:sto-3g",
+        "B3LYP-D3(BJ):aug-cc-pvdz",
+        "B3LYP-D3(BJ):aug-cc-pvtz",
+        "B3LYP-D3(BJ):def2-svp",
+        "B3LYP-D3(BJ):def2-tzvp",
+        "B3LYP-D3(BJ):sto-3g",
+        "B3LYP-D3:aug-cc-pvdz",
+        "B3LYP-D3:aug-cc-pvtz",
+        "B3LYP-D3:def2-svp",
+        "B3LYP-D3:def2-tzvp",
+        "B3LYP-D3:sto-3g",
+        "B3LYP-D3M(BJ):aug-cc-pvdz",
+        "B3LYP-D3M(BJ):aug-cc-pvtz",
+        "B3LYP-D3M(BJ):def2-svp",
+        "B3LYP-D3M(BJ):def2-tzvp",
+        "B3LYP-D3M(BJ):sto-3g",
+        "B3LYP-D3M:aug-cc-pvdz",
+        "B3LYP-D3M:aug-cc-pvtz",
+        "B3LYP-D3M:def2-svp",
+        "B3LYP-D3M:def2-tzvp",
+        "B3LYP-D3M:sto-3g",
+        "B3LYP:aug-cc-pvdz",
+        "B3LYP:aug-cc-pvtz",
+        "B3LYP:def2-svp",
+        "B3LYP:def2-tzvp",
+        "B3LYP:sto-3g",
+        "HF:aug-cc-pvdz",
+        "HF:aug-cc-pvtz",
+        "HF:cc-pvtz",
+        "HF:def2-svp",
+        "HF:def2-tzvp",
+        "HF:sto-3g",
+        "PBE0:aug-cc-pvdz",
+        "PBE0:aug-cc-pvtz",
+        "PBE0:def2-svp",
+        "PBE0:def2-tzvp",
+        "PBE0:sto-3g",
+        "PBE:aug-cc-pvdz",
+        "PBE:aug-cc-pvtz",
+        "PBE:def2-svp",
+        "PBE:def2-tzvp",
+        "PBE:sto-3g",
+        "SVWN:sto-3g",
+        "WB97M-V:aug-cc-pvdz",
+        "WB97M-V:aug-cc-pvtz",
+        "WB97M-V:def2-svp",
+        "WB97M-V:def2-tzvp",
+        "WB97M-V:sto-3g",
+        "WB97X-D:aug-cc-pvdz",
+        "WB97X-D:aug-cc-pvtz",
+        "WB97X-D:def2-svp",
+        "WB97X-D:def2-tzvp",
+        "WB97X-D:sto-3g",
+    ]
+    __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]  # "wb97x/6-31g(d)"
+
+
+class QM8(QMX):
+    """QM8 is the subset of QM9 used in a study on modeling quantum
+    mechanical calculations of electronic spectra and excited
+    state energies (an increase in energy from the ground state) of small molecules
+    with up to eight heavy atoms.
+    Multiple methods were used, including
+    time-dependent density functional theories (TDDFT) and
+    the second-order approximate coupled-cluster method (CC2).
+    The molecular conformations are relaxed geometries computed using
+    DFT B3LYP with the 6-31G(2df,p) basis set.
+    For more information about the sampling, see the QM9 dataset.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM8
+    dataset = QM8()
+    ```
+
+    References:
+        https://arxiv.org/pdf/1504.01966
+    """
+
+    __name__ = "qm8"
+
+    __energy_methods__ = [
+        PotentialMethod.NONE,  # "wb97x/6-31g(d)"
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+    ]
+
+    __links__ = {
+        "qm8.csv": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv",
+        "qm8.tar.gz": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz",
+    }
+
+    def read_raw_entries(self):
+        df = pd.read_csv(p_join(self.root, "qm8.csv"))
+        mols = dm.read_sdf(p_join(self.root, "qm8.sdf"), sanitize=False, remove_hs=False)
+        samples = []
+        for idx_row, mol in zip(df.iterrows(), mols):
+            _, row = idx_row
+            positions = mol.GetConformer().GetPositions()
+            x = get_atomic_number_and_charge(mol)
+            n_atoms = positions.shape[0]
+            samples.append(
+                dict(
+                    atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+                    name=np.array([row["smiles"]]),
+                    energies=np.array(
+                        [
+                            row[
+                                ["E1-CC2", "E2-CC2", "E1-PBE0", "E2-PBE0", "E1-PBE0.1", "E2-PBE0.1", "E1-CAM", "E2-CAM"]
+                            ].tolist()
+                        ],
+                        dtype=np.float64,
+                    ).reshape(1, -1),
+                    n_atoms=np.array([n_atoms], dtype=np.int32),
+                    subset=np.array([f"{self.__name__}"]),
+                )
+            )
+        return samples
+
+
+class QM9(QMX):
+    """
+    QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database,
+    with up to nine “heavy” atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p)
+    level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed
+    by relaxing the geometries with the quantum mechanical method B3LYP.
+
+    Usage:
+    ```python
+    from openqdc.datasets import QM9
+    dataset = QM9()
+    ```
+
+    Reference:
+        https://www.nature.com/articles/sdata201422
+    """
+
+    __links__ = {"qm9.hdf5.gz": "https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1"}
+    __name__ = "qm9"
+    energy_target_names = [
+        "Internal energy at 0 K",
+        "B3LYP:def2-svp",
+        "HF:cc-pvtz",
+        "HF:sto-3g",
+        "PBE:sto-3g",
+        "SVWN:sto-3g",
+        "WB97X-D:aug-cc-pvtz",
+        "WB97X-D:def2-svp",
+        "WB97X-D:def2-tzvp",
+    ]
+
+    __energy_methods__ = [
+        PotentialMethod.NONE,  # "wb97x/6-31g(d)"
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+        PotentialMethod.NONE,
+    ]
diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
index 613ce91f..425f9784 100644
--- a/openqdc/datasets/potential/revmd17.py
+++ b/openqdc/datasets/potential/revmd17.py
@@ -54,23 +54,27 @@ def create_path(filename, root):

 class RevMD17(BaseDataset):
     """
-    - Benzene: 627000 samples
-    - Uracil: 133000 samples
-    - Naptalene: 326000 samples
-    - Aspirin: 211000 samples
-    - Salicylic Acid: 320000 samples
-    - Malonaldehyde: 993000 samples
-    - Ethanol: 555000 samples
-    - Toluene: 100000 samples
-
-    Usage
+    Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original
+    dataset. The data is generated from an ab-initio molecular dynamics (AIMD) simulation where forces and energies
+    are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT
+    integration grid.
+    The dataset contains the following molecules:
+        Benzene: 627000 samples\n
+        Uracil: 133000 samples\n
+        Naphthalene: 326000 samples\n
+        Aspirin: 211000 samples\n
+        Salicylic Acid: 320000 samples\n
+        Malonaldehyde: 993000 samples\n
+        Ethanol: 555000 samples\n
+        Toluene: 100000 samples\n
+
+    Usage:
     ```python
     from openqdc.datasets import RevMD17
     dataset = RevMD17()
     ```

     References:
-    - https://arxiv.org/abs/2007.09593
+        https://arxiv.org/abs/2007.09593
     """

     __name__ = "revmd17"
diff --git a/openqdc/datasets/potential/sn2_rxn.py b/openqdc/datasets/potential/sn2_rxn.py
index 29337573..2194775b 100644
--- a/openqdc/datasets/potential/sn2_rxn.py
+++ b/openqdc/datasets/potential/sn2_rxn.py
@@ -39,10 +39,12 @@ def extract_npz_entry(data):

 class SN2RXN(BaseDataset):
     """
-    This dataset probes chemical reactions of methyl halides with halide anions, i.e.
-    X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of
-    X,Y = F, Cl, Br, I. It contains energy and forces for 452709 conformations calculated
-    at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory.
+    This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and
+    contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by
+    running MD simulations at a temperature of 5000K with a time step of 0.1 fs using the Atomic Simulation
+    Environment (ASE). The forces are derived using the semi-empirical method PM7, structures are saved every 10
+    steps, and for each of them energies and forces are calculated at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory.
+    The dataset contains 452,709 structures along with energies, forces and dipole moments.

     Usage:
     ```python
@@ -51,8 +53,8 @@ class SN2RXN(BaseDataset):
     ```

     References:
-    - https://doi.org/10.1021/acs.jctc.9b00181
-    - https://zenodo.org/records/2605341
+        https://doi.org/10.1021/acs.jctc.9b00181\n
+        https://zenodo.org/records/2605341
     """

     __name__ = "sn2_rxn"
diff --git a/openqdc/datasets/potential/solvated_peptides.py b/openqdc/datasets/potential/solvated_peptides.py
index 4fead36f..f00e1a05 100644
--- a/openqdc/datasets/potential/solvated_peptides.py
+++ b/openqdc/datasets/potential/solvated_peptides.py
@@ -7,10 +7,10 @@

 class SolvatedPeptides(BaseDataset):
     """
-    The solvated protein fragments dataset probes many-body intermolecular
-    interactions between "protein fragments" and water molecules.
-    It contains energy and forces for 2731180 structures calculated
-    at the revPBE-D3(BJ)/def2-TZVP level of theory.
+    The solvated protein fragments dataset probes many-body intermolecular interactions between "protein fragments"
+    and water molecules. Geometries are first optimized with the semi-empirical method PM7, and then MD simulations
+    are run at 1000K with a time step of 0.1 fs using the Atomic Simulation Environment (ASE). Structures are saved
+    every 10 steps, and energies, forces and dipole moments are calculated at the revPBE-D3(BJ)/def2-TZVP level of
+    theory.
     Usage:
     ```python
@@ -19,8 +19,8 @@ class SolvatedPeptides(BaseDataset):
     ```

     References:
-    - https://doi.org/10.1021/acs.jctc.9b00181
-    - https://zenodo.org/records/2605372
+        https://doi.org/10.1021/acs.jctc.9b00181\n
+        https://zenodo.org/records/2605372
     """

     __name__ = "solvated_peptides"
diff --git a/openqdc/datasets/potential/spice.py b/openqdc/datasets/potential/spice.py
index 27525bb4..2f8cc36f 100644
--- a/openqdc/datasets/potential/spice.py
+++ b/openqdc/datasets/potential/spice.py
@@ -40,9 +40,12 @@ def read_record(r, obj):

 class Spice(BaseDataset):
     """
-    The Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
-    small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated
-    at the {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory.
+    Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
+    small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit,
+    and then molecular dynamics simulations for 100 ps at 500K using OpenMM and the Amber force field are used to
+    generate 100 high-energy conformations. Low-energy conformations are then generated by L-BFGS energy
+    minimization and molecular dynamics for 1 ps at 100K. Forces and energies for the conformations are calculated
+    at the wB97M-D3(BJ)/def2-TZVPPD level of theory.

     Usage:
     ```python
@@ -51,8 +54,8 @@ class Spice(BaseDataset):
     ```

     References:
-    - https://arxiv.org/abs/2209.10702
-    - https://github.com/openmm/spice-dataset
+        https://arxiv.org/abs/2209.10702\n
+        https://github.com/openmm/spice-dataset
     """

     __name__ = "spice"
@@ -96,10 +99,11 @@ def read_raw_entries(self):

 class SpiceV2(Spice):
     """
-    SpiceV2 dataset augmented with amino acids complexes, water boxes,
-    pubchem solvated molecules.
-    It consists of both forces and energies calculated
-    at the {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory.
+    SpiceV2 dataset augments the Spice data with amino acid complexes, water boxes, and PubChem solvated molecules.
+    The main changes include (1) over 13,000 new PubChem molecules, of which 1500 contain boron and 1900 contain
+    silicon, (2) 194,000 conformations of dimers containing amino acids and ligands, (3) 1000 water clusters to
+    improve the sampling of interactions in bulk water, (4) 1397 PubChem molecules solvated with a shell of water
+    molecules, and (5) fixes for bad calculations from the Spice dataset. The data generation process is the same
+    as for the Spice dataset.

     Usage:
     ```python
@@ -108,8 +112,8 @@ class SpiceV2(Spice):
     ```

     References:
-    - https://github.com/openmm/spice-dataset/releases/tag/2.0.0
-    - https://github.com/openmm/spice-dataset
+        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\n
+        https://github.com/openmm/spice-dataset
     """

     __name__ = "spicev2"
@@ -150,6 +154,20 @@ def read_raw_entries(self):

 class SpiceVL2(SpiceV2):
+    """
+    SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.
+
+    Usage:
+    ```python
+    from openqdc.datasets import SpiceVL2
+    dataset = SpiceVL2()
+    ```
+
+    References:
+        https://github.com/openmm/spice-dataset/releases/tag/2.0.0\n
+        https://github.com/openmm/spice-dataset
+    """
+
     __name__ = "spice_vl2"

     __energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]
diff --git a/openqdc/datasets/potential/tmqm.py b/openqdc/datasets/potential/tmqm.py
index 1da6901a..987fa10f 100644
--- a/openqdc/datasets/potential/tmqm.py
+++ b/openqdc/datasets/potential/tmqm.py
@@ -47,10 +47,10 @@ def read_xyz(fname, e_map):

 class TMQM(BaseDataset):
     """
-    The tmQM dataset contains the geometries of a large transition metal-organic
-    compound space with a large variety of organic ligands and 30 transition metals.
-    It contains energy labels for 86,665 mononuclear complexe calculated
-    at the TPSSh-D3BJ/def2-SV DFT level of theory.
+    tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of
+    organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated
+    at the TPSSh-D3BJ/def2-SV DFT level of theory. Structures are first extracted from the Cambridge Structural
+    Database and then optimized in the gas phase with the extended tight-binding method GFN2-xTB.

     Usage:
     ```python
@@ -59,8 +59,8 @@ class TMQM(BaseDataset):
     ```

     References:
-    - https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041
-    - https://github.com/bbskjelstad/tmqm
+        https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\n
+        https://github.com/bbskjelstad/tmqm
     """

     __name__ = "tmqm"
diff --git a/openqdc/datasets/potential/transition1x.py b/openqdc/datasets/potential/transition1x.py
index 8b5b4bc1..d15d71c1 100644
--- a/openqdc/datasets/potential/transition1x.py
+++ b/openqdc/datasets/potential/transition1x.py
@@ -39,9 +39,9 @@ def read_record(r, group):

 class Transition1X(BaseDataset):
     """
-    The Transition1x dataset contains structures from 10k organic reaction pathways of various types.
-    It contains DFT energy and force labels for 9.6 mio. conformers calculated at the
-    wB97x/6-31-G(d) level of theory.
+    Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy
+    and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and
+    the transition states are generated by running the Nudged Elastic Band (NEB) method with DFT.
     Usage:
     ```python
@@ -50,8 +50,8 @@ class Transition1X(BaseDataset):
     ```

     References:
-    - https://www.nature.com/articles/s41597-022-01870-w
-    - https://gitlab.com/matschreiner/Transition1x
+        https://www.nature.com/articles/s41597-022-01870-w\n
+        https://gitlab.com/matschreiner/Transition1x
     """

     __name__ = "transition1x"
diff --git a/openqdc/datasets/potential/vqm24.py b/openqdc/datasets/potential/vqm24.py
new file mode 100644
index 00000000..1710e1dd
--- /dev/null
+++ b/openqdc/datasets/potential/vqm24.py
@@ -0,0 +1,82 @@
+import os
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+
+
+def shape_atom_inputs(coords, atom_species):
+    xs = np.stack((atom_species, np.zeros_like(atom_species)), axis=-1)
+    return np.concatenate((xs, coords), axis=-1, dtype=np.float32)
+
+
+def read_npz_entry(raw_path):
+    samples = np.load(raw_path, allow_pickle=True)
+    # get name of file without extension
+    subset = os.path.basename(raw_path).split(".")[0]
+
+    # available npz fields: atoms, coordinates, compounds, graphs (SMILES),
+    # inchi, Etot, Eatomization
+    coordinates = np.concatenate(samples["coordinates"])
+    atom_species = np.concatenate(samples["atoms"]).ravel()
+    names = list(map(lambda x: x.split("_")[0], samples["compounds"]))
+    n_comps = len(names)
+
+    res = dict(
+        name=np.array(names),
+        subset=np.array([subset] * n_comps),
+        energies=samples["Etot"][:, None].astype(np.float64),
+        atomic_inputs=shape_atom_inputs(coordinates, atom_species),
+        n_atoms=np.array(list(map(lambda x: len(x), samples["coordinates"])), dtype=np.int32),
+    )
+    return res
+
+
+class VQM24(BaseDataset):
+    """
+    Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical
+    properties calculated at the ωB97X-D3/cc-pVDZ level of theory. This leads to 258,242 unique constitutional
+    isomers and 577,705 conformers of varying stoichiometries. Geometries are generated using GFN2-xTB and
+    relaxed with the DFT method ωB97X-D3/cc-pVDZ. The energy values are also calculated with the DFT method
+    ωB97X-D3/cc-pVDZ.
+
+    Usage:
+    ```python
+    from openqdc.datasets import VQM24
+    dataset = VQM24()
+    ```
+
+    Reference:
+        https://arxiv.org/abs/2405.05961
+    """
+
+    __name__ = "vqm24"
+
+    __energy_methods__ = [
+        PotentialMethod.WB97X_6_31G_D,  # "wb97x/6-31g(d)"
+    ]
+
+    energy_target_names = [
+        "ωB97x:6-31G(d) Energy",
+    ]
+    # ωB97X-D3/cc-pVDZ
+    __energy_unit__ = "hartree"
+    __distance_unit__ = "ang"
+    __forces_unit__ = "hartree/ang"
+    __links__ = {
+        f"{name}.npz": f"https://zenodo.org/records/11164951/files/{name}.npz?download=1"
+        for name in ["DFT_all", "DFT_saddles", "DFT_uniques", "DMC"]
+    }
+
+    def read_raw_entries(self):
+        samples = []
+        for name in self.__links__:
+            raw_path = p_join(self.root, f"{name}")
+            samples.append(read_npz_entry(raw_path))
+        return samples
diff --git a/openqdc/datasets/potential/waterclusters.py b/openqdc/datasets/potential/waterclusters.py
new file mode 100644
index 00000000..8c791474
--- /dev/null
+++ b/openqdc/datasets/potential/waterclusters.py
@@ -0,0 +1,175 @@
+from collections import defaultdict
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.package_utils import requires_package
+
+_default_basis_sets = {
+    "BEGDB_H2O": "aug-cc-pVQZ",
+    "WATER27": "aug-cc-pVQZ",
+    "H2O_alkali_clusters": "def2-QZVPPD",
+    "H2O_halide_clusters": "def2-QZVPPD",
+}
+
+
+@requires_package("monty")
+@requires_package("pymatgen")
+def read_geometries(fname, dataset):
+    from monty.serialization import loadfn
+
+    geometries = {k: v.to_ase_atoms() for k, v in loadfn(fname)[dataset].items()}
+    return geometries
+
+
+@requires_package("monty")
+def read_energies(fname, dataset):
+    from monty.serialization import loadfn
+
+    _energies = loadfn(fname)[dataset]
+    metadata_restrictions = {"basis_set": _default_basis_sets.get(dataset)}
+
+    functionals_to_return = []
+    for dfa, at_dfa_d in _energies.items():
+        functionals_to_return += [f"{dfa}" if dfa == at_dfa else f"{dfa}@{at_dfa}" for at_dfa in at_dfa_d]
+
+    energies = defaultdict(dict)
+    for f in functionals_to_return:
+        if "-FLOSIC" in f and "@" not in f:
+            func = f.split("-FLOSIC")[0]
+            at_f = "-FLOSIC"
+        else:
+            func = f.split("@")[0]
+            at_f = f.split("@")[-1]
+
+        if func not in _energies:
+            print(f"No functional {func} included in dataset " f"- available options:\n{', '.join(_energies.keys())}")
+        elif at_f not in _energies[func]:
+            print(
+                f"No @functional {at_f} included in {func} dataset "
+                f"- available options:\n{', '.join(_energies[func].keys())}"
+            )
+        else:
+            if isinstance(_energies[func][at_f], list):
+                for entry in _energies[func][at_f]:
+                    if all(entry["metadata"].get(k) == v for k, v in metadata_restrictions.items()):
+                        energies[f] = entry
+                        break
+            else:
+                energies[f] = _energies[func][at_f]
+    return dict(energies)
+
+
+def extract_desc(atom):
+    # Pull positions, atomic numbers, charges and the formula from an ASE Atoms object.
+    pos = atom.get_positions()
+    z = atom.get_atomic_numbers()
+    charges = atom.get_initial_charges()
+    formula = atom.get_chemical_formula()
+    return pos, z, charges, formula
+
+
+def format_geometry_and_entries(geometries, energies, subset):
+    entries_list = []
+    for entry, atoms in geometries.items():
+        pos, z, charges, formula = extract_desc(atoms)
+        energies_list = []
+        for level_of_theory, entry_en_dict in energies.items():
+            en = entry_en_dict.get(entry, np.nan)
+            energies_list.append(en)
+        energy_array = np.array(energies_list)
+        if subset in ["WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"]:
"H2O_alkali_clusters", "H2O_halide_clusters"]: + # only the first 9 energies are available + energy_array.resize(19) + energy_array[energy_array == 0] = np.nan + res = dict( + atomic_inputs=np.concatenate( + (np.hstack((z[:, None], charges[:, None])), pos), axis=-1, dtype=np.float32 + ).reshape(-1, 5), + name=np.array([formula]), + energies=np.array(energy_array, dtype=np.float64).reshape(1, -1), + n_atoms=np.array([pos.shape[0]], dtype=np.int32), + subset=np.array([subset]), + ) + entries_list.append(res) + return entries_list + + +class SCANWaterClusters(BaseDataset): + """ + The SCAN Water Clusters dataset contains conformations of + neutral water clusters containing up to 20 monomers, charged water clusters, + and alkali- and halide-water clusters. This dataset consists of our data sets of water clusters: + the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER2723 set of 14 + neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of + ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F−, Cl−, or Br−. + Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics + simulations using AMBER 9 and optimized to obtain + lowest energy isomers were determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies. + + + Chemical Species: + [H, O, Li, Na, K, F, Cl, Br] + + Usage: + ```python + from openqdc.datasets import SCANWaterClusters + dataset = SCANWaterClusters() + ``` + + References: + https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\n + https://github.com/esoteric-ephemera/water_cluster_density_errors + """ + + __name__ = "scanwaterclusters" + + __energy_unit__ = "hartree" + __distance_unit__ = "ang" + __forces_unit__ = "hartree/ang" + energy_target_names = [ + "HF", + "HF-r2SCAN-DC4", + "SCAN", + "SCAN@HF", + "SCAN@r2SCAN50", + "r2SCAN", + "r2SCAN@HF", + "r2SCAN@r2SCAN50", + "r2SCAN50", + "r2SCAN100", + "r2SCAN10", + "r2SCAN20", + "r2SCAN25", + "r2SCAN30", + "r2SCAN40", + "r2SCAN60", + "r2SCAN70", + "r2SCAN80", + "r2SCAN90", + ] + __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))] + force_target_names = [] + # 27 # 9 level + subsets = ["BEGDB_H2O", "WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"] + __links__ = { + "geometries.json.gz": "https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True", # noqa + "total_energies.json.gz": "https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True", # noqa + } + + def read_raw_entries(self): + entries = [] # noqa + for i, subset in enumerate(self.subsets): + geometries = read_geometries(p_join(self.root, "geometries.json.gz"), subset) + energies = read_energies(p_join(self.root, "total_energies.json.gz"), subset) + datum = {} + for k in energies: + _ = energies[k].pop("metadata") + datum[k] = energies[k]["total_energies"] + entries.extend(format_geometry_and_entries(geometries, datum, subset)) + return entries diff --git a/openqdc/datasets/potential/waterclusters3_30.py b/openqdc/datasets/potential/waterclusters3_30.py index f4c7d88e..a52b9e17 100644 --- a/openqdc/datasets/potential/waterclusters3_30.py +++ b/openqdc/datasets/potential/waterclusters3_30.py @@ -53,6 +53,10 @@ class WaterClusters(BaseDataset): clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with the TTM2.1-F ab-initio based interaction potential for water. 
+    Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.
+
+    Chemical Species:
+        [H, O]

     Usage:
     ```python
@@ -61,8 +65,8 @@ class WaterClusters(BaseDataset):
     ```

     References:
-    - https://doi.org/10.1063/1.5128378
-    - https://sites.uw.edu/wdbase/database-of-water-clusters/
+        https://doi.org/10.1063/1.5128378\n
+        https://sites.uw.edu/wdbase/database-of-water-clusters/
     """

     __name__ = "waterclusters3_30"
diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py
index d471387b..6b1adeb5 100644
--- a/openqdc/datasets/statistics.py
+++ b/openqdc/datasets/statistics.py
@@ -2,7 +2,7 @@
 from copy import deepcopy
 from dataclasses import asdict, dataclass
 from os.path import join as p_join
-from typing import Optional
+from typing import Callable, Dict, Optional

 import numpy as np
 from loguru import logger
@@ -17,9 +17,15 @@ class StatisticsResults:
     """

     def to_dict(self):
+        """
+        Convert the class to a dictionary
+        """
         return asdict(self)

-    def transform(self, func):
+    def transform(self, func: Callable):
+        """
+        Apply a function to all the attributes of the class
+        """
         for k, v in self.to_dict().items():
             if v is not None:
                 setattr(self, k, func(v))
@@ -55,6 +61,14 @@ class StatisticManager:
     """

     def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
+        """
+        dataset : openqdc.datasets.base.BaseDataset
+            The dataset object to compute the statistics for
+        recompute : bool, default = False
+            Flag to recompute the statistics
+        *statistic_calculators : AbstractStatsCalculator
+            Statistic calculators to run
+        """
         self._state = {}
         self._results = {}
         self._statistic_calculators = [
@@ -63,7 +77,7 @@ def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "Ab
         ]

     @property
-    def state(self) -> dict:
+    def state(self) -> Dict:
         """
         Return the dictionary state of the manager
         """
@@ -120,7 +134,7 @@ class AbstractStatsCalculator(ABC):
     """
     Abstract class that defines the interface for all
     the calculators object and the methods to
-    compute the statistics
+    compute the statistics.
     """

     # State Dependencies of the calculator to skip part of the calculation
@@ -140,6 +154,28 @@ def __init__(
         atom_charges: Optional[np.ndarray] = None,
         forces: Optional[np.ndarray] = None,
     ):
+        """
+        name : str
+            Name of the dataset for saving and loading.
+        energy_type : str, default = None
+            Type of the energy for the computation of the statistics. Used for loading and saving.
+ force_recompute : bool, default = False + Flag to force the recomputation of the statistics + energies : np.ndarray, default = None + Energies of the dataset + n_atoms : np.ndarray, default = None + Number of atoms in the dataset + atom_species : np.ndarray, default = None + Atomic species of the dataset + position_idx_range : np.ndarray, default = None + Position index range of the dataset + e0_matrix : np.ndarray, default = None + Isolated atom energies matrix of the dataset + atom_charges : np.ndarray, default = None + Atomic charges of the dataset + forces : np.ndarray, default = None + Forces of the dataset + """ self.name = name self.energy_type = energy_type self.force_recompute = force_recompute @@ -149,6 +185,7 @@ def __init__( self.e0_matrix = e0_matrix self.n_atoms = n_atoms self.atom_species_charges_tuple = (atom_species, atom_charges) + self._root = p_join(get_local_cache(), self.name) if atom_species is not None and atom_charges is not None: # by value not reference self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1) @@ -159,7 +196,7 @@ def has_forces(self) -> bool: @property def preprocess_path(self): - path = p_join(self.root, "preprocessed", str(self) + ".pkl") + path = p_join(self.root, "statistics", self.name + f"_{str(self)}" + ".pkl") return path @property @@ -167,14 +204,14 @@ def root(self): """ Path to the dataset folder """ - return p_join(get_local_cache(), self.name) + return self._root @classmethod def from_openqdc_dataset(cls, dataset, recompute: bool = False): """ - Create a calculator object from a dataset object + Create a calculator object from a dataset object. """ - return cls( + obj = cls( name=dataset.__name__, force_recompute=recompute, energy_type=dataset.energy_type, @@ -186,6 +223,8 @@ def from_openqdc_dataset(cls, dataset, recompute: bool = False): atom_charges=dataset.data["atomic_inputs"][:, 1].ravel(), e0_matrix=dataset.__isolated_atom_energies__, ) + obj._root = dataset.root # set to the dataset root in case of multiple datasets + return obj @abstractmethod def compute(self) -> StatisticsResults: @@ -214,7 +253,7 @@ def attempt_load(self) -> bool: logger.warning(f"Statistics for {str(self)} not found. Computing...") return False - def _setup_deps(self, state: dict) -> None: + def _setup_deps(self, state: Dict) -> None: """ Check if the dependencies of calculators are satisfied from the state object and set the attributes of the calculator @@ -226,7 +265,7 @@ def _setup_deps(self, state: dict) -> None: for dep in self.state_dependency: setattr(self, dep, state[dep]) - def write_state(self, update: dict) -> None: + def write_state(self, update: Dict) -> None: """ Write/update the state dictionary with the update dictionary @@ -235,7 +274,7 @@ def write_state(self, update: dict) -> None: """ self.state.update(update) - def run(self, state: dict) -> None: + def run(self, state: Dict) -> None: """ Main method to run the calculator. 
Setup the dependencies from the state dictionary
diff --git a/openqdc/datasets/structure.py b/openqdc/datasets/structure.py
new file mode 100644
index 00000000..f6dc077e
--- /dev/null
+++ b/openqdc/datasets/structure.py
@@ -0,0 +1,276 @@
+import pickle as pkl
+from abc import ABC, abstractmethod
+from os import PathLike
+from os.path import join as p_join
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import zarr
+
+from openqdc.utils.io import pull_locally
+
+
+class GeneralStructure(ABC):
+    """
+    Abstract factory class for dataset storage types in the openQDC package.
+    """
+
+    _ext: Optional[str] = None
+    _extra_files: Optional[List[str]] = None
+
+    @property
+    def ext(self):
+        return self._ext
+
+    @property
+    @abstractmethod
+    def load_fn(self) -> Callable:
+        """
+        Function to use for loading the data.
+        Must be implemented by the child class.
+
+        Returns:
+            the function to use for loading the data
+        """
+        raise NotImplementedError
+
+    def add_extension(self, filename: str) -> str:
+        """
+        Add the correct extension to a filename
+
+        Parameters:
+            filename: the filename to add the extension to
+
+        Returns:
+            the filename with the extension
+        """
+        return filename + self.ext
+
+    @abstractmethod
+    def save_preprocess(
+        self,
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        data_dict: Dict[str, np.ndarray],
+        extra_data_keys: List[str],
+        extra_data_types: Dict[str, type],
+    ) -> List[str]:
+        """
+        Save the preprocessed data to the cache directory and optionally upload it to the remote storage.
+        Must be implemented by the child class.

+        Parameters:
+            preprocess_path: path to the preprocessed data file
+            data_keys: list of keys to load from the data file
+            data_dict: dictionary of data to save
+            extra_data_keys: list of keys to load from the extra data file
+            extra_data_types: dictionary of data types for each key
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def load_extra_files(
+        self,
+        data: Dict[str, np.ndarray],
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        pkl_data_keys: List[str],
+        overwrite: bool,
+    ):
+        """
+        Load extra files required to define other types of data.
+        Must be implemented by the child class.
+
+        Parameters:
+            data: dictionary of data to load
+            preprocess_path: path to the preprocessed data file
+            data_keys: list of keys to load from the data file
+            pkl_data_keys: list of keys to load from the extra files
+            overwrite: whether to overwrite the local cache
+        """
+        raise NotImplementedError
+
+    def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:
+        """
+        Join a path and a filename and add the correct extension.
+
+        Parameters:
+            path: the path to join
+            filename: the filename to join
+
+        Returns:
+            the joined path with the correct extension
+        """
+        return p_join(path, self.add_extension(filename))
+
+    def load_data(
+        self,
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        data_types: Dict[str, np.dtype],
+        data_shapes: Dict[str, Tuple[int, int]],
+        extra_data_keys: List[str],
+        overwrite: bool,
+    ):
+        """
+        Main method to load the data from a filetype structure like memmap or zarr.
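+        Arrays listed in `data_keys` are pulled to the local cache if needed, opened
+        with `load_fn`, unpacked, and reshaped before the extra files are loaded.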
+
+    def load_data(
+        self,
+        preprocess_path: Union[str, PathLike],
+        data_keys: List[str],
+        data_types: Dict[str, np.dtype],
+        data_shapes: Dict[str, Tuple[int, int]],
+        extra_data_keys: List[str],
+        overwrite: bool,
+    ):
+        """
+        Main method to load the data from a filetype structure like memmap or zarr.
+
+        Parameters:
+            preprocess_path: path to the preprocessed data file
+            data_keys: list of keys to load from the data file
+            data_types: dictionary of data types for each key
+            data_shapes: dictionary of shapes for each key
+            extra_data_keys: list of keys to load from the extra data file
+            overwrite: whether to overwrite the local cache
+        """
+        data = {}
+        for key in data_keys:
+            filename = self.join_and_ext(preprocess_path, key)
+            pull_locally(filename, overwrite=overwrite)
+            data[key] = self.load_fn(filename, mode="r", dtype=data_types[key])
+            data[key] = self.unpack(data[key])
+            data[key] = data[key].reshape(*data_shapes[key])
+
+        data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)
+        return data
+
+    def unpack(self, data: any) -> any:
+        """
+        Unpack the data from the loaded file.
+
+        Parameters:
+            data: the data to unpack
+
+        Returns:
+            the unpacked data
+        """
+        return data
+
+
+class MemMapDataset(GeneralStructure):
+    """
+    Dataset structure for memory-mapped numpy arrays and props.pkl files.
+    """
+
+    _ext = ".mmap"
+    _extra_files = ["props.pkl"]
+
+    @property
+    def load_fn(self):
+        return np.memmap
+
+    def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:
+        local_paths = []
+        for key in data_keys:
+            local_path = self.join_and_ext(preprocess_path, key)
+            out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
+            out[:] = data_dict.pop(key)[:]
+            out.flush()
+            local_paths.append(local_path)
+
+        # save smiles and subset
+        local_path = p_join(preprocess_path, "props.pkl")
+
+        # assert that (required) pkl keys are present in data_dict
+        assert all([key in data_dict.keys() for key in extra_data_keys])
+
+        # store unique and inverse indices for str-based pkl keys
+        for key in extra_data_keys:
+            if extra_data_types[key] == str:
+                data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+        with open(local_path, "wb") as f:
+            pkl.dump(data_dict, f)
+
+        local_paths.append(local_path)
+        return local_paths
+
+    def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):
+        filename = p_join(preprocess_path, "props.pkl")
+        pull_locally(filename, overwrite=overwrite)
+        with open(filename, "rb") as f:
+            tmp = pkl.load(f)
+            all_pkl_keys = set(tmp.keys()) - set(data_keys)
+            # assert required pkl_keys are present in all_pkl_keys
+            assert all([key in all_pkl_keys for key in pkl_data_keys])
+            for key in all_pkl_keys:
+                x = tmp.pop(key)
+                if len(x) == 2:
+                    # a (unique values, inverse indices) tuple saved for str-based keys
+                    data[key] = x[0][x[1]]
+                else:
+                    data[key] = x
+        return data
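+
+
+# Illustrative MemMapDataset round trip (variable names and the path are
+# hypothetical, not part of the package API):
+#
+#   structure = MemMapDataset()
+#   structure.save_preprocess(
+#       preprocess_path="cache/my_dataset/preprocessed",
+#       data_keys=["energies"],
+#       data_dict={"energies": energies, "name": names, "subset": subsets},
+#       extra_data_keys=["name", "subset"],
+#       extra_data_types={"name": str, "subset": str},
+#   )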
+ """ + + _ext = ".zip" + _extra_files = ["metadata.zip"] + _zarr_version = 2 + + @property + def load_fn(self): + return zarr.open + + def unpack(self, data): + return data[:] + + def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]: + # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True) + local_paths = [] + for key, value in data_dict.items(): + if key not in data_keys: + continue + zarr_path = self.join_and_ext(preprocess_path, key) + value = data_dict.pop(key) + z = zarr.open( + zarr.storage.ZipStore(zarr_path), + "w", + zarr_version=self._zarr_version, + shape=value.shape, + dtype=value.dtype, + ) + z[:] = value[:] + local_paths.append(zarr_path) + # if key in attrs: + # z.attrs.update(attrs[key]) + + metadata = p_join(preprocess_path, "metadata.zip") + + group = zarr.group(zarr.storage.ZipStore(metadata)) + + for key in extra_data_keys: + if extra_data_types[key] == str: + data_dict[key] = np.unique(data_dict[key], return_inverse=True) + + for key, value in data_dict.items(): + # sub=group.create_group(key) + if key in ["name", "subset"]: + data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype) + data[:] = value[0][:] + data2 = group.create_dataset(key + "_ptr", shape=value[1].shape, dtype=np.int32) + data2[:] = value[1][:] + else: + data = group.create_dataset(key, shape=value.shape, dtype=value.dtype) + data[:] = value[:] + local_paths.append(metadata) + return local_paths + + def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite): + filename = self.join_and_ext(preprocess_path, "metadata") + pull_locally(filename, overwrite=overwrite) + tmp = self.load_fn(filename) + all_pkl_keys = set(tmp.keys()) - set(data_keys) + # assert required pkl_keys are present in all_pkl_keys + assert all([key in all_pkl_keys for key in pkl_data_keys]) + for key in all_pkl_keys: + if key not in pkl_data_keys: + data[key] = tmp[key][:][tmp[key][:]] + else: + data[key] = tmp[key][:] + return data + + # TODO: checksum , maybe convert to archive instead of zips diff --git a/openqdc/methods/atom_energies.py b/openqdc/methods/atom_energies.py index fed41dc4..523ff171 100644 --- a/openqdc/methods/atom_energies.py +++ b/openqdc/methods/atom_energies.py @@ -1,6 +1,6 @@ import ast import pkgutil -from typing import Tuple +from typing import Dict, Tuple import numpy as np from loguru import logger @@ -18,19 +18,15 @@ atom_energy_collection = {k.lower(): v for k, v in atom_energy_collection.items()} -def to_e_matrix(atom_energies: dict) -> np.ndarray: +def to_e_matrix(atom_energies: Dict) -> np.ndarray: """ Get the matrix of isolated atom energies for a dict of non-null values calculates - Parameters - ---------- - atom_energies: dict - Dict of energies computed for a given QM method. - Keys are pairs of (atom, charge) and values are energy values + Parameters: + atom_energies: Dict of energies computed for a given QM method. 
diff --git a/openqdc/methods/atom_energies.py b/openqdc/methods/atom_energies.py
index fed41dc4..523ff171 100644
--- a/openqdc/methods/atom_energies.py
+++ b/openqdc/methods/atom_energies.py
@@ -1,6 +1,6 @@
 import ast
 import pkgutil
-from typing import Tuple
+from typing import Dict, Tuple
 
 import numpy as np
 from loguru import logger
@@ -18,19 +18,15 @@
 atom_energy_collection = {k.lower(): v for k, v in atom_energy_collection.items()}
 
 
-def to_e_matrix(atom_energies: dict) -> np.ndarray:
+def to_e_matrix(atom_energies: Dict) -> np.ndarray:
     """
-    Get the matrix of isolated atom energies for a dict of non-null values calculates
+    Get the matrix of isolated atom energies for a dict of non-null values.
 
-    Parameters
-    ----------
-    atom_energies: dict
-        Dict of energies computed for a given QM method.
-        Keys are pairs of (atom, charge) and values are energy values
+    Parameters:
+        atom_energies: Dict of energies computed for a given QM method.
+            Keys are pairs of (atom, charge) and values are energy values
 
-    Returns
-    -------
-    np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
+    Returns:
+        np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
         Matrix containing the isolated atom energies for each atom and charge
         written in the form:
 
                         |   | -2 | -1 | 0 | +1 | +2 | <- charges
diff --git a/openqdc/utils/download_api.py b/openqdc/utils/download_api.py
index c96f3d91..c73c752d 100644
--- a/openqdc/utils/download_api.py
+++ b/openqdc/utils/download_api.py
@@ -14,7 +14,9 @@
 import gdown
 import requests
 import tqdm
-from aiohttp import ClientTimeout
+
+# from aiohttp import ClientTimeout
+from dotenv import load_dotenv
 from fsspec import AbstractFileSystem
 from fsspec.callbacks import TqdmCallback
 from fsspec.implementations.local import LocalFileSystem
@@ -27,25 +29,39 @@
 @dataclass
 class FileSystem:
     """
-    A class to handle file system operations
+    A basic class to handle file system operations
     """
 
     public_endpoint: Optional[AbstractFileSystem] = None
     private_endpoint: Optional[AbstractFileSystem] = None
     local_endpoint: AbstractFileSystem = LocalFileSystem()
 
+    def __init__(self):
+        load_dotenv()  # load environment variables from .env
+        self.KEY = os.getenv("CLOUDFARE_KEY", None)
+        self.SECRET = os.getenv("CLOUDFARE_SECRET", None)
+
     @property
     def public(self):
+        """
+        Return the public remote filesystem with read permission
+        """
         self.connect()
         return self.public_endpoint
 
     @property
     def private(self):
+        """
+        Return the private remote filesystem with write permission
+        """
         self.connect()
         return self.private_endpoint
 
     @property
     def local(self):
+        """
+        Return the local filesystem
+        """
         return self.local_endpoint
 
     @property
@@ -57,23 +73,29 @@ def is_connected(self):
 
     def connect(self):
         """
-        Attempt connection to the public and private endpoints
+        Attempt connection to the public and private remote endpoints
        """
         if not self.is_connected:
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")  # No quota warning
                 self.public_endpoint = self.get_default_endpoint("public")
                 self.private_endpoint = self.get_default_endpoint("private")
-            self.public_endpoint.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)}
+            # self.public_endpoint.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)}
 
     def get_default_endpoint(self, endpoint: str) -> AbstractFileSystem:
         """
         Return a default endpoint for the given str [public, private]
         """
         if endpoint == "private":
-            return fsspec.filesystem("gs")
+            return fsspec.filesystem(
+                "s3",
+                key=self.KEY,
+                secret=self.SECRET,
+                endpoint_url=ioqdc.request_s3fs_config()["endpoint_url"],
+            )
         elif endpoint == "public":
-            return fsspec.filesystem("https")
+            # return fsspec.filesystem("https")
+            return fsspec.filesystem("s3", **ioqdc.request_s3fs_config())
         else:
             return self.local_endpoint
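The private endpoint now authenticates with credentials read through python-dotenv, so write access expects a local .env file defining CLOUDFARE_KEY and CLOUDFARE_SECRET (spelling as in the code above). A minimal sanity check, assuming such a file exists in the working directory:

    import os
    from dotenv import load_dotenv

    load_dotenv()  # reads .env from the working directory
    assert os.getenv("CLOUDFARE_KEY") and os.getenv("CLOUDFARE_SECRET")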
diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py
index 5e039960..08f69e1a 100644
--- a/openqdc/utils/io.py
+++ b/openqdc/utils/io.py
@@ -3,6 +3,8 @@
 import json
 import os
 import pickle as pkl
+
+# from os.path import join as p_join
 from typing import Dict, List, Optional
 
 import fsspec
@@ -23,6 +25,12 @@
     "~/.cache/openqdc" if "OPENQDC_CACHE_DIR" not in os.environ else os.path.normpath(os.environ["OPENQDC_CACHE_DIR"])
 )
 
+_OPENQDC_DOWNLOAD_API = {
+    "s3": "/openqdc/v1",
+    # "https" : "https://storage.openqdc.org/v1",
+    "gs": "https://storage.googleapis.com/qmdata-public/openqdc",
+}
+
 
 def set_cache_dir(d):
     r"""
@@ -54,9 +62,11 @@ def get_remote_cache(write_access=False) -> str:
     """
     Returns the entry point based on the write access.
     """
     if write_access:
-        remote_cache = "gs://qmdata-public/openqdc"
+        remote_cache = "openqdc/v1"  # "gs://qmdata-public/openqdc"
     else:
-        remote_cache = "https://storage.googleapis.com/qmdata-public/openqdc"
+        remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get("OPENQDC_DOWNLOAD_API", "s3"))
     return remote_cache
@@ -78,6 +88,7 @@ def pull_locally(local_path, overwrite=False):
     """
     Retrieve file from remote gs path or local cache
     """
+    remote_path = local_path.replace(get_local_cache(), get_remote_cache())
     os.makedirs(os.path.dirname(local_path), exist_ok=True)
     if not os.path.exists(local_path) or overwrite:
@@ -85,6 +96,15 @@
     return local_path
 
 
+def request_s3fs_config():
+    import httpx
+
+    response = httpx.get("https://storage.openqdc.org/config.json")
+    response.raise_for_status()
+    config = response.json()
+    return config
+
+
 def copy_exists(local_path):
     remote_path = local_path.replace(get_local_cache(), get_remote_cache())
     return os.path.exists(local_path) or API.exists(remote_path)
@@ -150,8 +170,8 @@ def load_hdf5_file(hdf5_file_path: str):
 
     # inorder to enable multiprocessing:
     # https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801
-    fsspec.asyn.iothread[0] = None
-    fsspec.asyn.loop[0] = None
+    # fsspec.asyn.iothread[0] = None
+    # fsspec.asyn.loop[0] = None
 
     return file
@@ -177,7 +197,7 @@ def load_xyz(path):
     return MolFromXYZFile(path)
 
 
-def dict_to_atoms(d: dict, ext: bool = False, energy_method: int = 0) -> Atoms:
+def dict_to_atoms(d: Dict, ext: bool = False, energy_method: int = 0) -> Atoms:
     """
     Converts dictionary to ase atoms object
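With these changes, pull_locally derives the remote location by prefix substitution on the cached path; under the default OPENQDC_DOWNLOAD_API=s3 the mapping looks like this (paths illustrative):

    # local:  ~/.cache/openqdc/qm7/energies.mmap
    # remote: /openqdc/v1/qm7/energies.mmap   (local cache prefix replaced by the s3 entry)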
diff --git a/openqdc/utils/package_utils.py b/openqdc/utils/package_utils.py
index 990f6cb3..e1381dad 100644
--- a/openqdc/utils/package_utils.py
+++ b/openqdc/utils/package_utils.py
@@ -1,3 +1,4 @@
+# from openFF package
 import importlib
 from functools import wraps
 from typing import Any, Callable, TypeVar
diff --git a/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py
deleted file mode 100644
index 1171a68c..00000000
--- a/openqdc/utils/preprocess.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Dataset preprocessing."""
-
-import click
-import numpy as np
-from loguru import logger
-
-from openqdc import AVAILABLE_DATASETS
-
-options = list(AVAILABLE_DATASETS.values())
-options_map = {d.__name__.lower(): d for d in options}
-
-
-@click.command()
-@click.option("--dataset", "-d", type=str, default="ani1", help="Dataset name or index.")
-@click.option("--upload", "-u", type=bool, default=False, help="Try to upload it to the remote storage.")
-def preprocess(dataset, upload):
-    if dataset not in options_map:
-        dataset_id = int(dataset)
-        data_class = options[dataset_id]
-    else:
-        data_class = options_map[dataset]
-
-    data_class.no_init().preprocess(upload=upload, overwrite=True)
-    data = data_class()
-    logger.info(f"Preprocessing {data.__name__}")
-
-    n = len(data)
-    for i in np.random.choice(n, 3, replace=False):
-        x = data[i]
-        print(x.name, x.subset, end=" ")
-        for k in x:
-            if isinstance(x[k], np.ndarray):
-                print(k, x[k].shape, end=" ")
-        print()
-
-
-if __name__ == "__main__":
-    preprocess()
diff --git a/openqdc/utils/regressor.py b/openqdc/utils/regressor.py
index 1d3e50ad..0c23d9b4 100644
--- a/openqdc/utils/regressor.py
+++ b/openqdc/utils/regressor.py
@@ -7,8 +7,6 @@
 import pandas as pd
 from loguru import logger
 
-SubSampleFrac = Union[float, int]
-
 
 def non_nan_idxs(array):
     """
@@ -24,7 +22,18 @@ class Solver(ABC):
 
     @staticmethod
     @abstractmethod
-    def solve(X, Y):
+    def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+        """
+        Main method to solve the regression problem.
+        Must be implemented in all the subclasses.
+
+        Parameters:
+            X: Input features of shape (n_samples, n_species)
+            Y: Target values of shape (n_samples,) (energy values for the regression)
+
+        Returns:
+            Tuple of the fitted isolated atom energies and the estimated uncertainty.
+        """
         pass
 
     def __call__(self, X, Y):
@@ -38,7 +47,26 @@ def __repr__(self):
 
 
 class Regressor:
-    """Regressor class for preparing and solving regression problem for isolated atom energies."""
+    """
+    Regressor class for preparing and solving regression problem for isolated atom energies.
+    An isolated atom energy regression problem is defined as:\n
+    X = [n_samples, n_species] (number of atoms of each species per sample)\n
+    Y = [n_samples, ] (energies)\n
+    The regression problem is solved by solving the linear system X E0 = Y.
+
+    Example:
+        For a system of 2 samples (H2O, CH4)\n
+        n_species = 3, n_samples = 2\n
+        H2O = 2H, 1O -> X = [2, 1, 0]\n
+        CH4 = 1C, 4H -> X = [4, 0, 1]\n
+        X = [[2, 1, 0],
+             [4, 0, 1]]\n
+        Y = [10, 20]\n
+        X E0 = Y\n
+        Linear system to solve:\n
+        [[2 eH + 1 eO + 0 eC],
+         [4 eH + 0 eO + 1 eC]] = [10, 20]\n
+    """
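+    # Worked check of the docstring example (illustrative, using numpy's
+    # least squares to solve X E0 = Y for the per-species energies):
+    #   >>> import numpy as np
+    #   >>> X = np.array([[2.0, 1.0, 0.0], [4.0, 0.0, 1.0]])
+    #   >>> Y = np.array([10.0, 20.0])
+    #   >>> E0, *_ = np.linalg.lstsq(X, Y, rcond=None)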
 
     solver: Solver
 
     def __init__(
         self,
         energies: np.ndarray,
         atomic_numbers: np.ndarray,
         position_idx_range: np.ndarray,
         solver_type: str = "linear",
         stride: int = 1,
-        subsample: Optional[SubSampleFrac] = None,
+        subsample: Optional[Union[float, int]] = None,
         remove_nan: bool = True,
-        *args,
-        **kwargs,
+        *args: any,
+        **kwargs: any,
     ):
         """
-        Parameters
-        ----------
-        energies
-            numpy array of energies in the shape (n_samples, n_energy_methods)
-        atomic_numbers
-            numpy array of atomic numbers in the shape (n_atoms,)
-        position_idx_range
-            array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset
-        stride
-            Stride to use for the regression.
-        subsample
-            Sumsample the dataset. If a float, it is interpreted as a fraction of the dataset to use.
-            If >1 it is interpreted as the number of samples to use.
-        remove_nan
-            Sanitize the dataset by removing energies samples with NaN values.
+        Regressor class for preparing and solving regression problem for isolated atom energies.
+
+        Parameters:
+            energies:
+                numpy array of energies in the shape (n_samples, n_energy_methods)
+            atomic_numbers:
+                numpy array of atomic numbers in the shape (n_atoms,)
+            position_idx_range:
+                array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset
+            solver_type: Type of solver to use. ["linear", "ridge"]
+            stride: Stride to use for the regression.
+            subsample: Subsample the dataset.
+                If a float, it is interpreted as a fraction of the dataset to use.
+                If >1 it is interpreted as the number of samples to use.
+            remove_nan: Sanitize the dataset by removing energies samples with NaN values.
+            *args: Additional arguments to be passed to the regressor.
+            **kwargs: Additional keyword arguments to be passed to the regressor.
         """
         self.subsample = subsample
         self.stride = stride
@@ -87,7 +117,19 @@
     @classmethod
-    def from_openqdc_dataset(cls, dataset, *args, **kwargs):
+    def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> "Regressor":
+        """
+        Initialize the regressor object from an openqdc dataset. This is the default method.
+        *args and **kwargs are passed to the __init__ method and depend on the specific regressor.
+
+        Parameters:
+            dataset: openqdc dataset object.
+            *args: Additional arguments to be passed to the regressor.
+            **kwargs: Additional keyword arguments to be passed to the regressor.
+
+        Returns:
+            Instance of the regressor class.
+        """
         energies = dataset.data["energies"]
         position_idx_range = dataset.data["position_idx_range"]
         atomic_numbers = dataset.data["atomic_inputs"][:, 0].astype("int32")
@@ -116,12 +158,11 @@ def _downsample(self):
         self.update_hparams({"idxs": idxs})
 
     def _get_solver(self):
-        if self.solver_type == "linear":
+        try:
+            return AVAILABLE_SOLVERS[self.solver_type]()
+        except KeyError:
+            logger.warning(f"Unknown solver type {self.solver_type}, defaulting to linear regression.")
             return LinearSolver()
-        elif self.solver_type == "ridge":
-            return RidgeSolver()
-        logger.warning(f"Unknown solver type {self.solver_type}, defaulting to linear regression.")
-        return LinearSolver()
 
     def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:
         logger.info("Preparing inputs for regression.")
@@ -137,6 +178,9 @@ def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:
         self.y = B
 
     def solve(self):
+        """
+        Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.
+        """
         logger.info(f"Solving regression with {self.solver}.")
         E0_list, cov_list = [], []
         for energy_idx in range(self.y.shape[1]):
@@ -157,6 +201,11 @@ def __call__(self):
 
 
 def atom_standardization(X, y):
+    """
+    Standardize the energies and the atom counts.
+    This will make the calculated uncertainty more
+    meaningful.
+    """
     X_norm = X.sum()
     X = X / X_norm
     y = y / X_norm
@@ -165,7 +214,14 @@
 
 class LinearSolver(Solver):
-    _regr_str = "LinearRegression"
+    """
+    Linear regression solver.
+
+    Note:
+        No uncertainty is returned, as it is typically negligible.
+    """
+
+    _regr_str = "linear"
 
     @staticmethod
     def solve(X, y):
@@ -175,7 +231,11 @@
 
 class RidgeSolver(Solver):
-    _regr_str = "RidgeRegression"
+    """
+    Ridge regression solver.
+    """
+
+    _regr_str = "ridge"
 
     @staticmethod
     def solve(X, y):
@@ -189,3 +249,10 @@
         cov = np.sqrt(sigma2 * np.einsum("ij,kj,kl,li->i", Ainv, X, X, Ainv))
         mean = mean + y_mean.reshape([-1])
         return mean, cov
+
+
+AVAILABLE_SOLVERS = {
+    cls._regr_str: cls
+    for str_name, cls in globals().items()
+    if isinstance(cls, type) and issubclass(cls, Solver) and str_name != "Solver"  # Exclude the base class
+}
diff --git a/openqdc/utils/units.py b/openqdc/utils/units.py
index d8613a58..898faf8a 100644
--- a/openqdc/utils/units.py
+++ b/openqdc/utils/units.py
@@ -1,11 +1,14 @@
 """
-Unit conversion utils.
+Units conversion utilities module.
 
-Energy units:
-    ["kcal/mol", "kj/mol", "hartree", "ev"]
+Available Energy units:
+    ["kcal/mol", "kj/mol", "hartree", "ev", "mev", "ryd"]
 
-Distance units:
+Available Distance units:
     ["ang", "nm", "bohr"]
+
+Available Force units:
+    Combinations between Energy and Distance units
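+
+Example (the factor follows the "ev" -> "kcal/mol" entry registered below):
+    >>> from openqdc.utils.units import get_conversion
+    >>> get_conversion("ev", "kcal/mol")(1.0)
+    23.0605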
 """
 
 from enum import Enum, unique
@@ -40,7 +43,16 @@ class EnergyTypeConversion(ConversionEnum, StrEnum):
     MEV = "mev"
     RYD = "ryd"
 
-    def to(self, energy: "EnergyTypeConversion"):
+    def to(self, energy: "EnergyTypeConversion") -> Callable[[float], float]:
+        """
+        Get the conversion function to convert the energy to the desired units.
+
+        Parameters:
+            energy: energy unit to convert to
+
+        Returns:
+            callable to convert the energy to the desired units
+        """
         return get_conversion(str(self), str(energy))
 
 
@@ -54,7 +66,17 @@ class DistanceTypeConversion(ConversionEnum, StrEnum):
     NM = "nm"
     BOHR = "bohr"
 
-    def to(self, distance: "DistanceTypeConversion", fraction: bool = False):
+    def to(self, distance: "DistanceTypeConversion", fraction: bool = False) -> Callable[[float], float]:
+        """
+        Get the conversion function to convert the distance to the desired units.
+
+        Parameters:
+            distance: distance unit to convert to
+            fraction: whether to convert the distance itself (distance^1) or its inverse (distance^-1)
+
+        Returns:
+            callable to convert the distance to the desired units
+        """
         return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))
 
 
@@ -91,33 +113,32 @@ def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversio
 
     def __str__(self):
         return f"{self.energy}/{self.distance}"
 
-    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):
+    def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:
+        """
+        Get the conversion function to convert the force to the desired units.
+
+        Parameters:
+            energy: energy unit to convert to
+            distance: distance unit to convert to
+
+        Returns:
+            callable to convert the force to the desired units
+        """
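+        # The composition below converts the energy part and the inverse
+        # distance part separately, e.g. hartree/bohr -> kcal/mol/ang applies
+        # (hartree -> kcal/mol) then (bohr^-1 -> ang^-1).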
         return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))
 
 
 class Conversion:
     """
-    Conversion from one unit system to another.
-
-    Attributes
-    ----------
-    name
-        A human-readable name for the conversion
-    fn:
-        The callable to compute the conversion
+    Conversion from one unit system to another, defined by a name and a callable.
     """
 
     def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
         """
-        Parameters
-        ----------
-        in_unit
-            String defining the units of the current values
-        out_unit
-            String defining the target units
-        func
-            The callable to compute the conversion
+        Parameters:
+            in_unit: String defining the units of the current values
+            out_unit: String defining the target units
+            func: The callable to compute the conversion
         """
 
         name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
@@ -129,11 +150,20 @@ def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
         self.fn = func
 
     def __call__(self, x):
-        """Convert measure"""
         return self.fn(x)
 
 
-def get_conversion(in_unit: str, out_unit: str):
+def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:
+    """
+    Utility function to get the conversion function between two units.
+
+    Parameters:
+        in_unit: The input unit
+        out_unit: The output unit
+
+    Returns:
+        The conversion function
+    """
     name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
     if in_unit.lower().strip() == out_unit.lower().strip():
         return lambda x: x
@@ -142,6 +172,8 @@
     return CONVERSION_REGISTRY[name]
 
 
+# Conversion definitions
+
 # ev conversion
 Conversion("ev", "kcal/mol", lambda x: x * 23.0605)
 Conversion("ev", "hartree", lambda x: x * 0.0367493)
diff --git a/pyproject.toml b/pyproject.toml
index 43f414b8..d5e6a002 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,10 @@ dependencies = [
     "ase" ,
     "gdown",
     "h5py >= 3.8.0" ,
-    "dscribe"
+    "dscribe",
+    "zarr",
+    "python-dotenv",
+    "s3fs",
 ]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..0718b5fb
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,8 @@
+[pytest]
+tmp_path_retention_policy = none
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::UserWarning
+
+markers =
+    download: tests for datasets downloading
diff --git a/tests/test_download.py b/tests/test_download.py
new file mode 100644
index 00000000..dd53d405
--- /dev/null
+++ b/tests/test_download.py
@@ -0,0 +1,15 @@
+from os.path import join as p_join
+from pathlib import Path
+
+import pytest
+
+from openqdc.datasets import QM7
+
+
+@pytest.mark.download
+def test_API_download(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+    ds = QM7(cache_dir=tmp_path)
+    for filename in ["energies.mmap", "position_idx_range.mmap", "atomic_inputs.mmap", "props.pkl"]:
+        assert (Path(p_join(tmp_path, ds.preprocess_path, filename))).exists()
+    monkeypatch.undo()
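The download marker registered in pytest.ini lets CI include or exclude this network-dependent test by selection; to run just this test locally (assuming the remote endpoint is reachable):

    python -m pytest tests/test_download.py -m download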