diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 9194d2ce..1abf329f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -16,8 +16,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: ["3.9", "3.10", "3.11", "3.12"]
- os: ["ubuntu-latest"]
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ os: ["ubuntu-latest", "macos-latest"] #,"windows-latest"
runs-on: ${{ matrix.os }}
timeout-minutes: 30
@@ -53,5 +53,5 @@ jobs:
- name: Run tests
run: python -m pytest
- #- name: Test building the doc
- # run: mkdocs build
+ - name: Test building the doc
+ run: mkdocs build
diff --git a/.gitignore b/.gitignore
index 7a6dd93f..ffd7edf6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -149,3 +149,6 @@ cookie.txt
*.txt
*.sh
.DS_Store
+*.zarr/
+scripts/
+notebooks/
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..94a3b7e2
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,352 @@
+Creative Commons Attribution-NonCommercial 4.0 International
+
+Creative Commons Corporation ("Creative Commons") is not a law firm and
+does not provide legal services or legal advice. Distribution of
+Creative Commons public licenses does not create a lawyer-client or
+other relationship. Creative Commons makes its licenses and related
+information available on an "as-is" basis. Creative Commons gives no
+warranties regarding its licenses, any material licensed under their
+terms and conditions, or any related information. Creative Commons
+disclaims all liability for damages resulting from their use to the
+fullest extent possible.
+
+Using Creative Commons Public Licenses
+
+Creative Commons public licenses provide a standard set of terms and
+conditions that creators and other rights holders may use to share
+original works of authorship and other material subject to copyright and
+certain other rights specified in the public license below. The
+following considerations are for informational purposes only, are not
+exhaustive, and do not form part of our licenses.
+
+- Considerations for licensors: Our public licenses are intended for
+ use by those authorized to give the public permission to use
+ material in ways otherwise restricted by copyright and certain other
+ rights. Our licenses are irrevocable. Licensors should read and
+ understand the terms and conditions of the license they choose
+ before applying it. Licensors should also secure all rights
+ necessary before applying our licenses so that the public can reuse
+ the material as expected. Licensors should clearly mark any material
+ not subject to the license. This includes other CC-licensed
+ material, or material used under an exception or limitation to
+  copyright. More considerations for licensors:
+  wiki.creativecommons.org/Considerations_for_licensors
+
+- Considerations for the public: By using one of our public licenses,
+ a licensor grants the public permission to use the licensed material
+ under specified terms and conditions. If the licensor's permission
+ is not necessary for any reason–for example, because of any
+ applicable exception or limitation to copyright–then that use is not
+ regulated by the license. Our licenses grant only permissions under
+ copyright and certain other rights that a licensor has authority to
+ grant. Use of the licensed material may still be restricted for
+ other reasons, including because others have copyright or other
+ rights in the material. A licensor may make special requests, such
+ as asking that all changes be marked or described. Although not
+ required by our licenses, you are encouraged to respect those
+  requests where reasonable. More considerations for the public:
+  wiki.creativecommons.org/Considerations_for_licensees
+
+Creative Commons Attribution-NonCommercial 4.0 International Public
+License
+
+By exercising the Licensed Rights (defined below), You accept and agree
+to be bound by the terms and conditions of this Creative Commons
+Attribution-NonCommercial 4.0 International Public License ("Public
+License"). To the extent this Public License may be interpreted as a
+contract, You are granted the Licensed Rights in consideration of Your
+acceptance of these terms and conditions, and the Licensor grants You
+such rights in consideration of benefits the Licensor receives from
+making the Licensed Material available under these terms and conditions.
+
+- Section 1 – Definitions.
+
+ - a. Adapted Material means material subject to Copyright and
+ Similar Rights that is derived from or based upon the Licensed
+ Material and in which the Licensed Material is translated,
+ altered, arranged, transformed, or otherwise modified in a
+ manner requiring permission under the Copyright and Similar
+ Rights held by the Licensor. For purposes of this Public
+ License, where the Licensed Material is a musical work,
+ performance, or sound recording, Adapted Material is always
+ produced where the Licensed Material is synched in timed
+ relation with a moving image.
+ - b. Adapter's License means the license You apply to Your
+ Copyright and Similar Rights in Your contributions to Adapted
+ Material in accordance with the terms and conditions of this
+ Public License.
+ - c. Copyright and Similar Rights means copyright and/or similar
+ rights closely related to copyright including, without
+ limitation, performance, broadcast, sound recording, and Sui
+ Generis Database Rights, without regard to how the rights are
+ labeled or categorized. For purposes of this Public License, the
+ rights specified in Section 2(b)(1)-(2) are not Copyright and
+ Similar Rights.
+ - d. Effective Technological Measures means those measures that,
+ in the absence of proper authority, may not be circumvented
+ under laws fulfilling obligations under Article 11 of the WIPO
+ Copyright Treaty adopted on December 20, 1996, and/or similar
+ international agreements.
+ - e. Exceptions and Limitations means fair use, fair dealing,
+ and/or any other exception or limitation to Copyright and
+ Similar Rights that applies to Your use of the Licensed
+ Material.
+ - f. Licensed Material means the artistic or literary work,
+ database, or other material to which the Licensor applied this
+ Public License.
+ - g. Licensed Rights means the rights granted to You subject to
+ the terms and conditions of this Public License, which are
+ limited to all Copyright and Similar Rights that apply to Your
+ use of the Licensed Material and that the Licensor has authority
+ to license.
+ - h. Licensor means the individual(s) or entity(ies) granting
+ rights under this Public License.
+ - i. NonCommercial means not primarily intended for or directed
+ towards commercial advantage or monetary compensation. For
+ purposes of this Public License, the exchange of the Licensed
+ Material for other material subject to Copyright and Similar
+ Rights by digital file-sharing or similar means is NonCommercial
+ provided there is no payment of monetary compensation in
+ connection with the exchange.
+ - j. Share means to provide material to the public by any means or
+ process that requires permission under the Licensed Rights, such
+ as reproduction, public display, public performance,
+ distribution, dissemination, communication, or importation, and
+ to make material available to the public including in ways that
+ members of the public may access the material from a place and
+ at a time individually chosen by them.
+ - k. Sui Generis Database Rights means rights other than copyright
+ resulting from Directive 96/9/EC of the European Parliament and
+ of the Council of 11 March 1996 on the legal protection of
+ databases, as amended and/or succeeded, as well as other
+ essentially equivalent rights anywhere in the world.
+ - l. You means the individual or entity exercising the Licensed
+ Rights under this Public License. Your has a corresponding
+ meaning.
+
+- Section 2 – Scope.
+
+ - a. License grant.
+ - 1. Subject to the terms and conditions of this Public
+ License, the Licensor hereby grants You a worldwide,
+ royalty-free, non-sublicensable, non-exclusive, irrevocable
+ license to exercise the Licensed Rights in the Licensed
+ Material to:
+ - A. reproduce and Share the Licensed Material, in whole
+ or in part, for NonCommercial purposes only; and
+ - B. produce, reproduce, and Share Adapted Material for
+ NonCommercial purposes only.
+ - 2. Exceptions and Limitations. For the avoidance of doubt,
+ where Exceptions and Limitations apply to Your use, this
+ Public License does not apply, and You do not need to comply
+ with its terms and conditions.
+ - 3. Term. The term of this Public License is specified in
+ Section 6(a).
+ - 4. Media and formats; technical modifications allowed. The
+ Licensor authorizes You to exercise the Licensed Rights in
+ all media and formats whether now known or hereafter
+ created, and to make technical modifications necessary to do
+ so. The Licensor waives and/or agrees not to assert any
+ right or authority to forbid You from making technical
+ modifications necessary to exercise the Licensed Rights,
+ including technical modifications necessary to circumvent
+ Effective Technological Measures. For purposes of this
+ Public License, simply making modifications authorized by
+ this Section 2(a)(4) never produces Adapted Material.
+ - 5. Downstream recipients.
+ - A. Offer from the Licensor – Licensed Material. Every
+ recipient of the Licensed Material automatically
+ receives an offer from the Licensor to exercise the
+ Licensed Rights under the terms and conditions of this
+ Public License.
+ - B. No downstream restrictions. You may not offer or
+ impose any additional or different terms or conditions
+ on, or apply any Effective Technological Measures to,
+ the Licensed Material if doing so restricts exercise of
+ the Licensed Rights by any recipient of the Licensed
+ Material.
+ - 6. No endorsement. Nothing in this Public License
+ constitutes or may be construed as permission to assert or
+ imply that You are, or that Your use of the Licensed
+ Material is, connected with, or sponsored, endorsed, or
+ granted official status by, the Licensor or others
+ designated to receive attribution as provided in Section
+ 3(a)(1)(A)(i).
+ - b. Other rights.
+ - 1. Moral rights, such as the right of integrity, are not
+ licensed under this Public License, nor are publicity,
+ privacy, and/or other similar personality rights; however,
+ to the extent possible, the Licensor waives and/or agrees
+ not to assert any such rights held by the Licensor to the
+ limited extent necessary to allow You to exercise the
+ Licensed Rights, but not otherwise.
+ - 2. Patent and trademark rights are not licensed under this
+ Public License.
+ - 3. To the extent possible, the Licensor waives any right to
+ collect royalties from You for the exercise of the Licensed
+ Rights, whether directly or through a collecting society
+ under any voluntary or waivable statutory or compulsory
+ licensing scheme. In all other cases the Licensor expressly
+ reserves any right to collect such royalties, including when
+ the Licensed Material is used other than for NonCommercial
+ purposes.
+
+- Section 3 – License Conditions.
+
+ Your exercise of the Licensed Rights is expressly made subject to
+ the following conditions.
+
+ - a. Attribution.
+ - 1. If You Share the Licensed Material (including in modified
+ form), You must:
+ - A. retain the following if it is supplied by the
+ Licensor with the Licensed Material:
+ - i. identification of the creator(s) of the Licensed
+ Material and any others designated to receive
+ attribution, in any reasonable manner requested by
+ the Licensor (including by pseudonym if designated);
+ - ii. a copyright notice;
+ - iii. a notice that refers to this Public License;
+ - iv. a notice that refers to the disclaimer of
+ warranties;
+ - v. a URI or hyperlink to the Licensed Material to
+ the extent reasonably practicable;
+ - B. indicate if You modified the Licensed Material and
+ retain an indication of any previous modifications; and
+ - C. indicate the Licensed Material is licensed under this
+ Public License, and include the text of, or the URI or
+ hyperlink to, this Public License.
+ - 2. You may satisfy the conditions in Section 3(a)(1) in any
+ reasonable manner based on the medium, means, and context in
+ which You Share the Licensed Material. For example, it may
+ be reasonable to satisfy the conditions by providing a URI
+ or hyperlink to a resource that includes the required
+ information.
+ - 3. If requested by the Licensor, You must remove any of the
+ information required by Section 3(a)(1)(A) to the extent
+ reasonably practicable.
+ - 4. If You Share Adapted Material You produce, the Adapter's
+ License You apply must not prevent recipients of the Adapted
+ Material from complying with this Public License.
+
+- Section 4 – Sui Generis Database Rights.
+
+ Where the Licensed Rights include Sui Generis Database Rights that
+ apply to Your use of the Licensed Material:
+
+ - a. for the avoidance of doubt, Section 2(a)(1) grants You the
+ right to extract, reuse, reproduce, and Share all or a
+ substantial portion of the contents of the database for
+ NonCommercial purposes only;
+ - b. if You include all or a substantial portion of the database
+ contents in a database in which You have Sui Generis Database
+ Rights, then the database in which You have Sui Generis Database
+ Rights (but not its individual contents) is Adapted Material;
+ and
+ - c. You must comply with the conditions in Section 3(a) if You
+ Share all or a substantial portion of the contents of the
+ database.
+
+ For the avoidance of doubt, this Section 4 supplements and does not
+ replace Your obligations under this Public License where the
+ Licensed Rights include other Copyright and Similar Rights.
+
+- Section 5 – Disclaimer of Warranties and Limitation of Liability.
+
+ - a. Unless otherwise separately undertaken by the Licensor, to
+ the extent possible, the Licensor offers the Licensed Material
+ as-is and as-available, and makes no representations or
+ warranties of any kind concerning the Licensed Material, whether
+ express, implied, statutory, or other. This includes, without
+ limitation, warranties of title, merchantability, fitness for a
+ particular purpose, non-infringement, absence of latent or other
+ defects, accuracy, or the presence or absence of errors, whether
+ or not known or discoverable. Where disclaimers of warranties
+ are not allowed in full or in part, this disclaimer may not
+ apply to You.
+ - b. To the extent possible, in no event will the Licensor be
+ liable to You on any legal theory (including, without
+ limitation, negligence) or otherwise for any direct, special,
+ indirect, incidental, consequential, punitive, exemplary, or
+ other losses, costs, expenses, or damages arising out of this
+ Public License or use of the Licensed Material, even if the
+ Licensor has been advised of the possibility of such losses,
+ costs, expenses, or damages. Where a limitation of liability is
+ not allowed in full or in part, this limitation may not apply to
+ You.
+ - c. The disclaimer of warranties and limitation of liability
+ provided above shall be interpreted in a manner that, to the
+ extent possible, most closely approximates an absolute
+ disclaimer and waiver of all liability.
+
+- Section 6 – Term and Termination.
+
+ - a. This Public License applies for the term of the Copyright and
+ Similar Rights licensed here. However, if You fail to comply
+ with this Public License, then Your rights under this Public
+ License terminate automatically.
+ - b. Where Your right to use the Licensed Material has terminated
+ under Section 6(a), it reinstates:
+
+ - 1. automatically as of the date the violation is cured,
+ provided it is cured within 30 days of Your discovery of the
+ violation; or
+ - 2. upon express reinstatement by the Licensor.
+
+ For the avoidance of doubt, this Section 6(b) does not affect
+ any right the Licensor may have to seek remedies for Your
+ violations of this Public License.
+
+ - c. For the avoidance of doubt, the Licensor may also offer the
+ Licensed Material under separate terms or conditions or stop
+ distributing the Licensed Material at any time; however, doing
+ so will not terminate this Public License.
+ - d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
+ License.
+
+- Section 7 – Other Terms and Conditions.
+
+ - a. The Licensor shall not be bound by any additional or
+ different terms or conditions communicated by You unless
+ expressly agreed.
+ - b. Any arrangements, understandings, or agreements regarding the
+ Licensed Material not stated herein are separate from and
+ independent of the terms and conditions of this Public License.
+
+- Section 8 – Interpretation.
+
+ - a. For the avoidance of doubt, this Public License does not, and
+ shall not be interpreted to, reduce, limit, restrict, or impose
+ conditions on any use of the Licensed Material that could
+ lawfully be made without permission under this Public License.
+ - b. To the extent possible, if any provision of this Public
+ License is deemed unenforceable, it shall be automatically
+ reformed to the minimum extent necessary to make it enforceable.
+ If the provision cannot be reformed, it shall be severed from
+ this Public License without affecting the enforceability of the
+ remaining terms and conditions.
+ - c. No term or condition of this Public License will be waived
+ and no failure to comply consented to unless expressly agreed to
+ by the Licensor.
+ - d. Nothing in this Public License constitutes or may be
+ interpreted as a limitation upon, or waiver of, any privileges
+ and immunities that apply to the Licensor or You, including from
+ the legal processes of any jurisdiction or authority.
+
+Creative Commons is not a party to its public licenses. Notwithstanding,
+Creative Commons may elect to apply one of its public licenses to
+material it publishes and in those instances will be considered the
+"Licensor." The text of the Creative Commons public licenses is
+dedicated to the public domain under the CC0 Public Domain Dedication.
+Except for the limited purpose of indicating that material is shared
+under a Creative Commons public license or as otherwise permitted by the
+Creative Commons policies published at creativecommons.org/policies,
+Creative Commons does not authorize the use of the trademark "Creative
+Commons" or any other trademark or logo of Creative Commons without its
+prior written consent including, without limitation, in connection with
+any unauthorized modifications to any of its public licenses or any
+other arrangements, understandings, or agreements concerning use of
+licensed material. For the avoidance of doubt, this paragraph does not
+form part of the public licenses.
+
+Creative Commons may be contacted at creativecommons.org.
diff --git a/docs/API/available_datasets.md b/docs/API/available_datasets.md
deleted file mode 100644
index fa630b8a..00000000
--- a/docs/API/available_datasets.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Available Datasets
-
-::: openqdc.datasets
diff --git a/docs/API/basedataset.md b/docs/API/basedataset.md
new file mode 100644
index 00000000..cdaeee77
--- /dev/null
+++ b/docs/API/basedataset.md
@@ -0,0 +1 @@
+::: openqdc.datasets.base
diff --git a/docs/API/datasets/alchemy.md b/docs/API/datasets/alchemy.md
new file mode 100644
index 00000000..096774c3
--- /dev/null
+++ b/docs/API/datasets/alchemy.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.alchemy
diff --git a/docs/API/datasets/ani.md b/docs/API/datasets/ani.md
new file mode 100644
index 00000000..4f79f587
--- /dev/null
+++ b/docs/API/datasets/ani.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.ani
diff --git a/docs/API/datasets/comp6.md b/docs/API/datasets/comp6.md
new file mode 100644
index 00000000..e473e211
--- /dev/null
+++ b/docs/API/datasets/comp6.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.comp6
diff --git a/docs/API/datasets/des.md b/docs/API/datasets/des.md
new file mode 100644
index 00000000..dbff5035
--- /dev/null
+++ b/docs/API/datasets/des.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.des
diff --git a/docs/API/datasets/gdml.md b/docs/API/datasets/gdml.md
new file mode 100644
index 00000000..a91cf993
--- /dev/null
+++ b/docs/API/datasets/gdml.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.gdml
diff --git a/docs/API/datasets/geom.md b/docs/API/datasets/geom.md
new file mode 100644
index 00000000..f290eb93
--- /dev/null
+++ b/docs/API/datasets/geom.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.geom.GEOM
diff --git a/docs/API/datasets/iso_17.md b/docs/API/datasets/iso_17.md
new file mode 100644
index 00000000..01a04e67
--- /dev/null
+++ b/docs/API/datasets/iso_17.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.iso_17
diff --git a/docs/API/datasets/l7.md b/docs/API/datasets/l7.md
new file mode 100644
index 00000000..512e7f37
--- /dev/null
+++ b/docs/API/datasets/l7.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.l7
diff --git a/docs/API/datasets/md22.md b/docs/API/datasets/md22.md
new file mode 100644
index 00000000..d793b5cf
--- /dev/null
+++ b/docs/API/datasets/md22.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.md22
diff --git a/docs/API/datasets/metcalf.md b/docs/API/datasets/metcalf.md
new file mode 100644
index 00000000..58566b02
--- /dev/null
+++ b/docs/API/datasets/metcalf.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.metcalf
diff --git a/docs/API/datasets/molecule3d.md b/docs/API/datasets/molecule3d.md
new file mode 100644
index 00000000..d7b6a5a4
--- /dev/null
+++ b/docs/API/datasets/molecule3d.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.molecule3d
diff --git a/docs/API/datasets/multixcqm9.md b/docs/API/datasets/multixcqm9.md
new file mode 100644
index 00000000..55993cd7
--- /dev/null
+++ b/docs/API/datasets/multixcqm9.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.multixcqm9
diff --git a/docs/API/datasets/nabladft.md b/docs/API/datasets/nabladft.md
new file mode 100644
index 00000000..a69d68d7
--- /dev/null
+++ b/docs/API/datasets/nabladft.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.nabladft
diff --git a/docs/API/datasets/orbnet_denali.md b/docs/API/datasets/orbnet_denali.md
new file mode 100644
index 00000000..1b4ee6a7
--- /dev/null
+++ b/docs/API/datasets/orbnet_denali.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.orbnet_denali
diff --git a/docs/API/datasets/pcqm.md b/docs/API/datasets/pcqm.md
new file mode 100644
index 00000000..6cd1b92b
--- /dev/null
+++ b/docs/API/datasets/pcqm.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.pcqm
diff --git a/docs/API/datasets/proteinfragments.md b/docs/API/datasets/proteinfragments.md
new file mode 100644
index 00000000..d5aa28a5
--- /dev/null
+++ b/docs/API/datasets/proteinfragments.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.proteinfragments
diff --git a/docs/API/datasets/qm1b.md b/docs/API/datasets/qm1b.md
new file mode 100644
index 00000000..b92dfff4
--- /dev/null
+++ b/docs/API/datasets/qm1b.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.qm1b
diff --git a/docs/API/datasets/qm7x.md b/docs/API/datasets/qm7x.md
new file mode 100644
index 00000000..d649b40d
--- /dev/null
+++ b/docs/API/datasets/qm7x.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.qm7x
diff --git a/docs/API/datasets/qmugs.md b/docs/API/datasets/qmugs.md
new file mode 100644
index 00000000..06773b68
--- /dev/null
+++ b/docs/API/datasets/qmugs.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.qmugs
diff --git a/docs/API/datasets/qmx.md b/docs/API/datasets/qmx.md
new file mode 100644
index 00000000..b7343767
--- /dev/null
+++ b/docs/API/datasets/qmx.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.qmx
diff --git a/docs/API/datasets/revmd17.md b/docs/API/datasets/revmd17.md
new file mode 100644
index 00000000..e63ba031
--- /dev/null
+++ b/docs/API/datasets/revmd17.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.revmd17
diff --git a/docs/API/datasets/sn2_rxn.md b/docs/API/datasets/sn2_rxn.md
new file mode 100644
index 00000000..9095532c
--- /dev/null
+++ b/docs/API/datasets/sn2_rxn.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.sn2_rxn
diff --git a/docs/API/datasets/solvated_peptides.md b/docs/API/datasets/solvated_peptides.md
new file mode 100644
index 00000000..a6139c12
--- /dev/null
+++ b/docs/API/datasets/solvated_peptides.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.solvated_peptides
diff --git a/docs/API/datasets/spice.md b/docs/API/datasets/spice.md
new file mode 100644
index 00000000..c0e95b79
--- /dev/null
+++ b/docs/API/datasets/spice.md
@@ -0,0 +1,2 @@
+
+::: openqdc.datasets.potential.spice
diff --git a/docs/API/datasets/splinter.md b/docs/API/datasets/splinter.md
new file mode 100644
index 00000000..00789cfa
--- /dev/null
+++ b/docs/API/datasets/splinter.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.splinter
diff --git a/docs/API/datasets/tmqm.md b/docs/API/datasets/tmqm.md
new file mode 100644
index 00000000..70b56781
--- /dev/null
+++ b/docs/API/datasets/tmqm.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.tmqm
diff --git a/docs/API/datasets/transition1x.md b/docs/API/datasets/transition1x.md
new file mode 100644
index 00000000..63eceaa3
--- /dev/null
+++ b/docs/API/datasets/transition1x.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.transition1x
diff --git a/docs/API/datasets/vqm24.md b/docs/API/datasets/vqm24.md
new file mode 100644
index 00000000..ed117b9f
--- /dev/null
+++ b/docs/API/datasets/vqm24.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.vqm24
diff --git a/docs/API/datasets/waterclusters.md b/docs/API/datasets/waterclusters.md
new file mode 100644
index 00000000..f1f90883
--- /dev/null
+++ b/docs/API/datasets/waterclusters.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.waterclusters
diff --git a/docs/API/datasets/waterclusters3_30.md b/docs/API/datasets/waterclusters3_30.md
new file mode 100644
index 00000000..3f0ccf7f
--- /dev/null
+++ b/docs/API/datasets/waterclusters3_30.md
@@ -0,0 +1 @@
+::: openqdc.datasets.potential.waterclusters3_30
diff --git a/docs/API/datasets/x40.md b/docs/API/datasets/x40.md
new file mode 100644
index 00000000..799738c5
--- /dev/null
+++ b/docs/API/datasets/x40.md
@@ -0,0 +1 @@
+::: openqdc.datasets.interaction.x40
diff --git a/docs/API/formats.md b/docs/API/formats.md
new file mode 100644
index 00000000..fab98169
--- /dev/null
+++ b/docs/API/formats.md
@@ -0,0 +1 @@
+::: openqdc.datasets.structure
diff --git a/docs/API/methods.md b/docs/API/methods.md
index 7814334e..ce1d94a4 100644
--- a/docs/API/methods.md
+++ b/docs/API/methods.md
@@ -1,3 +1,7 @@
# QM Methods
-::: openqdc.methods
+::: openqdc.methods.enums
+
+# Isolated Atom Energies
+
+::: openqdc.methods.atom_energies
diff --git a/docs/API/regressor.md b/docs/API/regressor.md
new file mode 100644
index 00000000..dff0ad98
--- /dev/null
+++ b/docs/API/regressor.md
@@ -0,0 +1 @@
+::: openqdc.utils.regressor
diff --git a/docs/API/units.md b/docs/API/units.md
new file mode 100644
index 00000000..0401bdc4
--- /dev/null
+++ b/docs/API/units.md
@@ -0,0 +1,3 @@
+# UNITS
+
+::: openqdc.utils.units
diff --git a/docs/API/utils.md b/docs/API/utils.md
new file mode 100644
index 00000000..35fae5c8
--- /dev/null
+++ b/docs/API/utils.md
@@ -0,0 +1 @@
+::: openqdc.utils
diff --git a/docs/_overrides/main.html b/docs/_overrides/main.html
deleted file mode 100644
index 2eafd76b..00000000
--- a/docs/_overrides/main.html
+++ /dev/null
@@ -1,46 +0,0 @@
-{% extends "base.html" %}
-
-{% block content %}
-{{ super() }}
-
-
-{% endblock content %}
diff --git a/docs/assets/StorageView.png b/docs/assets/StorageView.png
new file mode 100644
index 00000000..8d398926
Binary files /dev/null and b/docs/assets/StorageView.png differ
diff --git a/docs/assets/qdc_logo.png b/docs/assets/qdc_logo.png
new file mode 100644
index 00000000..a8138dcc
Binary files /dev/null and b/docs/assets/qdc_logo.png differ
diff --git a/docs/cli.md b/docs/cli.md
new file mode 100644
index 00000000..8e2cd53b
--- /dev/null
+++ b/docs/cli.md
@@ -0,0 +1,113 @@
+# CLI for dataset downloading and uploading
+You can quickly download, fetch, preprocess and upload openQDC datasets using the command line interface (CLI).
+
+## Datasets
+Print a formatted table of the available openQDC datasets and some information.
+
+Usage:
+
+ openqdc datasets [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
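+
+Example:
+
+    openqdc datasets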
+
+## Cache
+Get the current local cache path of openQDC.
+
+Usage:
+
+ openqdc cache [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
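+
+Example:
+
+    openqdc cache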
+
+
+## Download
+Download preprocessed ML-ready datasets from the main openQDC hub.
+
+Usage:
+
+ openqdc download DATASETS... [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
+ --overwrite Whether to force the re-download of the datasets and overwrite the current cached dataset. [default: no-overwrite]
+ --cache-dir Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]
+ --as-zarr Whether to use a zarr format for the datasets instead of memmap. [default: no-as-zarr]
+    --gs            Which source to use for downloading. If True, Google Storage will be used; otherwise, AWS S3 will be used. [default: no-gs]
+
+Example:
+
+ openqdc download Spice
+
+## Fetch
+Download the raw dataset files from the main openQDC hub.
+
+Note:
+
+    Special cases: "all" downloads every available dataset, "potential" downloads all potential datasets, and "interaction" downloads all interaction datasets.
+
+Usage:
+
+ openqdc fetch DATASETS... [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
+ --overwrite Whether to overwrite or force the re-download of the raw files. [default: no-overwrite]
+ --cache-dir Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used. [default: None]
+
+Example:
+
+ openqdc fetch Spice
+
+## Preprocess
+Preprocess a raw dataset (previously fetched) into an openQDC dataset and optionally push it to remote.
+
+Usage:
+
+ openqdc preprocess DATASETS... [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
+ --overwrite Whether to overwrite the current cached datasets. [default: overwrite]
+ --upload Whether to attempt the upload to the remote storage. Must have write permissions. [default: no-upload]
+ --as-zarr Whether to preprocess as a zarr format or a memmap format. [default: no-as-zarr]
+
+Example:
+
+ openqdc preprocess Spice QMugs
+
+## Upload
+Upload a preprocessed dataset to the remote storage.
+
+Usage:
+
+ openqdc upload DATASETS... [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
+ --overwrite Whether to overwrite the remote files if they are present. [default: overwrite]
+ --as-zarr Whether to upload the zarr files if available. [default: no-as-zarr]
+
+Example:
+
+ openqdc upload Spice --overwrite
+
+## Convert
+Convert a preprocessed dataset from a memmap dataset to a zarr dataset.
+
+Usage:
+
+ openqdc convert DATASETS... [OPTIONS]
+
+Options:
+
+ --help Show this message and exit.
+ --overwrite Whether to overwrite the current zarr cached datasets. [default: no-overwrite]
+ --download Whether to force the re-download of the memmap datasets. [default: no-download]
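+
+Example:
+
+    openqdc convert Spice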
diff --git a/docs/contribute.md b/docs/contribute.md
new file mode 100644
index 00000000..e0e22721
--- /dev/null
+++ b/docs/contribute.md
@@ -0,0 +1,59 @@
+# Contribute
+
+The following documents the development lifecycle of OpenQDC.
+
+## Setup a dev environment
+
+```bash
+mamba env create -n openqdc -f env.yml
+mamba activate openqdc
+pip install -e .
+```
+
+## Pre commit installation
+
+```bash
+pre-commit install
+pre-commit run --all-files
+```
+
+## Continuous Integration
+
+OpenQDC uses GitHub Actions to:
+
+- **Build and test** `openQDC`.
+ - Multiple combinations of OS and Python versions are tested.
+- **Check** the code:
+ - Formatting with `black`.
+ - Static type check with `mypy`.
+ - Modules import formatting with `isort`.
+ - Pre-commit hooks.
+- **Documentation**:
+ - Google docstring format.
+  - Build and deploy the documentation on `main` and for every new git tag.
+
+
+## Run tests
+
+```bash
+pytest
+```
+
+## Build the documentation
+
+You can build and serve the documentation locally with:
+
+```bash
+# Build and serve the doc
+mike serve
+```
+
+or with
+
+```bash
+mkdocs serve
+```
+
+### Multi-versioning
+
+The documentation is built for each push on `main` and for every git tag using [mike](https://github.com/jimporter/mike). Everything is automated using GitHub Actions. Please refer to the official mike documentation for details.
diff --git a/docs/css/custom-openqdc.css b/docs/css/custom-openqdc.css
new file mode 100644
index 00000000..a1d97cf7
--- /dev/null
+++ b/docs/css/custom-openqdc.css
@@ -0,0 +1,92 @@
+:root {
+    --openqdc-primary: #201342;
+ --openqdc-secondary: #4A1E7E;
+
+ /* Primary color shades */
+ --md-primary-fg-color: var(--openqdc-primary);
+ --md-primary-fg-color--light: var(--openqdc-primary);
+ --md-primary-fg-color--dark: var(--openqdc-primary);
+ --md-primary-bg-color: var(--openqdc-secondary);
+ --md-primary-bg-color--light: var(--openqdc-secondary);
+ --md-text-link-color: var(--openqdc-secondary);
+
+ /* Accent color shades */
+ --md-accent-fg-color: var(--openqdc-secondary);
+ --md-accent-fg-color--transparent: var(--openqdc-secondary);
+ --md-accent-bg-color: var(--openqdc-secondary);
+ --md-accent-bg-color--light: var(--openqdc-secondary);
+ }
+
+ :root>* {
+ /* Code block color shades */
+ --md-code-bg-color: hsla(0, 0%, 96%, 1);
+ --md-code-fg-color: hsla(200, 18%, 26%, 1);
+
+ /* Footer */
+ --md-footer-bg-color: var(--openqdc-primary);
+ /* --md-footer-bg-color--dark: hsla(0, 0%, 0%, 0.32); */
+ --md-footer-fg-color: var(--openqdc-secondary);
+ --md-footer-fg-color--light: var(--openqdc-secondary);
+ --md-footer-fg-color--lighter: var(--openqdc-secondary);
+
+ }
+
+ .md-header {
+ background-image: linear-gradient(to right, #131036, #4A1E7E);
+ }
+
+ .md-footer {
+ background-image: linear-gradient(to right, #131036, #4A1E7E);
+ }
+
+ .md-tabs {
+ background-image: linear-gradient(to right, #F4F6F9, #b39bce);
+ }
+
+ .md-header__topic {
+ color: rgb(255, 255, 255);
+ }
+
+ .md-source__repository,
+ .md-source__icon,
+ .md-search__input,
+ .md-search__input::placeholder,
+ .md-search__input~.md-search__icon,
+ .md-footer__inner.md-grid,
+ .md-copyright__highlight,
+ .md-copyright,
+ .md-footer-meta.md-typeset a,
+ .md-version {
+ color: rgb(255, 255, 255) !important;
+ }
+
+ .md-search__form {
+ background-color: rgba(255, 255, 255, 0.2);
+ }
+
+ .md-search__input {
+ color: #222222 !important;
+ }
+
+ .md-header__topic {
+ color: rgb(255, 255, 255);
+ font-size: 1.4em;
+ }
+
+ /* Increase the size of the logo */
+ .md-header__button.md-logo img,
+ .md-header__button.md-logo svg {
+ height: 2rem !important;
+ }
+
+ /* Reduce the margin around the logo */
+ .md-header__button.md-logo {
+ margin: 0.4em;
+ padding: 0.4em;
+ }
+
+ /* Remove the `In` and `Out` block in rendered Jupyter notebooks */
+ .md-container .jp-Cell-outputWrapper .jp-OutputPrompt.jp-OutputArea-prompt,
+ .md-container .jp-Cell-inputWrapper .jp-InputPrompt.jp-InputArea-prompt {
+ display: none !important;
+ }
diff --git a/docs/data_storage.md b/docs/data_storage.md
new file mode 100644
index 00000000..b24bec3b
--- /dev/null
+++ b/docs/data_storage.md
@@ -0,0 +1,33 @@
+## Dataset structure
+
+For a dataset with N geometries, M atoms across all geometries, ne energy labels,
+and nf force labels, we use zarr or memory-mapped arrays of various sizes:
+
+- (M, 5) for atomic numbers (1),
+charges (1), and positions (3) of individual geometries;
+
+- (N, 2) for the beginning and end indices of
+each geometry in the previous array;
+
+- (N, ne) for the energy labels of each geometry, extendable to
+store other geometry-level QM properties such as HOMO-LUMO gap;
+
+- (M, nf, 3) for the force labels
+of each geometry, extendable to store other atom-level QM properties.
+
+
+The memory-mapped files efficiently access data stored on disk or in the cloud without reading
+them into memory, enabling training on machines with smaller RAM than the dataset size and
+accommodating concurrent reads in multi-GPU training. This allows for very efficient indexing,
+batching and iteration.
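+
+As an illustration, here is a minimal sketch of this access pattern with NumPy memmaps (shapes and file names are hypothetical, not the actual openQDC on-disk layout):
+
+```python
+import numpy as np
+
+# Hypothetical toy dataset: 3 geometries, 7 atoms in total, 1 energy label.
+# Each atom row stores (atomic_number, charge, x, y, z).
+atomic_inputs = np.memmap("atomic_inputs.mmap", dtype=np.float32, mode="w+", shape=(7, 5))
+energies = np.memmap("energies.mmap", dtype=np.float64, mode="w+", shape=(3, 1))
+position_idx = np.array([[0, 3], [3, 5], [5, 7]])  # (N, 2) start/end rows per geometry
+
+# Fetch geometry 1 lazily: only the needed rows are read from disk.
+start, end = position_idx[1]
+geometry = atomic_inputs[start:end]  # (2, 5) slice
+energy = energies[1]
+```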
+
+![](assets/StorageView.png)
+
+
+## Formats
+
+We currently support the following formats:
+
+1) Zarr: https://zarr.readthedocs.io/en/stable/index.html
+
+2) Memmap: https://numpy.org/doc/stable/index.html
diff --git a/docs/dataset_upload.md b/docs/dataset_upload.md
new file mode 100644
index 00000000..e4740f75
--- /dev/null
+++ b/docs/dataset_upload.md
@@ -0,0 +1,69 @@
+# How to Add a Dataset to OpenQDC
+
+Do you think that OpenQDC is missing some important dataset? Do you think your dataset would be a good fit for OpenQDC?
+If so, you can contribute to OpenQDC by adding your dataset to the OpenQDC repository in two ways:
+
+1. Opening a PR to add a new dataset
+2. Requesting a new dataset through the Google Form
+
+## OpenQDC PR Guidelines
+
+Implement your dataset in the OpenQDC repository by following the guidelines below:
+
+### Dataset class
+
+- The dataset class should be implemented in the `openqdc/datasets` directory.
+- The dataset class should inherit from the `openqdc.datasets.base.BaseDataset` class.
+- Add your `dataset.py` file to the `openqdc/datasets/potential` or `openqdc/datasets/interaction/` directory based on the type of energy.
+- Implement the following for your dataset:
+ - Add the metadata of the dataset:
+    - Docstrings for the dataset class. Docstrings should report links and references to the dataset, a short description and, if possible, the sampling strategy used to generate the dataset.
+ - `__links__`: Dictionary of name and link to download the dataset.
+ - `__name__`: Name of the dataset. This will create a folder with the name of the dataset in the cache directory.
+ - The original units for the dataset `__energy_unit__` and `__distance_unit__`.
+    - `__force_mask__`: Boolean indicating whether the dataset has forces, or a list of booleans if multiple force labels are present.
+ - `__energy_methods__`: List of the `QmMethod` methods present in the dataset.
+  - `read_raw_entries(self)` -> `List[Dict[str, Any]]`: Preprocess the raw dataset and return a list of dictionaries containing the data. For a better overview of the data format, see the data storage page. Each entry should have the following keys:
+    - `atomic_inputs`: Atomic numbers, charges and positions of the atoms in the molecule. numpy.Float32.
+    - `name`: Name of the molecule. numpy.Object.
+    - `subset`: Subset the molecule belongs to. numpy.Object.
+    - `energies`: Energies of the molecule. numpy.Float64.
+    - `n_atoms`: Number of atoms in the molecule. numpy.Int32.
+    - `forces`: Forces of the molecule. [Optional] numpy.Float32.
+- Add the dataset import to the `openqdc/datasets/potential/__init__.py` or `openqdc/datasets/interaction/__init__.py` file and to `openqdc/__init__.py`.
+
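+A minimal class skeleton, as a sketch only (the class name, units, method list and link below are illustrative placeholders, not a real dataset):
+
+```python
+from typing import Any, Dict, List
+
+from openqdc.datasets.base import BaseDataset
+
+
+class MyDataset(BaseDataset):
+    """Hypothetical dataset. Put links, references and a short description here."""
+
+    __name__ = "mydataset"  # cache folder name
+    __energy_unit__ = "hartree"  # original energy unit
+    __distance_unit__ = "ang"  # original distance unit
+    __force_mask__ = False  # no force labels in this sketch
+    __energy_methods__ = []  # fill with the QmMethod entries used by the dataset
+    __links__ = {"mydataset.zip": "https://example.com/mydataset.zip"}
+
+    def read_raw_entries(self) -> List[Dict[str, Any]]:
+        # Parse the fetched raw files and return one dict per molecule with the
+        # keys listed above (atomic_inputs, name, subset, energies, n_atoms, ...).
+        raise NotImplementedError
+```
+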
+### Test the dataset
+
+Try to run the openQDC CLI pipeline with the dataset you implemented.
+
+Run the following commands to test the dataset:
+
+- Fetch the dataset files
+```bash
+openqdc fetch DATASET_NAME
+```
+
+- Preprocess the dataset
+```bash
+openqdc preprocess DATASET_NAME
+```
+
+- Load it in Python and check that the dataset is correctly loaded.
+```python
+from openqdc import DATASET_NAME
+ds = DATASET_NAME()
+```
+
+If the dataset is correctly loaded, you can open a PR to add the dataset to OpenQDC.
+
+- Select the `dataset` label for your PR.
+
+Our team will review your PR and provide feedback if necessary. If everything is correct, your dataset will be added to OpenQDC remote storage.
+
+## OpenQDC Google Form
+
+Alternatively, you can ask the OpenQDC main development team to take care of the dataset upload for you.
+You can fill out the Google Form [here](https://docs.google.com/forms/d/e/1FAIpQLSeh0YHRn-OoqPpUbrL7G-EOu3LtZC24rtQWwbjJaZ-2V8P2vQ/viewform?usp=sf_link).
+
+The openQDC team strives to provide high-quality curation and uploads, so please be patient while the team reviews the dataset and carries out the necessary steps to ensure it is uploaded correctly.
diff --git a/docs/index.md b/docs/index.md
index 264211f1..db497b10 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,30 +1,65 @@
-# openQDC
+# Overview
-Open Quantum Data Commons
+OpenQDC is a Python library for working with quantum datasets. It provides a simple and efficient way to download, load and utilize various datasets, and it standardizes the data for easy use in machine learning models.
-## Setup Datasets
+- 🐍 Simple pythonic API
+- 🕹️ ML-Ready: all you manipulate are `torch.Tensor`, `jax.Array` or `numpy.Array` objects.
+- ⚛️ Quantum Ready: The quantum methods are checked and standardized to provide additional value.
+- ✅ Standardized: The datasets are written in standard and performant formats with annotated metadata like units and labels.
+- 🧠 Performance matters: read and write multiple formats (memmap, zarr, xyz, etc.).
+- 📈 Data: access to 1.5+ billion datapoints.
-Use the scripts in `setup/` to download the datasets. For more information, see the [README](setup/README.md) in the `setup/` directory.
+Visit our website at TOFILL.
+
+## Installation
+
+Use mamba:
-# Install the library in dev mode
```bash
-# Install the deps
-mamba env create -n qdc -f env.yml
+mamba install -c conda-forge openqdc
+```
-# Activate the environment
-mamba activate qdc
+_**Tip:** You can replace `mamba` with `conda`._
-# Install the qdc library in dev mode
-pip install -e .
+_**Note:** We highly recommend using a [Conda Python distribution](https://github.com/conda-forge/miniforge) to install OpenQDC. The package is also pip installable if you need it: `pip install openqdc`._
-```
+## Quick API Tour
-## Development lifecycle
+```python
+from openqdc import Spice
-### Tests
+# Load the original dataset
+dataset = Spice()
-You can run tests locally with:
+# Load the dataset with different units
+dataset = Spice(
+ energy_unit = "kcal/mol",
+ distance_unit = "ang",
+ energy_type = "formation",
+ array_format = "torch"
+)
-```bash
-pytest .
+# Access the data
+data = dataset[0]
+
+# Get relevant statistics
+dataset.get_statistics()
+
+# Get dataset metadata
+dataset.average_n_atoms
+dataset.chemical_species
+dataset.charges
+
+# Compute physical descriptors
+dataset.calculate_descriptors(
+ descriptor_name = "soap"
+)
```
+
+## How to cite
+
+Please cite OpenQDC if you use it in your research: [![DOI](zenodo_badge)](zenodo_link).
+
+## Compatibilities
+
+OpenQDC is compatible with Python >= 3.8 and is tested on Linux, MacOS and Windows.
diff --git a/docs/licensing.md b/docs/licensing.md
new file mode 100644
index 00000000..ec5a3857
--- /dev/null
+++ b/docs/licensing.md
@@ -0,0 +1,3 @@
+```
+{!LICENSE!}
+```
diff --git a/docs/normalization_e0s.md b/docs/normalization_e0s.md
new file mode 100644
index 00000000..426e7d0d
--- /dev/null
+++ b/docs/normalization_e0s.md
@@ -0,0 +1,38 @@
+# Overview of QM Methods and Normalization
+
+OpenQDC provides support for 250+ QM methods and offers a way to standardize and categorize
+the different levels of theory used for quantum mechanics single-point calculations,
+adding value and information to the datasets.
+
+## Level of Theory
+
+To avoid inconsistencies, levels of theory are standardized and categorized into Python Enums
+consisting of a functional, a basis set, and a correction method.
+OpenQDC covers more than 106 functionals, 20 basis sets, and 11
+correction methods.
+OpenQDC also provides precomputed isolated atom energies `e0` for each QM method.
+
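+For example, a sketch of selecting a standardized level of theory (the import path and member name are assumptions; the actual enums live in `openqdc.methods.enums`):
+
+```python
+from openqdc.methods import PotentialMethod  # assumed import path
+
+# A level of theory combines a functional and a basis set (plus optional corrections).
+method = PotentialMethod.WB97X_6_31G_D  # assumed member name
+print(method)
+```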
+
+## Normalization
+
+
+We support "physical" and "regression" energy normalization schemes that conserve the size extensivity of chemical systems.
+Through physical normalization, OpenQDC transforms the potential energy into atomization energy by subtracting the isolated
+atom energies `e0`, a physically interpretable and extensivity-conserving normalization method. Alternatively, we pre-compute
+the average contribution of each atom species to the potential energy via linear or ridge regression, centering the
+distribution at 0 and providing uncertainty estimation for the computed values. Predicted atomic energies can also be
+scaled to approximate a standard normal distribution.
+
+### Physical Normalization
+
+`e0` energies are calculated for each atom in the dataset at the appropriate level of theory and then subtracted from
+the potential energy to obtain the atomization energy. This normalization method is physically interpretable and
+only removes the atom energy contribution from the potential energy.
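+
+Schematically, for a system with atoms of species $Z_i$:
+
+$$
+E_{\text{atomization}} = E_{\text{pot}} - \sum_{i} e_0(Z_i)
+$$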
+
+
+### Regression Normalization
+
+`e0` energies are calculated for each atom in the dataset by fitting a regression model to the potential energy.
+The `e0` energies are then subtracted from the potential energy to obtain the atomization energy. This normalization
+provides uncertainty estimation for the computed values and removes part of the interatomic energy contribution from the potential energy.
+The resulting formation energy is centered at 0.
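+
+Schematically, the per-species `e0` values solve a least-squares problem over the dataset (ridge regression adds an $\ell_2$ penalty):
+
+$$
+\min_{e_0} \sum_{j} \Big( E_j - \sum_{s} n_{j,s}\, e_{0,s} \Big)^2
+$$
+
+where $E_j$ is the potential energy of system $j$ and $n_{j,s}$ is the number of atoms of species $s$ it contains.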
diff --git a/docs/usage.md b/docs/usage.md
new file mode 100644
index 00000000..af62f453
--- /dev/null
+++ b/docs/usage.md
@@ -0,0 +1,42 @@
+# Usage
+
+## How to use
+
+OpenQDC has been designed to be used with a single import:
+
+```python
+import openqdc as qdc
+dataset = qdc.QM9()
+```
+
+All `openQDC` functions are available under `qdc`.
+Or if you want to directly import a specific dataset:
+
+```python
+from openqdc import Spice
+# Spice dataset with distance unit in angstrom instead of bohr
+dataset = Spice(
+    distance_unit="ang",
+    array_format="jax",
+)
+dataset[0] # dict of jax array
+```
+
+Or if you prefer handling `ase.Atoms` objects:
+
+```python
+dataset.get_ase_atoms(0)
+```
+
+## Iterators
+
+OpenQDC provides a simple way to get the data as iterators:
+
+```python
+for data in dataset.as_iter(atoms=True):
+ print(data) # Atoms object
+ break
+```
+
+## Lazy loading
+
+OpenQDC uses lazy loading to dynamically expose all its API without imposing a long import time during `import openqdc as qdc`. In case of trouble you can always disable lazy loading by setting the environment variable `OPENQDC_DISABLE_LAZY_LOADING` to `1`.
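+
+For example:
+
+```bash
+export OPENQDC_DISABLE_LAZY_LOADING=1
+python -c "import openqdc"  # modules are now imported eagerly
+```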
diff --git a/env.yml b/env.yml
index 16ccc3c2..87a9ccac 100644
--- a/env.yml
+++ b/env.yml
@@ -11,10 +11,15 @@ dependencies:
- gcsfs
- typer
- prettytable
+ - s3fs
+ - pydantic
+ - python-dotenv
+
# Scientific
- pandas
- numpy
+ - zarr
# Chem
- datamol #==0.9.0
@@ -36,6 +41,7 @@ dependencies:
- ruff
# Doc
+ - mike
- mkdocs
- mkdocs-material
- mkdocs-material-extensions
diff --git a/mkdocs.yml b/mkdocs.yml
index caac43c9..fdb8856a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,39 +1,91 @@
-site_name: "Open Quantum Data Commons (openQDC)"
+site_name: "OpenQDC"
site_description: "I don't know... Something about data and Quantum stuff I guess :D"
-site_url: "https://github.com/OpenDrugDiscovery/openQDC"
repo_url: "https://github.com/OpenDrugDiscovery/openQDC"
repo_name: "openQDC"
copyright: Copyright 2023 Valence Labs
+site_url: "https://github.com/OpenDrugDiscovery/openQDC"
remote_branch: "privpage"
use_directory_urls: false
docs_dir: "docs"
+# Fail on warnings to detect issues with types and docstrings
+strict: true
+
nav:
- Overview: index.md
+ - Usage:
+ - Base usage : usage.md
+ - CLI: cli.md
- Available Datasets: datasets.md
- #- Tutorials:
- # #- Really hard example: tutorials/usage.ipynb
+ - QM methods: normalization_e0s.md
+ - Data structure: data_storage.md
+ - Tutorials:
+ - Really hard example: tutorials/usage.ipynb
- API:
- - Datasets: API/available_datasets.md
- - Isolated Atoms Energies: API/isolated_atom_energies.md
+ - QM methods: API/methods.md
+ - Normalization regressor: API/regressor.md
+ - Main class: API/basedataset.md
+ - Format loading: API/formats.md
+ - Datasets:
+ - Potential Energy:
+ - Alchemy : API/datasets/alchemy.md
+ - ANI : API/datasets/ani.md
+ - Spice : API/datasets/spice.md
+ - GEOM : API/datasets/geom.md
+ - Qmugs : API/datasets/qmugs.md
+ - ISO_17 : API/datasets/iso_17.md
+ - Comp6 : API/datasets/comp6.md
+ - GDML : API/datasets/gdml.md
+ - Molecule3D : API/datasets/molecule3d.md
+ - Orbnet Denali : API/datasets/orbnet_denali.md
+ - SN2 RXN : API/datasets/sn2_rxn.md
+ - QM7X : API/datasets/qm7x.md
+ - QM1B : API/datasets/qm1b.md
+ - NablaDFT : API/datasets/nabladft.md
+ - Solvated Peptides : API/datasets/solvated_peptides.md
+ - Waterclusters3_30 : API/datasets/waterclusters3_30.md
+ - SCAN Waterclusters : API/datasets/waterclusters.md
+ - TMQM : API/datasets/tmqm.md
+ - PCQM : API/datasets/pcqm.md
+ - RevMD17 : API/datasets/revmd17.md
+ - MD22 : API/datasets/md22.md
+ - Transition1X : API/datasets/transition1x.md
+ - MultixcQM9 : API/datasets/multixcqm9.md
+ - QMX : API/datasets/qmx.md
+ - Protein Fragments : API/datasets/proteinfragments.md
+ - VQM24 : API/datasets/vqm24.md
+ - Interaction Energy:
+ - DES : API/datasets/des.md
+ - L7 : API/datasets/l7.md
+ - X40 : API/datasets/x40.md
+ - Metcalf : API/datasets/metcalf.md
+ - Splinter : API/datasets/splinter.md
+ - Units: API/units.md
+ - Utils: API/utils.md
+ - Contribute:
+    - Maintaining: contribute.md
+ - Add a dataset: dataset_upload.md
+ - License: licensing.md
+
theme:
name: material
- custom_dir: docs/_overrides
- palette:
- primary: teal
- accent: purple
+ #custom_dir: docs/_overrides
features:
- navigation.tabs
- - navigation.expand
+ #- navigation.expand
+ #favicon: assets/qdc_logo.png
+ logo: assets/qdc_logo.png
extra_css:
- css/custom.css
+ - css/custom-openqdc.css
extra_javascript:
- javascripts/config.js
- https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
+ #- https://unpkg.com/mermaid@10.9.0/dist/mermaid.min.js
markdown_extensions:
- admonition
@@ -53,11 +105,14 @@ markdown_extensions:
- toc:
permalink: true
+watch:
+ - openqdc/
+
plugins:
- search
- mkdocstrings:
- watch:
- - openqdc/
+ #watch:
+ # - openqdc/
handlers:
python:
setup_commands:
@@ -69,7 +124,11 @@ plugins:
rendering:
show_root_heading: yes
heading_level: 3
- show_if_no_docstring: true
+ show_if_no_docstring: false
- mkdocs-jupyter:
execute: False
# kernel_name: python3
+
+extra:
+ version:
+ provider: mike
diff --git a/openqdc/__init__.py b/openqdc/__init__.py
index 7e2eb2cf..c6be72d4 100644
--- a/openqdc/__init__.py
+++ b/openqdc/__init__.py
@@ -15,6 +15,7 @@ def get_project_root():
"__version__": "openqdc._version",
"BaseDataset": "openqdc.datasets.base",
# POTENTIAL
+ "Alchemy": "openqdc.datasets.potential.alchemy",
"ANI1": "openqdc.datasets.potential.ani",
"ANI1CCX": "openqdc.datasets.potential.ani",
"ANI1CCX_V2": "openqdc.datasets.potential.ani",
@@ -39,6 +40,7 @@ def get_project_root():
"NablaDFT": "openqdc.datasets.potential.nabladft",
"SolvatedPeptides": "openqdc.datasets.potential.solvated_peptides",
"WaterClusters": "openqdc.datasets.potential.waterclusters3_30",
+ "SCANWaterClusters": "openqdc.datasets.potential.waterclusters",
"TMQM": "openqdc.datasets.potential.tmqm",
"PCQM_B3LYP": "openqdc.datasets.potential.pcqm",
"PCQM_PM6": "openqdc.datasets.potential.pcqm",
@@ -47,6 +49,13 @@ def get_project_root():
"Transition1X": "openqdc.datasets.potential.transition1x",
"MultixcQM9": "openqdc.datasets.potential.multixcqm9",
"MultixcQM9_V2": "openqdc.datasets.potential.multixcqm9",
+ "QM7": "openqdc.datasets.potential.qmx",
+ "QM7b": "openqdc.datasets.potential.qmx",
+ "QM8": "openqdc.datasets.potential.qmx",
+ "QM9": "openqdc.datasets.potential.qmx",
+ "ProteinFragments": "openqdc.datasets.potential.proteinfragments",
+ "MDDataset": "openqdc.datasets.potential.proteinfragments",
+ "VQM24": "openqdc.datasets.potential.vqm24",
# INTERACTION
"DES5M": "openqdc.datasets.interaction.des",
"DES370K": "openqdc.datasets.interaction.des",
@@ -58,6 +67,7 @@ def get_project_root():
"Splinter": "openqdc.datasets.interaction.splinter",
# DEBUG
"Dummy": "openqdc.datasets.potential.dummy",
+ "PredefinedDataset": "openqdc.datasets.potential.dummy",
# ALL
"AVAILABLE_DATASETS": "openqdc.datasets",
"AVAILABLE_POTENTIAL_DATASETS": "openqdc.datasets.potential",
@@ -105,9 +115,10 @@ def __dir__():
from .datasets.interaction.x40 import X40
# POTENTIAL
+ from .datasets.potential.alchemy import Alchemy
from .datasets.potential.ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
from .datasets.potential.comp6 import COMP6
- from .datasets.potential.dummy import Dummy
+ from .datasets.potential.dummy import Dummy, PredefinedDataset
from .datasets.potential.gdml import GDML
from .datasets.potential.geom import GEOM
from .datasets.potential.iso_17 import ISO17
@@ -117,13 +128,17 @@ def __dir__():
from .datasets.potential.nabladft import NablaDFT
from .datasets.potential.orbnet_denali import OrbnetDenali
from .datasets.potential.pcqm import PCQM_B3LYP, PCQM_PM6
+ from .datasets.potential.proteinfragments import MDDataset, ProteinFragments
from .datasets.potential.qm1b import QM1B, QM1B_SMALL
from .datasets.potential.qm7x import QM7X, QM7X_V2
from .datasets.potential.qmugs import QMugs, QMugs_V2
+ from .datasets.potential.qmx import QM7, QM8, QM9, QM7b
from .datasets.potential.revmd17 import RevMD17
from .datasets.potential.sn2_rxn import SN2RXN
from .datasets.potential.solvated_peptides import SolvatedPeptides
from .datasets.potential.spice import Spice, SpiceV2, SpiceVL2
from .datasets.potential.tmqm import TMQM
from .datasets.potential.transition1x import Transition1X
+ from .datasets.potential.vqm24 import VQM24
+ from .datasets.potential.waterclusters import SCANWaterClusters
from .datasets.potential.waterclusters3_30 import WaterClusters
diff --git a/openqdc/cli.py b/openqdc/cli.py
index 1d985090..7b32c9ae 100644
--- a/openqdc/cli.py
+++ b/openqdc/cli.py
@@ -1,3 +1,4 @@
+import os
from typing import List, Optional
import typer
@@ -12,27 +13,40 @@
AVAILABLE_INTERACTION_DATASETS,
AVAILABLE_POTENTIAL_DATASETS,
)
+from openqdc.utils.io import get_local_cache
app = typer.Typer(help="OpenQDC CLI")
def sanitize(dictionary):
+ """
+ Sanitize dataset names to be used in the CLI.
+ """
return {k.lower().replace("_", "").replace("-", ""): v for k, v in dictionary.items()}
SANITIZED_AVAILABLE_DATASETS = sanitize(AVAILABLE_DATASETS)
-def exist_dataset(dataset):
+def exist_dataset(dataset) -> bool:
+ """
+ Check if dataset is available in the openQDC datasets.
+ """
if dataset not in sanitize(AVAILABLE_DATASETS):
logger.error(f"{dataset} is not available. Please open an issue on Github for the team to look into it.")
return False
return True
-def format_entry(empty_dataset):
+def format_entry(empty_dataset, max_num_to_display: int = 6):
+ """
+ Format the entry for the table.
+    max_num_to_display: int = 6,
+        Maximum number of energy methods to display. Used to keep the table format
+        readable in case of datasets with many energy methods (e.g. MultixcQM9).
+ """
energy_methods = [str(x) for x in empty_dataset.__energy_methods__]
- max_num_to_display = 6
+
-    if len(energy_methods) > 6:
+    if len(energy_methods) > max_num_to_display:
entry = ",".join(energy_methods[:max_num_to_display]) + "..."
else:
@@ -46,7 +60,7 @@ def download(
overwrite: Annotated[
bool,
typer.Option(
- help="Whether to overwrite or force the re-download of the datasets.",
+ help="Whether to force the re-download of the datasets and overwrite the current cached dataset.",
),
] = False,
cache_dir: Annotated[
@@ -55,6 +69,19 @@ def download(
help="Path to the cache. If not provided, the default cache directory (.cache/openqdc/) will be used.",
),
] = None,
+ as_zarr: Annotated[
+ bool,
+ typer.Option(
+ help="Whether to use a zarr format for the datasets instead of memmap.",
+ ),
+ ] = False,
+ gs: Annotated[
+ bool,
+ typer.Option(
+            help="Which source to use for downloading. If True, Google Storage will be used. "
+            + "Otherwise, AWS S3 will be used.",
+ ),
+ ] = False,
):
"""
Download preprocessed ml-ready datasets from the main openQDC hub.
@@ -62,18 +89,25 @@ def download(
Example:
openqdc download Spice QMugs
"""
+ if gs:
+ os.environ["OPENQDC_DOWNLOAD_API"] = "gs"
+
for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
if exist_dataset(dataset):
- if SANITIZED_AVAILABLE_DATASETS[dataset].no_init().is_cached() and not overwrite:
+ ds = SANITIZED_AVAILABLE_DATASETS[dataset].no_init()
+ ds.read_as_zarr = as_zarr
+ if ds.is_cached() and not overwrite:
logger.info(f"{dataset} is already cached. Skipping download")
else:
- SANITIZED_AVAILABLE_DATASETS[dataset](overwrite_local_cache=True, cache_dir=cache_dir)
+ SANITIZED_AVAILABLE_DATASETS[dataset](
+ overwrite_local_cache=True, cache_dir=cache_dir, read_as_zarr=as_zarr, skip_statistics=True
+ )
@app.command()
def datasets():
"""
- Print a table of the available openQDC datasets and some informations.
+    Print a formatted table of the available openQDC datasets and some information.
"""
table = PrettyTable(["Name", "Type of Energy", "Forces", "Level of theory"])
for dataset in AVAILABLE_DATASETS:
@@ -98,7 +132,7 @@ def fetch(
overwrite: Annotated[
bool,
typer.Option(
- help="Whether to overwrite or force the re-download of the files.",
+ help="Whether to overwrite or force the re-download of the raw files.",
),
] = False,
cache_dir: Annotated[
@@ -109,17 +143,14 @@ def fetch(
] = None,
):
"""
- Download the raw datasets files from the main openQDC hub.
- overwrite: bool = False,
- If True, the files will be re-downloaded and overwritten.
- cache_dir: Optional[str] = None,
- Path to the cache. If not provided, the default cache directory will be used.
- Special case: if the dataset is "all", "potential", "interaction".
- all: all available datasets will be downloaded.
- potential: all the potential datasets will be downloaded
- interaction: all the interaction datasets will be downloaded
- Example:
- openqdc fetch Spice
+ Download the raw datasets files from the main openQDC hub.\n
+ Special cases: if the dataset is "all", "potential" or "interaction".\n
+ all: all available datasets will be downloaded.\n
+ potential: all the potential datasets will be downloaded.\n
+ interaction: all the interaction datasets will be downloaded.\n\n
+
+ Example:\n
+ openqdc fetch Spice
"""
if datasets[0].lower() == "all":
dataset_names = list(sanitize(AVAILABLE_DATASETS).keys())
@@ -143,18 +174,27 @@ def preprocess(
overwrite: Annotated[
bool,
typer.Option(
- help="Whether to overwrite or force the re-download of the datasets.",
+ help="Whether to overwrite the current cached datasets.",
),
] = True,
upload: Annotated[
bool,
typer.Option(
- help="Whether to try the upload to the remote storage.",
+ help="Whether to attempt the upload to the remote storage. Must have write permissions.",
+ ),
+ ] = False,
+ as_zarr: Annotated[
+ bool,
+ typer.Option(
+ help="Whether to preprocess as a zarr format or a memmap format.",
),
] = False,
):
"""
Preprocess a raw dataset (previously fetched) into an openqdc dataset and optionally push it to remote.
+
+ Example:
+ openqdc preprocess Spice QMugs
"""
for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
if exist_dataset(dataset):
@@ -166,5 +206,137 @@ def preprocess(
raise e
+@app.command()
+def upload(
+ datasets: List[str],
+ overwrite: Annotated[
+ bool,
+ typer.Option(
+ help="Whether to overwrite the remote files if they are present.",
+ ),
+ ] = True,
+ as_zarr: Annotated[
+ bool,
+ typer.Option(
+ help="Whether to upload the zarr files if available.",
+ ),
+ ] = False,
+):
+ """
+ Upload a preprocessed dataset to the remote storage.
+
+ Example:
+ openqdc upload Spice --overwrite
+ """
+ for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
+ if exist_dataset(dataset):
+ logger.info(f"Uploading {SANITIZED_AVAILABLE_DATASETS[dataset].__name__}")
+ try:
+ SANITIZED_AVAILABLE_DATASETS[dataset](skip_statistics=True).upload(overwrite=overwrite, as_zarr=as_zarr)
+ except Exception as e:
+ logger.error(f"Error while uploading {dataset}. {e}. Did you preprocess the dataset first?")
+ raise e
+
+
+@app.command()
+def convert(
+ datasets: List[str],
+ overwrite: Annotated[
+ bool,
+ typer.Option(
+ help="Whether to overwrite the current zarr cached datasets.",
+ ),
+ ] = False,
+ download: Annotated[
+ bool,
+ typer.Option(
+ help="Whether to force the re-download of the memmap datasets.",
+ ),
+ ] = False,
+):
+ """
+ Convert a preprocessed dataset from a memmap dataset to a zarr dataset.
+ """
+ import os
+ from os.path import join as p_join
+
+ import numpy as np
+ import zarr
+
+ from openqdc.utils.io import load_pkl
+
+ def silent_remove(filename):
+ """
+ Zarr zip files are currently not overwritable. This function is used to remove the file if it exists.
+ """
+ try:
+ os.remove(filename)
+ except OSError:
+ pass
+
+ for dataset in list(map(lambda x: x.lower().replace("_", ""), datasets)):
+ if exist_dataset(dataset):
+ logger.info(f"Converting {SANITIZED_AVAILABLE_DATASETS[dataset].__name__}")
+ try:
+ ds = SANITIZED_AVAILABLE_DATASETS[dataset](overwrite_local_cache=download, skip_statistics=True)
+ # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True)
+
+ pkl = load_pkl(p_join(ds.preprocess_path, "props.pkl"))
+ metadata = p_join(ds.preprocess_path, "metadata.zip")
+ if overwrite:
+ silent_remove(metadata)
+ group = zarr.group(zarr.storage.ZipStore(metadata))
+ for key, value in pkl.items():
+ # sub=group.create_group(key)
+ if key in ["name", "subset"]:
+ data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)
+ data[:] = value[0][:]
+ data2 = group.create_dataset(key + "_ptr", shape=value[1].shape, dtype=np.int32)
+ data2[:] = value[1][:]
+ else:
+ data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)
+ data[:] = value[:]
+
+ force_attrs = {
+ "unit": str(ds.force_unit),
+ "level_of_theory": ds.force_methods,
+ }
+
+ energy_attrs = {"unit": str(ds.energy_unit), "level_of_theory": ds.energy_methods}
+
+ atomic_inputs_attrs = {
+ "unit": str(ds.distance_unit),
+ }
+ attrs = {"forces": force_attrs, "energies": energy_attrs, "atomic_inputs": atomic_inputs_attrs}
+
+ # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True)
+ for key, value in ds.data.items():
+ if key not in ds.data_keys:
+ continue
+ print(key, value.shape)
+
+ zarr_path = p_join(ds.preprocess_path, key + ".zip") # ds.__name__,
+ if overwrite:
+ silent_remove(zarr_path)
+ z = zarr.open(
+ zarr.storage.ZipStore(zarr_path), "w", zarr_version=2, shape=value.shape, dtype=value.dtype
+ )
+ z[:] = value[:]
+ if key in attrs:
+ z.attrs.update(attrs[key])
+
+ except Exception as e:
+ logger.error(f"Error while converting {dataset}. {e}. Did you preprocess the dataset first?")
+ raise e
+
+
+@app.command()
+def cache():
+ """
+ Get the current local cache path of openQDC
+ """
+ print(f"openQDC local cache:\n {get_local_cache()}")
+
+
if __name__ == "__main__":
app()
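The new CLI surface (`download --as-zarr/--gs`, `upload`, `convert`, `cache`) can be smoke-tested without a shell through typer's test runner. A minimal sketch, assuming this patched `openqdc.cli` module is importable; the commented download line would trigger a real network fetch:

```python
# Hedged sketch: exercising the new CLI commands via typer's test runner.
from typer.testing import CliRunner

from openqdc.cli import app  # the Typer app defined above

runner = CliRunner()

# Side-effect-free commands added or touched in this patch.
print(runner.invoke(app, ["cache"]).stdout)     # local cache path
print(runner.invoke(app, ["datasets"]).stdout)  # formatted dataset table

# Equivalent to `openqdc download Spice --as-zarr --gs` from a shell;
# left commented out since it performs an actual download from Google Storage.
# runner.invoke(app, ["download", "Spice", "--as-zarr", "--gs"])
```

Likewise, the zip stores written by `convert` can be read back directly with zarr; the path below is an assumption based on the default cache layout:

```python
import os

import zarr

# Hypothetical location: <cache>/<dataset>/preprocessed/<key>.zip
path = os.path.expanduser("~/.cache/openqdc/spice/preprocessed/energies.zip")
z = zarr.open(zarr.storage.ZipStore(path, mode="r"), mode="r")
print(z.shape, z.dtype, dict(z.attrs))  # attrs carry unit and level_of_theory
```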
diff --git a/openqdc/datasets/base.py b/openqdc/datasets/base.py
index 026bfd75..8a480125 100644
--- a/openqdc/datasets/base.py
+++ b/openqdc/datasets/base.py
@@ -1,13 +1,18 @@
"""The BaseDataset defining shared functionality between all datasets."""
import os
-import pickle as pkl
+
+try:
+ from collections.abc import Iterable
+except ImportError:
+ from collections import Iterable
from functools import partial
from itertools import compress
from os.path import join as p_join
from typing import Callable, Dict, List, Optional, Union
import numpy as np
+from ase import Atoms
from ase.io.extxyz import write_extxyz
from loguru import logger
from sklearn.utils import Bunch
@@ -22,6 +27,7 @@
StatisticManager,
TotalEnergyStats,
)
+from openqdc.datasets.structure import MemMapDataset, ZarrDataset
from openqdc.utils.constants import MAX_CHARGE, NB_ATOMIC_FEATURES
from openqdc.utils.descriptors import get_descriptor
from openqdc.utils.exceptions import (
@@ -32,7 +38,6 @@
copy_exists,
dict_to_atoms,
get_local_cache,
- pull_locally,
push_remote,
set_cache_dir,
)
@@ -76,6 +81,7 @@ class BaseDataset(DatasetPropertyMixIn):
energy_target_names = []
force_target_names = []
+ read_as_zarr = False
__energy_methods__ = []
__force_mask__ = []
__isolated_atom_energies__ = []
@@ -99,7 +105,9 @@ def __init__(
cache_dir: Optional[str] = None,
recompute_statistics: bool = False,
transform: Optional[Callable] = None,
- regressor_kwargs={
+ skip_statistics: bool = False,
+ read_as_zarr: bool = False,
+ regressor_kwargs: Dict = {
"solver_type": "linear",
"sub_sample": None,
"stride": 1,
@@ -107,29 +115,28 @@ def __init__(
) -> None:
"""
- Parameters
- ----------
- energy_unit
- Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"]
- distance_unit
- Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"]
- array_format
- Format to return arrays in. Supported formats: ["numpy", "torch", "jax"]
- energy_type
- Type of isolated atom energy to use for the dataset. Default: "formation"
- Supported types: ["formation", "regression", "null", None]
- overwrite_local_cache
- Whether to overwrite the locally cached dataset.
- cache_dir
- Cache directory location. Defaults to "~/.cache/openqdc"
- recompute_statistics
- Whether to recompute the statistics of the dataset.
- transform, optional
- transformation to apply to the __getitem__ calls
- regressor_kwargs
- Dictionary of keyword arguments to pass to the regressor.
- Default: {"solver_type": "linear", "sub_sample": None, "stride": 1}
- solver_type can be one of ["linear", "ridge"]
+ Parameters:
+ energy_unit:
+ Energy unit to convert dataset to. Supported units: ["kcal/mol", "kj/mol", "hartree", "ev"]
+ distance_unit:
+ Distance unit to convert dataset to. Supported units: ["ang", "nm", "bohr"]
+ array_format:
+ Format to return arrays in. Supported formats: ["numpy", "torch", "jax"]
+ energy_type:
+ Type of isolated atom energy to use for the dataset. Default: "formation"
+ Supported types: ["formation", "regression", "null", None]
+ overwrite_local_cache:
+ Whether to overwrite the locally cached dataset.
+ cache_dir:
+ Cache directory location. Defaults to "~/.cache/openqdc"
+ recompute_statistics:
+ Whether to recompute the statistics of the dataset.
+ transform:
+ transformation to apply to the __getitem__ calls
+ regressor_kwargs:
+ Dictionary of keyword arguments to pass to the regressor.
+ Default: {"solver_type": "linear", "sub_sample": None, "stride": 1}
+ solver_type can be one of ["linear", "ridge"]
"""
set_cache_dir(cache_dir)
# self._init_lambda_fn()
@@ -138,8 +145,10 @@ def __init__(
self.recompute_statistics = recompute_statistics
self.regressor_kwargs = regressor_kwargs
self.transform = transform
+ self.read_as_zarr = read_as_zarr
self.energy_type = energy_type if energy_type is not None else "null"
self.refit_e0s = recompute_statistics or overwrite_local_cache
+ self.skip_statistics = skip_statistics
if not self.is_preprocessed():
raise DatasetNotAvailableError(self.__name__)
else:
@@ -152,6 +161,12 @@ def _init_lambda_fn(self):
self._fn_distance = lambda x: x
self._fn_forces = lambda x: x
+ @property
+ def dataset_wrapper(self):
+ if not hasattr(self, "_dataset_wrapper"):
+ self._dataset_wrapper = ZarrDataset() if self.read_as_zarr else MemMapDataset()
+ return self._dataset_wrapper
+
@property
def config(self):
assert len(self.__links__) > 0, "No links provided for fetching"
@@ -171,7 +186,8 @@ def _post_init(
) -> None:
self._set_units(None, None)
self._set_isolated_atom_energies()
- self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)
+ if not self.skip_statistics:
+ self._precompute_statistics(overwrite_local_cache=overwrite_local_cache)
self._set_units(energy_unit, distance_unit)
self._convert_data()
self._set_isolated_atom_energies()
@@ -331,6 +347,10 @@ def convert_forces(self, x):
def set_energy_unit(self, value: str):
"""
Set a new energy unit for the dataset.
+
+ Parameters:
+ value:
+ New energy unit to set.
"""
# old_unit = self.energy_unit
# self.__energy_unit__ = value
@@ -340,6 +360,10 @@ def set_energy_unit(self, value: str):
def set_distance_unit(self, value: str):
"""
Set a new distance unit for the dataset.
+
+ Parameters:
+ value:
+ New distance unit to set.
"""
# old_unit = self.distance_unit
# self.__distance_unit__ = value
@@ -351,9 +375,22 @@ def set_array_format(self, format: str):
self.array_format = format
def read_raw_entries(self):
+ """
+ Preprocess the raw data (i.e. from the fetched source) into a list of dictionaries.
+ """
raise NotImplementedError
- def collate_list(self, list_entries):
+ def collate_list(self, list_entries: List[Dict]) -> Dict:
+ """
+ Collate a list of entries into a single dictionary.
+
+ Parameters:
+ list_entries:
+ List of dictionaries containing the entries to collate.
+
+ Returns:
+ Dictionary containing the collated entries.
+ """
# concatenate entries
res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
@@ -364,54 +401,29 @@ def collate_list(self, list_entries):
return res
- def save_preprocess(self, data_dict, upload=False, overwrite=True):
+ def save_preprocess(
+ self, data_dict: Dict[str, np.ndarray], upload: bool = False, overwrite: bool = True, as_zarr: bool = False
+ ):
"""
Save the preprocessed data to the cache directory and optionally upload it to the remote storage.
- data_dict : dict
- Dictionary containing the preprocessed data.
- upload : bool, Defult: False
- Whether to upload the preprocessed data to the remote storage or only saving it locally.
- overwrite : bool, Default: False
- Whether to overwrite the preprocessed data if it already exists.
- Only used if upload is True. Cache is always overwritten locally.
+
+ Parameters:
+ data_dict:
+ Dictionary containing the preprocessed data.
+ upload:
+ Whether to upload the preprocessed data to the remote storage or only save it locally.
+ overwrite:
+ Whether to overwrite the preprocessed data if it already exists.
+ Only used if upload is True. Cache is always overwritten locally.
+ as_zarr:
+ Whether to save the data as zarr files.
"""
# save memmaps
logger.info("Preprocessing data and saving it to cache.")
- for key in self.data_keys:
- local_path = p_join(self.preprocess_path, f"{key}.mmap")
- out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
- out[:] = data_dict.pop(key)[:]
- out.flush()
- if upload:
- push_remote(local_path, overwrite=overwrite)
-
- # save smiles and subset
- local_path = p_join(self.preprocess_path, "props.pkl")
-
- # assert that (required) pkl keys are present in data_dict
- assert all([key in data_dict.keys() for key in self.pkl_data_keys])
-
- # store unique and inverse indices for str-based pkl keys
- for key in self.pkl_data_keys:
- if self.pkl_data_types[key] == str:
- data_dict[key] = np.unique(data_dict[key], return_inverse=True)
-
- with open(local_path, "wb") as f:
- pkl.dump(data_dict, f)
+ paths = self.dataset_wrapper.save_preprocess(
+ self.preprocess_path, self.data_keys, data_dict, self.pkl_data_keys, self.pkl_data_types
+ )
if upload:
- push_remote(local_path, overwrite=overwrite)
-
- def _convert_on_loading(self, x, key):
- if key == "energies":
- return self.convert_energy(x)
- elif key == "forces":
- return self.convert_forces(x)
- elif key == "atomic_inputs":
- x = np.array(x, dtype=np.float32)
- x[:, -3:] = self.convert_distance(x[:, -3:])
- return x
- else:
- return x
+ for local_path in paths:
+ push_remote(local_path, overwrite=overwrite) # make it async?
def read_preprocess(self, overwrite_local_cache=False):
logger.info("Reading preprocessed data.")
@@ -421,62 +433,106 @@ def read_preprocess(self, overwrite_local_cache=False):
Distance: {self.distance_unit},\n\
Forces: {self.force_unit if self.force_methods else 'None'}"
)
- self.data = {}
- for key in self.data_keys:
- filename = p_join(self.preprocess_path, f"{key}.mmap")
- pull_locally(filename, overwrite=overwrite_local_cache)
- self.data[key] = np.memmap(filename, mode="r", dtype=self.data_types[key]).reshape(*self.data_shapes[key])
-
- filename = p_join(self.preprocess_path, "props.pkl")
- pull_locally(filename, overwrite=overwrite_local_cache)
- with open(filename, "rb") as f:
- tmp = pkl.load(f)
- all_pkl_keys = set(tmp.keys()) - set(self.data_keys)
- # assert required pkl_keys are present in all_pkl_keys
- assert all([key in all_pkl_keys for key in self.pkl_data_keys])
- for key in all_pkl_keys:
- x = tmp.pop(key)
- if len(x) == 2:
- self.data[key] = x[0][x[1]]
- else:
- self.data[key] = x
+ self.data = self.dataset_wrapper.load_data(
+ self.preprocess_path,
+ self.data_keys,
+ self.data_types,
+ self.data_shapes,
+ self.pkl_data_keys,
+ overwrite_local_cache,
+ ) # this should be async if possible
for key in self.data:
logger.info(f"Loaded {key} with shape {self.data[key].shape}, dtype {self.data[key].dtype}")
- def is_preprocessed(self):
+ def _convert_on_loading(self, x, key):
+ if key == "energies":
+ return self.convert_energy(x)
+ elif key == "forces":
+ return self.convert_forces(x)
+ elif key == "atomic_inputs":
+ x = np.array(x, dtype=np.float32)
+ x[:, -3:] = self.convert_distance(x[:, -3:])
+ return x
+ else:
+ return x
+
+ def is_preprocessed(self) -> bool:
"""
Check if the dataset is preprocessed and available online or locally.
+
+ Returns:
+ True if the dataset is available remotely or locally, False otherwise.
"""
- predicats = [copy_exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
- predicats += [copy_exists(p_join(self.preprocess_path, "props.pkl"))]
+ predicats = [
+ copy_exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f"{key}")))
+ for key in self.data_keys
+ ]
+ predicats += [copy_exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]
return all(predicats)
- def is_cached(self):
+ def is_cached(self) -> bool:
"""
Check if the dataset is cached locally.
+
+ Returns:
+ True if the dataset is cached locally, False otherwise.
"""
- predicats = [os.path.exists(p_join(self.preprocess_path, f"{key}.mmap")) for key in self.data_keys]
- predicats += [os.path.exists(p_join(self.preprocess_path, "props.pkl"))]
+ predicats = [
+ os.path.exists(p_join(self.preprocess_path, self.dataset_wrapper.add_extension(f"{key}")))
+ for key in self.data_keys
+ ]
+ predicats += [os.path.exists(p_join(self.preprocess_path, file)) for file in self.dataset_wrapper._extra_files]
return all(predicats)
- def preprocess(self, upload: bool = False, overwrite: bool = True):
+ def preprocess(self, upload: bool = False, overwrite: bool = True, as_zarr: bool = True):
"""
Preprocess the dataset and save it.
- upload : bool, Defult: False
- Whether to upload the preprocessed data to the remote storage or only saving it locally.
- overwrite : bool, Default: False
- Whether to overwrite the preprocessed data if it already exists.
- Only used if upload is True. Cache is always overwritten locally.
+
+ Parameters:
+ upload:
+ Whether to upload the preprocessed data to the remote storage or only saving it locally.
+ overwrite:
+ Whether to overwrite the preprocessed data if it already exists.
+ Only used if upload is True. Cache is always overwritten locally.
+ as_zarr:
+ Whether to save the data as zarr files
"""
if overwrite or not self.is_preprocessed():
entries = self.read_raw_entries()
res = self.collate_list(entries)
- self.save_preprocess(res, upload, overwrite)
+ self.save_preprocess(res, upload, overwrite, as_zarr)
+
+ def upload(self, overwrite: bool = False, as_zarr: bool = False):
+ """
+ Upload the preprocessed data to the remote storage. Must be called after preprocess and
+ requires write privileges.
+
+ Parameters:
+ overwrite:
+ Whether to overwrite the remote data if it already exists
+ as_zarr:
+ Whether to upload the data as zarr files
+ """
+ for key in self.data_keys:
+ local_path = p_join(self.preprocess_path, f"{key}.mmap" if not as_zarr else f"{key}.zip")
+ push_remote(local_path, overwrite=overwrite)
+ local_path = p_join(self.preprocess_path, "props.pkl" if not as_zarr else "metadata.zip")
+ push_remote(local_path, overwrite=overwrite)
- def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext=True):
+ def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None, ext: bool = True):
"""
- Save the entry at index idx as an extxyz file.
+ Save a single entry at index idx as an extxyz file.
+
+ Parameters:
+ idx:
+ Index of the entry
+ energy_method:
+ Index of the energy method to use
+ path:
+ Path to save the xyz file. If None, the current working directory is used.
+ ext:
+ Whether to include additional information like forces and other metadata (extxyz format)
"""
if path is None:
path = os.getcwd()
@@ -486,6 +542,12 @@ def save_xyz(self, idx: int, energy_method: int = 0, path: Optional[str] = None,
def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):
"""
Save dataset as single xyz file (extended xyz format).
+
+ Parameters:
+ energy_method:
+ Index of the energy method to use
+ path:
+ Path to save the xyz file
"""
with open(p_join(path if path else os.getcwd(), f"{self.__name__}.xyz"), "w") as f:
for atoms in tqdm(
@@ -495,16 +557,20 @@ def to_xyz(self, energy_method: int = 0, path: Optional[str] = None):
):
write_extxyz(f, atoms, append=True)
- def get_ase_atoms(self, idx: int, energy_method: int = 0, ext=True):
+ def get_ase_atoms(self, idx: int, energy_method: int = 0, ext: bool = True) -> Atoms:
"""
Get the ASE atoms object for the entry at index idx.
- Parameters
- ----------
- idx : int
- Index of the entry.
- ext : bool, optional
- Whether to include additional informations
+ Parameters:
+ idx:
+ Index of the entry.
+ energy_method:
+ Index of the energy method to use
+ ext:
+ Whether to include additional information
+
+ Returns:
+ ASE atoms object
"""
entry = self[idx]
at = dict_to_atoms(entry, ext=ext, energy_method=energy_method)
@@ -537,24 +603,23 @@ def calculate_descriptors(
"""
Compute the descriptors for the dataset.
- Parameters
- ----------
- descriptor_name : str
- Name of the descriptor to use. Supported descriptors are ["soap"]
- chemical_species : Optional[List[str]], optional
- List of chemical species to use for the descriptor computation, by default None.
- If None, the chemical species of the dataset are used.
- n_samples : Optional[Union[List[int],int, float]], optional
- Number of samples to use for the computation, by default None. If None, all the dataset is used.
- If a list of integers is provided, the descriptors are computed for each of the specified idx of samples.
- progress : bool, optional
- Whether to show a progress bar, by default True.
- **descriptor_kwargs : dict
- Keyword arguments to pass to the descriptor instantiation of the model.
-
- Returns
- -------
- Dict[str, np.ndarray]
+ Parameters:
+ descriptor_name:
+ Name of the descriptor to use. Supported descriptors are ["soap"]
+ chemical_species:
+ List of chemical species to use for the descriptor computation, by default None.
+ If None, the chemical species of the dataset are used.
+ n_samples:
+ Number of samples to use for the computation, by default None.
+ If None, all the dataset is used.
+ If a list of integers is provided, the descriptors are computed for
+ each of the specified idx of samples.
+ progress:
+ Whether to show a progress bar, by default True.
+ **descriptor_kwargs : dict
+ Keyword arguments to pass to the descriptor instantiation of the model.
+
+ Returns:
Dictionary containing the following keys:
- values : np.ndarray of shape (N, M) containing the descriptors for the dataset
- idxs : np.ndarray of shape (N,) containing the indices of the samples used
@@ -577,14 +642,18 @@ def wrapper(idx):
datum["idxs"] = idxs
return datum
- def as_iter(self, atoms: bool = False, energy_method: int = 0):
+ def as_iter(self, atoms: bool = False, energy_method: int = 0) -> Iterable:
"""
Return the dataset as an iterator.
- Parameters
- ----------
- atoms : bool, optional
- Whether to return the items as ASE atoms object, by default False
+ Parameters:
+ atoms:
+ Whether to return the items as ASE atoms object, by default False
+ energy_method:
+ Index of the energy method to use
+
+ Returns:
+ Iterator of the dataset
"""
func = partial(self.get_ase_atoms, energy_method=energy_method) if atoms else self.__getitem__
@@ -592,12 +661,21 @@ def as_iter(self, atoms: bool = False, energy_method: int = 0):
for i in range(len(self)):
yield func(i)
- def get_statistics(self, return_none: bool = True):
+ def __iter__(self):
+ for idx in range(len(self)):
+ yield self[idx]
+
+ def get_statistics(self, return_none: bool = True) -> Dict:
"""
Get the converted statistics of the dataset.
- return_none : bool, optional
- Whether to return None if the statistics for the forces are not available, by default True
- Otherwise, the statistics for the forces are set to 0.0
+
+ Parameters:
+ return_none :
+ Whether to return None if the statistics for the forces are not available, by default True
+ Otherwise, the statistics for the forces are set to 0.0
+
+ Returns:
+ Dictionary containing the statistics of the dataset
"""
selected_stats = self.statistics.get_results()
if len(selected_stats) == 0:
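Taken together, the new `read_as_zarr` and `skip_statistics` constructor flags, the `dataset_wrapper` indirection and the added `__iter__` enable the following usage pattern. A minimal sketch, assuming the Spice dataset was previously downloaded (with `--as-zarr` for the zarr-backed variant):

```python
# Hedged sketch of the new BaseDataset options.
from openqdc.datasets import Spice

ds = Spice(
    energy_unit="ev",
    distance_unit="ang",
    read_as_zarr=True,     # back the arrays with zarr zip stores instead of memmaps
    skip_statistics=True,  # skip the statistics precomputation pass entirely
)

# The new __iter__ makes the dataset directly iterable.
for entry in ds:
    print(entry["energies"])
    break
```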
diff --git a/openqdc/datasets/interaction/des.py b/openqdc/datasets/interaction/des.py
index d90be07e..6788c41c 100644
--- a/openqdc/datasets/interaction/des.py
+++ b/openqdc/datasets/interaction/des.py
@@ -74,13 +74,19 @@ def _create_subsets(self, **kwargs):
class DES370K(BaseInteractionDataset, IDES):
"""
- DE Shaw Research interaction energy of over 370K
- small molecule dimers as described in the paper:
-
- Quantum chemical benchmark databases of gold-standard dimer interaction energies.
- Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
- Sci Data 8, 55 (2021).
- https://doi.org/10.1038/s41597-021-00833-x
+ DE Shaw 370K (DES370K) is a dataset of 3,691 distinct dimers with 370K unique geometries and interaction energies
+ computed at the CCSD(T)/CBS level of theory. It consists of 392 closed-shell chemical species (both neutral
+ molecules and ions) including water and functional groups found in proteins. Dimer geometries are generated using
+ QM-based optimization at the DF-LMP2/aVDZ level of theory and sampled from condensed-phase MD simulations.
+
+ Usage:
+ ```python
+ from openqdc.datasets import DES370K
+ dataset = DES370K()
+ ```
+
+ Reference:
+ https://www.nature.com/articles/s41597-021-00833-x
"""
__name__ = "des370k_interaction"
@@ -173,13 +179,18 @@ def read_raw_entries(self) -> List[Dict]:
class DES5M(DES370K):
"""
- DE Shaw Research interaction energy calculations for
- over 5M small molecule dimers as described in the paper:
-
- Quantum chemical benchmark databases of gold-standard dimer interaction energies.
- Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
- Sci Data 8, 55 (2021).
- https://doi.org/10.1038/s41597-021-00833-x
+ DE Shaw 5M (DES5M) is a dataset of 3,691 distinct dimers with 5,000,000 unique geometries with interaction energies
+ computed using SNS-MP2, a machine learning approach. The unique geometries are generated similarly to DES370K,
+ using QM-based optimization and MD simulations.
+
+ Usage:
+ ```python
+ from openqdc.datasets import DES5M
+ dataset = DES5M()
+ ```
+
+ Reference:
+ https://www.nature.com/articles/s41597-021-00833-x
"""
__name__ = "des5m_interaction"
@@ -242,18 +253,19 @@ class DES5M(DES370K):
class DESS66(DES370K):
"""
- DE Shaw Research interaction energy
- estimates of all 66 conformers from
- the original S66 dataset as described
- in the paper:
-
- Quantum chemical benchmark databases of gold-standard dimer interaction energies.
- Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
- Sci Data 8, 55 (2021).
- https://doi.org/10.1038/s41597-021-00833-x
-
- Data was downloaded from Zenodo:
- https://zenodo.org/records/5676284
+ DESS66 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS
+ dimer interaction energies, with one equilibrium geometry per complex giving 66 conformers in total.
+ The protocol for estimating energies is based on the DES370K paper.
+
+ Usage:
+ ```python
+ from openqdc.datasets import DESS66
+ dataset = DESS66()
+ ```
+
+ Reference:
+ https://www.nature.com/articles/s41597-021-00833-x\n
+ S66: https://pubs.acs.org/doi/10.1021/ct2002946
"""
__name__ = "des_s66"
@@ -266,19 +278,18 @@ def _create_subsets(self, **kwargs):
class DESS66x8(DESS66):
"""
- DE Shaw Research interaction energy
- estimates of all 528 conformers from
- the original S66x8 dataset as described
- in the paper:
-
- Quantum chemical benchmark databases of gold-standard dimer interaction energies.
- Donchev, A.G., Taube, A.G., Decolvenaere, E. et al.
- Sci Data 8, 55 (2021).
- https://doi.org/10.1038/s41597-021-00833-x
-
- Data was downloaded from Zenodo:
-
- https://zenodo.org/records/5676284
+ DESS66x8 is a dataset consisting of 66 molecular complexes from the S66 dataset with CCSD(T)/CBS
+ dimer interaction energies, with one equilibrium geometry and 8 geometries along the dissociation curve
+ per complex, giving 592 conformers in total. The protocol for estimating energies is based on the DES370K paper.
+
+ Usage:
+ ```python
+ from openqdc.datasets import DESS66x8
+ dataset = DESS66x8()
+ ```
+
+ Reference:
+ https://www.nature.com/articles/s41597-021-00833-x
"""
__name__ = "des_s66x8"
diff --git a/openqdc/datasets/interaction/l7.py b/openqdc/datasets/interaction/l7.py
index 75a63cd5..7307638c 100644
--- a/openqdc/datasets/interaction/l7.py
+++ b/openqdc/datasets/interaction/l7.py
@@ -7,15 +7,18 @@
class L7(YamlDataset):
"""
- The L7 interaction energy dataset as described in:
-
- Accuracy of Quantum Chemical Methods for Large Noncovalent Complexes
- Robert Sedlak, Tomasz Janowski, Michal Pitoňák, Jan Řezáč, Peter Pulay, and Pavel Hobza
- Journal of Chemical Theory and Computation 2013 9 (8), 3364-3374
- DOI: 10.1021/ct400036b
-
- Data was downloaded and extracted from:
- http://cuby4.molecular.cz/dataset_l7.html
+ The L7 interaction energy dataset consists of 7 dispersion-stabilized non-covalent complexes with
+ energies labelled using semi-empirical and quantum mechanical methods. The initial geometries are
+ taken from crystal X-ray data and optimized with a DFT method specific to the complex.
+
+ Usage:
+ ```python
+ from openqdc.datasets import L7
+ dataset = L7()
+ ```
+
+ Reference:
+ https://pubs.acs.org/doi/10.1021/ct400036b
"""
__name__ = "l7"
diff --git a/openqdc/datasets/interaction/metcalf.py b/openqdc/datasets/interaction/metcalf.py
index faf5324f..889370e8 100644
--- a/openqdc/datasets/interaction/metcalf.py
+++ b/openqdc/datasets/interaction/metcalf.py
@@ -84,20 +84,19 @@ def read_xyz(fname, subset):
class Metcalf(BaseInteractionDataset):
"""
- Hydrogen-bonded dimers of NMA with 126 molecules as described in:
-
- Approaches for machine learning intermolecular interaction energies and
- application to energy components from symmetry adapted perturbation theory.
- Derek P. Metcalf, Alexios Koutsoukas, Steven A. Spronk, Brian L. Claus,
- Deborah A. Loughney, Stephen R. Johnson, Daniel L. Cheney, C. David Sherrill;
- J. Chem. Phys. 21 February 2020; 152 (7): 074103.
- https://doi.org/10.1063/1.5142636
-
- Further details:
- "Hydrogen-bonded dimers involving N-methylacetamide (NMA) and 126 molecules
- (46 donors and 80 acceptors; Figs. 2 and 3) were used. Optimized geometries
- for the 126 individual monomers were obtained and paired with NMA in broad
- arrays of spatial configurations to generate thousands of complexes for training.
+ Metcalf is a dataset consisting of 126 hydrogen-bonded dimers involving N-methylacetamide (NMA) with 14,744 to
+ 156,704 geometries/configurations for each complex. The geometries are optimized using the RI-MP2 method and
+ the cc-pVTZ basis set. SAPT(0) calculations are performed for computing interaction energies and the various
+ components.
+
+ Usage:
+ ```python
+ from openqdc.datasets import Metcalf
+ dataset = Metcalf()
+ ```
+
+ Reference:
+ https://doi.org/10.1063/1.5142636
"""
__name__ = "metcalf"
diff --git a/openqdc/datasets/interaction/splinter.py b/openqdc/datasets/interaction/splinter.py
index bda10129..6ba3b4d5 100644
--- a/openqdc/datasets/interaction/splinter.py
+++ b/openqdc/datasets/interaction/splinter.py
@@ -12,13 +12,18 @@
class Splinter(BaseInteractionDataset):
"""
- A dataset of over 1.7 million protein-ligand
- interactions as described in the paper:
+ Splinter consists of 30,416 dimer pairs with over 1.5 million geometries. The geometries are generated
+ by quantum mechanical optimization at the B3LYP-D3/aug-cc-pV(D+d)Z level of theory. The interaction energies
+ and the various components are computed using the SAPT0/aug-cc-pV(D+d)Z method.
- A quantum chemical interaction energy dataset for accurately modeling protein-ligand interactions.
- Spronk, S.A., Glick, Z.L., Metcalf, D.P. et al.
- Sci Data 10, 619 (2023).
- https://doi.org/10.1038/s41597-023-02443-1
+ Usage:
+ ```python
+ from openqdc.datasets import Splinter
+ dataset = Splinter()
+ ```
+
+ Reference:
+ https://doi.org/10.1038/s41597-023-02443-1
"""
__energy_unit__ = "kcal/mol"
diff --git a/openqdc/datasets/interaction/x40.py b/openqdc/datasets/interaction/x40.py
index 64da5d87..d56a976d 100644
--- a/openqdc/datasets/interaction/x40.py
+++ b/openqdc/datasets/interaction/x40.py
@@ -8,16 +8,21 @@
class X40(YamlDataset):
"""
- X40 interaction dataset of 40 dimer pairs as
- introduced in the following paper:
-
- Benchmark Calculations of Noncovalent Interactions of Halogenated Molecules
- Jan Řezáč, Kevin E. Riley, and Pavel Hobza
- Journal of Chemical Theory and Computation 2012 8 (11), 4285-4292
- DOI: 10.1021/ct300647k
-
- Dataset retrieved and processed from:
- http://cuby4.molecular.cz/dataset_x40.html
+ X40 interaction dataset of 40 noncovalent complexes of organic halides, halohydrides, and halogen molecules
+ where the halogens participate in various interaction types such as electrostatic interactions, London
+ dispersion, hydrogen bonds, halogen bonding, halogen-pi interactions and stacking of halogenated aromatic
+ molecules. For each complex, 10 geometries are generated, resulting in 400 geometries in the dataset. The
+ geometries are optimized using the MP2 level of theory with the cc-pVTZ basis set, whereas the interaction
+ energies are computed at the CCSD(T)/CBS level of theory.
+
+ Usage:
+ ```python
+ from openqdc.datasets import X40
+ dataset = X40()
+ ```
+
+ Reference:
+ https://pubs.acs.org/doi/10.1021/ct300647k
"""
__name__ = "x40"
diff --git a/openqdc/datasets/io.py b/openqdc/datasets/io.py
index 1e621f72..7316768b 100644
--- a/openqdc/datasets/io.py
+++ b/openqdc/datasets/io.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-from typing import Callable, List, Optional
+from typing import Callable, Dict, List, Optional
import datamol as dm
import numpy as np
@@ -17,6 +17,8 @@ def try_retrieve(obj, callable, default=None):
class FromFileDataset(BaseDataset, ABC):
+ """Abstract class for datasets that read from a common format file like xzy, netcdf, gro, hdf5, etc."""
+
def __init__(
self,
path: List[str],
@@ -28,6 +30,7 @@ def __init__(
array_format: Optional[str] = "numpy",
level_of_theory: Optional[QmMethod] = None,
transform: Optional[Callable] = None,
+ skip_statistics: bool = False,
regressor_kwargs={
"solver_type": "linear",
"sub_sample": None,
@@ -35,18 +38,37 @@ def __init__(
},
):
"""
- Create a dataset from a xyz file.
+ Create a dataset from a list of files.
Parameters
----------
path : List[str]
The path to the file or a list of paths.
+ dataset_name : Optional[str], optional
+ The name of the dataset, by default None.
+ energy_type : Optional[str], optional
+ The type of isolated atom energy by default "regression".
+ Supported types: ["formation", "regression", "null", None]
+ energy_unit
+ Energy unit of the dataset. Default is "hartree".
+ distance_unit
+ Distance unit of the dataset. Default is "ang".
+ level_of_theory: Optional[QmMethod, str]
+ The level of theory of the dataset.
+ Used if energy_type is "formation" to fetch the correct isolated atom energies.
+ transform, optional
+ transformation to apply to the __getitem__ calls
+ regressor_kwargs
+ Dictionary of keyword arguments to pass to the regressor.
+ Default: {"solver_type": "linear", "sub_sample": None, "stride": 1}
+ solver_type can be one of ["linear", "ridge"]
"""
self.path = [path] if isinstance(path, str) else path
self.__name__ = self.__class__.__name__ if dataset_name is None else dataset_name
self.recompute_statistics = True
self.refit_e0s = True
self.energy_type = energy_type
+ self.skip_statistics = skip_statistics
self.__energy_unit__ = energy_unit
self._original_unit = self.energy_unit
self.__distance_unit__ = distance_unit
@@ -62,29 +84,19 @@ def __init__(
self.set_array_format(array_format)
self._post_init(True, energy_unit, distance_unit)
- def __str__(self):
- return self.__name__.lower()
-
- def __repr__(self):
- return str(self)
-
@abstractmethod
def read_as_atoms(self, path: str) -> List[Atoms]:
"""
- Method that reads a path and return a list of Atoms objects.
+ Method that reads a file and returns a list of Atoms objects.
+ path : str
+ The path to the file.
"""
raise NotImplementedError
- def collate_list(self, list_entries):
- res = {key: np.concatenate([r[key] for r in list_entries if r is not None], axis=0) for key in list_entries[0]}
- csum = np.cumsum(res.get("n_atoms"))
- x = np.zeros((csum.shape[0], 2), dtype=np.int32)
- x[1:, 0], x[:, 1] = csum[:-1], csum
- res["position_idx_range"] = x
-
- return res
-
- def read_raw_entries(self):
+ def read_raw_entries(self) -> List[Dict]:
+ """
+ Process the files and return a list of data objects.
+ """
entries_list = []
for path in self.path:
for entry in self.read_as_atoms(path):
@@ -96,6 +108,11 @@ def _read_and_preprocess(self):
self.data = self.collate_list(entries_list)
def _convert_to_record(self, obj: Atoms):
+ """
+ Convert an Atoms object to a record for the openQDC dataset processing.
+ obj : Atoms
+ The ase.Atoms object to convert
+ """
name = obj.info.get("name", None)
subset = obj.info.get("subset", str(self))
positions = obj.positions
@@ -116,8 +133,18 @@ def _convert_to_record(self, obj: Atoms):
n_atoms=np.array([len(positions)], dtype=np.int32),
)
+ def __str__(self):
+ return self.__name__.lower()
+
+ def __repr__(self):
+ return str(self)
+
class XYZDataset(FromFileDataset):
+ """
+ Base class to read datasets from xyz and extxyz files.
+ """
+
def read_as_atoms(self, path):
from ase.io import iread
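For reference, instantiating the `XYZDataset` base class on a local file might look as follows; the file path is a placeholder and `energy_type="null"` avoids the isolated-atom-energy regression:

```python
# Hypothetical use of XYZDataset on a local extxyz file.
from openqdc.datasets.io import XYZDataset

ds = XYZDataset(
    path=["./my_molecules.extxyz"],  # placeholder path
    dataset_name="my_xyz_dataset",
    energy_type="null",
    energy_unit="hartree",
    distance_unit="ang",
    skip_statistics=True,  # new flag threaded through in this patch
)
print(ds[0])
```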
diff --git a/openqdc/datasets/potential/__init__.py b/openqdc/datasets/potential/__init__.py
index b59fcad7..35721dde 100644
--- a/openqdc/datasets/potential/__init__.py
+++ b/openqdc/datasets/potential/__init__.py
@@ -1,6 +1,7 @@
+from .alchemy import Alchemy
from .ani import ANI1, ANI1CCX, ANI1CCX_V2, ANI1X, ANI2X
from .comp6 import COMP6
-from .dummy import Dummy
+from .dummy import Dummy, PredefinedDataset
from .gdml import GDML
from .geom import GEOM
from .iso_17 import ISO17
@@ -10,18 +11,23 @@
from .nabladft import NablaDFT
from .orbnet_denali import OrbnetDenali
from .pcqm import PCQM_B3LYP, PCQM_PM6
+from .proteinfragments import MDDataset, ProteinFragments
from .qm1b import QM1B, QM1B_SMALL
from .qm7x import QM7X, QM7X_V2
from .qmugs import QMugs, QMugs_V2
+from .qmx import QM7, QM8, QM9, QM7b
from .revmd17 import RevMD17
from .sn2_rxn import SN2RXN
from .solvated_peptides import SolvatedPeptides
from .spice import Spice, SpiceV2, SpiceVL2
from .tmqm import TMQM
from .transition1x import Transition1X
+from .vqm24 import VQM24
+from .waterclusters import SCANWaterClusters
from .waterclusters3_30 import WaterClusters
AVAILABLE_POTENTIAL_DATASETS = {
+ "Alchemy": Alchemy,
"ANI1": ANI1,
"ANI1CCX": ANI1CCX,
"ANI1CCX_V2": ANI1CCX_V2,
@@ -42,6 +48,10 @@
"QMugs_V2": QMugs_V2,
"QM1B": QM1B,
"QM1B_SMALL": QM1B_SMALL,
+ "QM7": QM7,
+ "QM7b": QM7b,
+ "QM8": QM8,
+ "QM9": QM9,
"SN2RXN": SN2RXN,
"SolvatedPeptides": SolvatedPeptides,
"Spice": Spice,
@@ -50,8 +60,12 @@
"TMQM": TMQM,
"Transition1X": Transition1X,
"WaterClusters": WaterClusters,
+ "SCANWaterClusters": SCANWaterClusters,
"MultixcQM9": MultixcQM9,
"MultixcQM9_V2": MultixcQM9_V2,
"RevMD17": RevMD17,
"MD22": MD22,
+ "VQM24": VQM24,
+ "ProteinFragments": ProteinFragments,
+ "MDDataset": MDDataset,
}
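Since the registry above is a plain dict, the newly registered datasets can be enumerated programmatically, e.g.:

```python
# Enumerate the potential-energy dataset registry extended above.
from openqdc.datasets.potential import AVAILABLE_POTENTIAL_DATASETS

for name, cls in sorted(AVAILABLE_POTENTIAL_DATASETS.items()):
    print(f"{name:>20} -> {cls.__name__}")
```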
diff --git a/openqdc/datasets/potential/alchemy.py b/openqdc/datasets/potential/alchemy.py
new file mode 100644
index 00000000..24c17cd9
--- /dev/null
+++ b/openqdc/datasets/potential/alchemy.py
@@ -0,0 +1,95 @@
+from os.path import join as p_join
+
+import datamol as dm
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+# ['gdb_idx', 'atom number', 'zpve\n(Ha, zero point vibrational energy)',
+#'Cv\n(cal/molK, heat capacity at 298.15 K)', 'gap\n(Ha, LUMO-HOMO)',
+# 'G\n(Ha, Free energy at 298.15 K)', 'HOMO\n(Ha, energy of HOMO)',
+# 'U\n(Ha, internal energy at 298.15 K)', 'alpha\n(a_0^3, Isotropic polarizability)',
+# 'U0\n(Ha, internal energy at 0 K)', 'H\n(Ha, enthalpy at 298.15 K)',
+# 'LUMO\n(Ha, energy of LUMO)', 'mu\n(D, dipole moment)',
+# 'R2\n(a_0^2, electronic spatial extent)']
+
+
+def read_mol(file, energy):
+ try:
+ mol = dm.read_sdf(file, remove_hs=False)[0]
+ positions = mol.GetConformer().GetPositions()
+ x = get_atomic_number_and_charge(mol)
+ n_atoms = positions.shape[0]
+ res = dict(
+ atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+ name=np.array([dm.to_smiles(mol)]),
+ energies=np.array([energy], dtype=np.float64)[:, None],
+ n_atoms=np.array([n_atoms], dtype=np.int32),
+ subset=np.array([f"atoms_{n_atoms}"]),
+ )
+
+ except Exception as e:
+ print(f"Skipping due to {e}")
+ res = None
+
+ return res
+
+
+# The B3LYP/6-31G(2df,p) model with the density fitting
+# approximation for electron repulsion integrals. The auxiliary basis is cc-pVDZ-jkfit.
+
+
+class Alchemy(BaseDataset):
+ """
+ Alchemy comprises 119,487 organic molecules with up to 14 heavy atoms, sampled from the GDB MedChem database.
+ Molecular properties are calculated using PySCF's implementation of the DFT Kohn-Sham method at the B3LYP level
+ with the basis set 6-31G(2df,p). The equilibrium geometry is optimized in three passes. First, OpenBabel is used
+ to parse SMILES string and build the Cartesian coordinates with MMFF94 force field optimization. Second, HF/STO3G
+ is used to generate the preliminary geometry. Third, for the final pass of geometry relaxation, the
+ B3LYP/6-31G(2df,p) model with the density fitting approximation for electron repulsion integrals is used. The
+ auxiliary basis cc-pVDZ-jkfit is employed in density fitting to build the Coulomb matrix and the HF exchange
+ matrix.
+
+ Usage:
+ ```python
+ from openqdc.datasets import Alchemy
+ dataset = Alchemy()
+ ```
+
+ Reference:
+ https://arxiv.org/abs/1906.09427\n
+ https://alchemy.tencent.com/
+ """
+
+ __name__ = "alchemy"
+
+ __energy_methods__ = [
+ PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)"
+ ]
+
+ energy_target_names = [
+ "ωB97x:6-31G(d) Energy",
+ ]
+
+ __energy_unit__ = "hartree"
+ __distance_unit__ = "ang"
+ __forces_unit__ = "hartree/ang"
+ __links__ = {"alchemy.zip": "https://alchemy.tencent.com/data/alchemy-v20191129.zip"}
+
+ def read_raw_entries(self):
+ dir_path = p_join(self.root, "Alchemy-v20191129")
+ full_csv = pd.read_csv(p_join(dir_path, "final_version.csv"))
+ energies = full_csv["U0\n(Ha, internal energy at 0 K)"].tolist()
+ atom_folder = full_csv["atom number"]
+ gdb_idx = full_csv["gdb_idx"]
+ idxs = full_csv.index.tolist()
+ samples = []
+ for i in tqdm(idxs):
+ sdf_file = p_join(dir_path, f"atom_{atom_folder[i]}", f"{gdb_idx[i]}.sdf")
+ energy = energies[i]
+ samples.append(read_mol(sdf_file, energy))
+ return samples
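The `read_mol` helper above returns a record dict, or `None` when parsing fails; a hypothetical standalone call (path and energy are placeholders):

```python
# Hypothetical direct use of the read_mol helper defined in alchemy.py.
from openqdc.datasets.potential.alchemy import read_mol

record = read_mol("Alchemy-v20191129/atom_9/1.sdf", energy=-40.4789)  # placeholders
if record is not None:
    # atomic_inputs packs [Z, charge, x, y, z] per atom -> shape (n_atoms, 5)
    print(record["atomic_inputs"].shape, record["subset"][0])
```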
diff --git a/openqdc/datasets/potential/ani.py b/openqdc/datasets/potential/ani.py
index bcff384f..aac35635 100644
--- a/openqdc/datasets/potential/ani.py
+++ b/openqdc/datasets/potential/ani.py
@@ -39,19 +39,22 @@ def extract_ani2_entries(properties):
class ANI1(BaseDataset):
"""
- The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small
- organic molecules with energy labels calculated using DFT. The molecules
- contain 4 distinct atoms, C, N, O and H.
-
- Usage
+ The ANI-1 dataset is a collection of 22 x 10^6 structural conformations from 57,000 distinct small organic
+ molecules. The molecules contain 4 distinct atoms, C, N, O and H. Electronic structure calculations use the
+ wB97x density functional and the 6-31G(d) basis set. To generate structures, SMILES strings for the molecules
+ are converted into 3D conformations using RDKit. These 3D structures are then pre-optimized to a stationary
+ point using the MMFF94 force field. Finally, geometries are optimized to energy minima using the chosen DFT
+ level.
+
+ Usage:
```python
from openqdc.datasets import ANI1
dataset = ANI1()
```
References:
- - ANI-1: https://www.nature.com/articles/sdata2017193
- - Github: https://github.com/aiqm/ANI1x_datasets
+ https://www.nature.com/articles/sdata2017193\n
+ https://github.com/aiqm/ANI1x_datasets
"""
__name__ = "ani1"
@@ -79,9 +82,6 @@ def config(self):
return dict(dataset_name="ani", links=self.__links__)
def __smiles_converter__(self, x):
- """util function to convert string to smiles: useful if the smiles is
- encoded in a different format than its display format
- """
return "-".join(x.decode("ascii").split("-")[:-1])
@property
@@ -96,64 +96,23 @@ def read_raw_entries(self):
return samples
-class ANI1CCX(ANI1):
- """
- ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset. The selected
- conformations are then labelled using a high accuracy CCSD(T)*/CBS method.
-
- Usage
- ```python
- from openqdc.datasets import ANI1CCX
- dataset = ANI1CCX()
- ```
-
- References:
- - ANI-1ccx: https://doi.org/10.1038/s41467-019-10827-4
- - Github: https://github.com/aiqm/ANI1x_datasets
- """
-
- __name__ = "ani1ccx"
- __energy_unit__ = "hartree"
- __distance_unit__ = "ang"
- __forces_unit__ = "hartree/ang"
-
- __energy_methods__ = [
- PotentialMethod.NONE, # "ccsd(t)/cbs",
- PotentialMethod.NONE, # "ccsd(t)/cc-pvdz",
- PotentialMethod.NONE, # "ccsd(t)/cc-pvtz",
- PotentialMethod.NONE, # "tccsd(t)/cc-pvdz",
- ]
-
- energy_target_names = [
- "CCSD(T)*:CBS Total Energy",
- "NPNO-CCSD(T):cc-pVDZ Correlation Energy",
- "NPNO-CCSD(T):cc-pVTZ Correlation Energy",
- "TPNO-CCSD(T):cc-pVDZ Correlation Energy",
- ]
- force_target_names = []
- __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}
-
- def __smiles_converter__(self, x):
- """util function to convert string to smiles: useful if the smiles is
- encoded in a different format than its display format
- """
- return x
-
-
class ANI1X(ANI1):
"""
The ANI-1X dataset consists of ANI-1 molecules + some molecules added using active learning, which leads to
- a total of 5,496,771 conformers with 63,865 unique molecules.
+ a total of 5,496,771 conformers with 63,865 unique molecules. Databases of molecules like GDB-11, ChEMBL,
+ generated amino acids and 2-amino acid peptides are used for sampling new molecules. Four techniques are
+ used for sampling conformations: (1) molecular dynamics, (2) normal mode sampling, (3) dimer sampling and
+ (4) torsion sampling.
- Usage
+ Usage:
```python
from openqdc.datasets import ANI1X
dataset = ANI1X()
```
References:
- - ANI-1x: https://doi.org/10.1063/1.5023802
- - Github: https://github.com/aiqm/ANI1x_datasets
+ https://doi.org/10.1063/1.5023802\n
+ https://github.com/aiqm/ANI1x_datasets
"""
__name__ = "ani1x"
@@ -162,14 +121,14 @@ class ANI1X(ANI1):
__forces_unit__ = "hartree/ang"
__energy_methods__ = [
- "hf/cc-pvdz",
- "hf/cc-pvqz",
- "hf/cc-pvtz",
- "mp2/cc-pvdz",
- "mp2/cc-pvqz",
- "mp2/cc-pvtz",
- "wb97x/6-31g(d)",
- "wb97x/cc-pvtz",
+ PotentialMethod.NONE, # "hf/cc-pvdz",
+ PotentialMethod.NONE, # "hf/cc-pvqz",
+ PotentialMethod.NONE, # "hf/cc-pvtz",
+ PotentialMethod.NONE, # "mp2/cc-pvdz",
+ PotentialMethod.NONE, # "mp2/cc-pvqz",
+ PotentialMethod.NONE, # "mp2/cc-pvtz",
+ PotentialMethod.NONE, # "wb97x/6-31g(d)",
+ PotentialMethod.NONE, # "wb97x/cc-pvtz",
]
energy_target_names = [
@@ -194,6 +153,47 @@ class ANI1X(ANI1):
def convert_forces(self, x):
return super().convert_forces(x) * 0.529177249 # correct the Dataset error
+ def __smiles_converter__(self, x):
+ return x
+
+
+class ANI1CCX(ANI1):
+ """
+ ANI1-CCX is a dataset of 500k conformers subsampled from the 5.5M conformers of ANI-1X dataset using active
+ learning. The conformations are labelled using a high accuracy CCSD(T)*/CBS method.
+
+ Usage:
+ ```python
+ from openqdc.datasets import ANI1CCX
+ dataset = ANI1CCX()
+ ```
+
+ References:
+ https://doi.org/10.1038/s41467-019-10827-4\n
+ https://github.com/aiqm/ANI1x_datasets
+ """
+
+ __name__ = "ani1ccx"
+ __energy_unit__ = "hartree"
+ __distance_unit__ = "ang"
+ __forces_unit__ = "hartree/ang"
+
+ __energy_methods__ = [
+ PotentialMethod.NONE, # "ccsd(t)/cbs",
+ PotentialMethod.NONE, # "ccsd(t)/cc-pvdz",
+ PotentialMethod.NONE, # "ccsd(t)/cc-pvtz",
+ PotentialMethod.NONE, # "tccsd(t)/cc-pvdz",
+ ]
+
+ energy_target_names = [
+ "CCSD(T)*:CBS Total Energy",
+ "NPNO-CCSD(T):cc-pVDZ Correlation Energy",
+ "NPNO-CCSD(T):cc-pVTZ Correlation Energy",
+ "TPNO-CCSD(T):cc-pVDZ Correlation Energy",
+ ]
+ force_target_names = []
+ __links__ = {"ani1x.hdf5.gz": "https://zenodo.org/record/4081694/files/292.hdf5.gz"}
+
def __smiles_converter__(self, x):
"""util function to convert string to smiles: useful if the smiles is
encoded in a different format than its display format
@@ -202,6 +202,21 @@ def __smiles_converter__(self, x):
class ANI1CCX_V2(ANI1CCX):
+ """
+ ANI1CCX_V2 is an extension of the ANI1CCX dataset with additional PM6 and GFN2_xTB labels
+ for each conformation.
+
+ Usage:
+ ```python
+ from openqdc.datasets import ANI1CCX_V2
+ dataset = ANI1CCX_V2()
+ ```
+
+ References:
+ https://doi.org/10.1038/s41467-019-10827-4\n
+ https://github.com/aiqm/ANI1x_datasets
+ """
+
__name__ = "ani1ccx_v2"
__energy_methods__ = ANI1CCX.__energy_methods__ + [PotentialMethod.PM6, PotentialMethod.GFN2_XTB]
@@ -211,19 +226,20 @@ class ANI1CCX_V2(ANI1CCX):
class ANI2X(ANI1):
"""
- The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL,
- and s66x8. It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k
- chemical isomers, optimized using the LBFGS algorithm and labeled with ωB97X/6-31G*.
+ The ANI-2X dataset was constructed using active learning from modified versions of GDB-11, CheMBL, and s66x8.
+ It adds three new elements (F, Cl, S) resulting in 4.6 million conformers from 13k chemical isomers, optimized
+ using the LBFGS algorithm and labeled with ωB97X/6-31G*. The same sampling techniques as in ANI-1X are
+ used for generating geometries.
- Usage
+ Usage:
```python
- from openqdc.datasets import ANI@X
+ from openqdc.datasets import ANI2X
dataset = ANI2X()
```
References:
- - ANI-2x: https://doi.org/10.1021/acs.jctc.0c00121
- - Github: https://github.com/aiqm/ANI1x_datasets
+ https://doi.org/10.1021/acs.jctc.0c00121\n
+ https://github.com/aiqm/ANI1x_datasets
"""
__name__ = "ani2x"
@@ -258,9 +274,6 @@ class ANI2X(ANI1):
}
def __smiles_converter__(self, x):
- """util function to convert string to smiles: useful if the smiles is
- encoded in a different format than its display format
- """
return x
def read_raw_entries(self):
diff --git a/openqdc/datasets/potential/comp6.py b/openqdc/datasets/potential/comp6.py
index d5998e0a..fe24825c 100644
--- a/openqdc/datasets/potential/comp6.py
+++ b/openqdc/datasets/potential/comp6.py
@@ -7,19 +7,43 @@
class COMP6(BaseDataset):
"""
- COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space
- developed for testing the ANI-1x potential. It is curated from 6 benchmark sets:
- S66x8, ANI Molecular Dynamics, GDB7to9, GDB10to13, DrugBank, and Tripeptides.
+ COMP6 is a benchmark suite consisting of broad regions of bio-chemical and organic space developed for testing the
+ ANI-1x potential. It is curated from 6 benchmark sets: S66x8, ANI-MD, GDB7to9, GDB10to13, DrugBank, and
+ Tripeptides. Energies and forces for all non-equilibrium molecular conformations are calculated using
+ the wB97x density functional with the 6-31G(d) basis set. The dataset also includes Hirshfield charges and
+ molecular dipoles.
- Usage
+ Details of the benchmark sets are as follows:
+ S66x8: Consists of 66 dimeric systems involving hydrogen bonding, pi-pi stacking, London interactions and
+ mixed influence interactions.\n
+ ANI Molecular Dynamics (ANI-MD): Forces from the ANI-1x potential are used for running 1ns vacuum molecular
+ dynamics with a 0.25fs time step at 300K using the Langevin thermostat for 14 well-known drug molecules and 2 small
+ proteins. A random subsample of 128 frames from each 1ns trajectory is selected, and reference DFT single point
+ calculations are performed to calculate energies and forces.\n
+ GDB7to9: Consists of 1500 molecules, 500 each with 7, 8 and 9 heavy atoms, subsampled from the GDB-11 dataset.
+ The initial structures are randomly embedded into 3D space using RDKit and are optimized with tight convergence
+ criteria. Normal modes/force constants are computed using the reference DFT model. Finally, diverse normal
+ mode sampling (DNMS) is carried out to generate non-equilibrium conformations.\n
+ GDB10to13: Consists of 3000 molecules: 500 molecules each with 10 and 11 heavy atoms subsampled from GDB-11,
+ and 1000 molecules each with 12 and 13 heavy atoms subsampled from GDB-13. Non-equilibrium conformations are
+ generated via DNMS.\n
+ Tripeptide: Consists of 248 random tripeptides. Structures are optimized similarly to GDB7to9.\n
+ DrugBank: Consists of 837 molecules subsampled from the original DrugBank database of real drug molecules.
+ Structures are optimized similarly to GDB7to9.
+
+ Usage:
```python
from openqdc.datasets import COMP6
dataset = COMP6()
```
References:
- - https://aip.scitation.org/doi/abs/10.1063/1.5023802
- - Github: https://github.com/isayev/COMP6
+ https://aip.scitation.org/doi/abs/10.1063/1.5023802\n
+ https://github.com/isayev/COMP6\n
+ S66x8: https://pubs.rsc.org/en/content/articlehtml/2016/cp/c6cp00688d\n
+ GDB-11: https://pubmed.ncbi.nlm.nih.gov/15674983/\n
+ GDB-13: https://pubmed.ncbi.nlm.nih.gov/19505099/\n
+ DrugBank: https://pubs.acs.org/doi/10.1021/ja902302h
"""
__name__ = "comp6"
diff --git a/openqdc/datasets/potential/gdml.py b/openqdc/datasets/potential/gdml.py
index 24f283e6..24c74754 100644
--- a/openqdc/datasets/potential/gdml.py
+++ b/openqdc/datasets/potential/gdml.py
@@ -8,25 +8,32 @@
class GDML(BaseDataset):
"""
Gradient Domain Machine Learning (GDML) is a dataset consisting of samples from ab initio
- molecular dynamics (AIMD) trajectories. The dataset consists of,
- - Benzene: 627000 samples
- - Uracil: 133000 samples
- - Naptalene: 326000 samples
- - Aspirin: 211000 samples
- - Salicylic Acid: 320000 samples
- - Malonaldehyde: 993000 samples
- - Ethanol: 555000 samples
- - Toluene: 100000 samples
+ molecular dynamics (AIMD) trajectories at a resolution of 0.5fs. Energy and force labels for
+ each conformation are computed using the PBE + vdW-TS electronic structure method.
- Usage
+ The dataset consists of the following trajectories:
+ Benzene: 627000 samples\n
+ Uracil: 133000 samples\n
+ Naphthalene: 326000 samples\n
+ Aspirin: 211000 samples\n
+ Salicylic Acid: 320000 samples\n
+ Malonaldehyde: 993000 samples\n
+ Ethanol: 555000 samples\n
+ Toluene: 100000 samples\n
+
+ Usage:
```python
from openqdc.datasets import GDML
dataset = GDML()
```
References:
- - https://www.science.org/doi/10.1126/sciadv.1603015
- - http://www.sgdml.org/#datasets
+ https://www.science.org/doi/10.1126/sciadv.1603015\n
+ http://www.sgdml.org/#datasets
"""
__name__ = "gdml"
diff --git a/openqdc/datasets/potential/geom.py b/openqdc/datasets/potential/geom.py
index d07a3d93..7c86b1e3 100644
--- a/openqdc/datasets/potential/geom.py
+++ b/openqdc/datasets/potential/geom.py
@@ -61,9 +61,11 @@ def read_mol(mol_id: str, mol_dict, base_path: str, partition: str) -> Dict[str,
class GEOM(BaseDataset):
"""
- The Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
- from QM9, and 317,000 molecules with experimental data related to biophysics, physiology,
- and physical chemistry. The dataset is generated using the GFN2-xTB semi-empirical method.
+ Geometric Ensemble Of Molecules (GEOM) dataset contains 37 million conformers for 133,000 molecules
+ from QM9, and 317,000 molecules with experimental data related to biophysics, physiology, and physical chemistry.
+ For each molecule, the initial structure is generated with RDKit and optimized with the GFN2-xTB energy method,
+ and the lowest energy conformer is fed to the CREST software, which uses metadynamics to explore the
+ conformational space of each molecule. Energies in the dataset are computed using the semi-empirical GFN2-xTB method.
Usage:
```python
@@ -72,8 +74,9 @@ class GEOM(BaseDataset):
```
References:
- - https://www.nature.com/articles/s41597-022-01288-4
- - https://github.com/learningmatter-mit/geom
+ https://www.nature.com/articles/s41597-022-01288-4\n
+ https://github.com/learningmatter-mit/geom\n
+ CREST Software: https://pubs.rsc.org/en/content/articlelanding/2020/cp/c9cp06869d
"""
__name__ = "geom"
diff --git a/openqdc/datasets/potential/iso_17.py b/openqdc/datasets/potential/iso_17.py
index fe6aab5c..5672650b 100644
--- a/openqdc/datasets/potential/iso_17.py
+++ b/openqdc/datasets/potential/iso_17.py
@@ -7,11 +7,12 @@
class ISO17(BaseDataset):
"""
- ISO17 dataset consists of the largest set of isomers from the QM9 dataset that consists of a fixed
- composition of atoms (C7O2H10) arranged in different chemically valid structures. It consist
- of 129 molecules, each containing 5,000 conformational geometries, energies and forces with a resolution
- of 1 femtosecond in the molecular dynamics trajectories. The simulations were carried out using the
- Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der Waals correction method.
+ ISO17 dataset consists of the largest set of isomers from the QM9 dataset sharing a fixed composition of
+ atoms (C7O2H10) arranged in different chemically valid structures. It consists of 129 molecules, each containing
+ 5,000 conformational geometries, energies and forces with a resolution of 1 fs in the molecular dynamics
+ trajectories. The simulations were carried out using density functional theory (DFT) in the generalized gradient
+ approximation (GGA) with the Perdew-Burke-Ernzerhof (PBE) functional and the Tkatchenko-Scheffler (TS) van der
+ Waals correction method.
Usage:
```python
@@ -20,7 +21,11 @@ class ISO17(BaseDataset):
```
References:
- - https://paperswithcode.com/dataset/iso17
+ https://arxiv.org/abs/1706.08566\n
+ https://arxiv.org/abs/1609.08259\n
+ https://www.nature.com/articles/sdata201422\n
+ https://pubmed.ncbi.nlm.nih.gov/10062328/\n
+ https://pubmed.ncbi.nlm.nih.gov/19257665/
"""
__name__ = "iso_17"
diff --git a/openqdc/datasets/potential/md22.py b/openqdc/datasets/potential/md22.py
index b9976426..0eb4a72c 100644
--- a/openqdc/datasets/potential/md22.py
+++ b/openqdc/datasets/potential/md22.py
@@ -40,6 +40,22 @@ def create_path(filename, root):
class MD22(RevMD17):
+ """
+ MD22 consists of molecular dynamics (MD) trajectories of four major classes of biomolecules and supramolecules,
+ ranging from a small peptide with 42 atoms to a double-walled nanotube with 370 atoms. The simulation trajectories
+ are sampled at 400 K and 500 K with a resolution of 1 fs. Potential energies and forces are computed at the
+ PBE+MBD level of theory.
+
+ Usage:
+ ```python
+ from openqdc.datasets import MD22
+ dataset = MD22()
+ ```
+
+ Reference:
+ https://arxiv.org/abs/2209.14865
+ """
+
__name__ = "md22"
__links__ = {
f"{x}.npz": f"http://www.quantum-machine.org/gdml/repo/datasets/md22_{x}.npz"
diff --git a/openqdc/datasets/potential/molecule3d.py b/openqdc/datasets/potential/molecule3d.py
index ec1dbd00..fa4f4683 100644
--- a/openqdc/datasets/potential/molecule3d.py
+++ b/openqdc/datasets/potential/molecule3d.py
@@ -67,9 +67,10 @@ def _read_sdf(sdf_path: str, properties_path: str) -> List[Dict[str, np.ndarray]
class Molecule3D(BaseDataset):
"""
- Molecule3D dataset consists of 3,899,647 molecules with ground state geometries and energies
- calculated at the B3LYP/6-31G* level of theory. The molecules are extracted from the
- PubChem database and cleaned by removing invalid molecule files.
+ Molecule3D dataset consists of 3,899,647 molecules with equilibrium geometries and energies calculated at the
+ B3LYP/6-31G* level of theory. The molecules are extracted from the PubChem database and cleaned by removing
+ molecules with invalid molecule files, SMILES conversion errors, RDKit warnings, sanitization problems,
+ or damaged log files.
Usage:
```python
@@ -78,8 +79,8 @@ class Molecule3D(BaseDataset):
```
References:
- - https://arxiv.org/abs/2110.01717
- - https://github.com/divelab/MoleculeX
+ https://arxiv.org/abs/2110.01717\n
+ https://github.com/divelab/MoleculeX
"""
__name__ = "molecule3d"
diff --git a/openqdc/datasets/potential/multixcqm9.py b/openqdc/datasets/potential/multixcqm9.py
index 70dab1ea..41d7a4dc 100644
--- a/openqdc/datasets/potential/multixcqm9.py
+++ b/openqdc/datasets/potential/multixcqm9.py
@@ -37,20 +37,21 @@ def read_xyz_files(folder_path):
class MultixcQM9(BaseDataset):
"""
- MultixcQM9 is a dataset of molecular and reaction energies from
- multi-level quantum chemical methods consisting of 133 K QM9 molecules
- calculated with 76 different DFT functionals and three different basis sets
- (228 energy numbers for each molecule) + 1 GFN2-XTB calculation.
+ MultixcQM9 is a dataset of molecular and reaction energies from multi-level quantum chemical methods, consisting
+ of the geometries of 133K QM9 molecules calculated with 76 different DFT functionals and three different basis
+ sets, resulting in 228 energy values per molecule, plus a semi-empirical GFN2-xTB calculation. Geometries are
+ taken directly from Kim et al., which uses the G4MP2 method.
Usage:
```python
- from openqdc.datasets import NablaDFT
- dataset = NablaDFT()
+ from openqdc.datasets import MultixcQM9
+ dataset = MultixcQM9()
```
References:
- - https://www.nature.com/articles/s41597-023-02690-2
- - https://github.com/chemsurajit/largeDFTdata
+ https://www.nature.com/articles/s41597-023-02690-2\n
+ https://github.com/chemsurajit/largeDFTdata\n
+ https://www.nature.com/articles/s41597-019-0121-7
"""
__name__ = "multixcqm9"
diff --git a/openqdc/datasets/potential/nabladft.py b/openqdc/datasets/potential/nabladft.py
index 4700ade5..f83f1c00 100644
--- a/openqdc/datasets/potential/nabladft.py
+++ b/openqdc/datasets/potential/nabladft.py
@@ -52,7 +52,11 @@ class NablaDFT(BaseDataset):
"""
NablaDFT is a dataset constructed from a subset of the
[Molecular Sets (MOSES) dataset](https://github.com/molecularsets/moses) consisting of 1 million molecules
- with 5,340,152 unique conformations generated using ωB97X-D/def2-SVP level of theory.
+ with 5,340,152 unique conformations. Conformations for each molecule are generated in 2 steps. First, a set of
+ conformations is generated using RDKit. Second, the conformations are grouped with the Butina clustering method,
+ clusters that cover 95% of the conformations are kept, and the cluster centroids form the final set. This
+ results in 1-62 conformations per molecule. Quantum properties are generated with the Kohn-Sham method at the
+ ωB97X-D/def2-SVP level of theory.
Usage:
```python
@@ -61,8 +65,8 @@ class NablaDFT(BaseDataset):
```
References:
- - https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D
- - https://github.com/AIRI-Institute/nablaDFT
+ https://pubs.rsc.org/en/content/articlelanding/2022/CP/D2CP03966D\n
+ https://github.com/AIRI-Institute/nablaDFT
"""
__name__ = "nabladft"
@@ -76,6 +80,15 @@ class NablaDFT(BaseDataset):
__forces_unit__ = "hartree/bohr"
__links__ = {"nabladft.db": "https://n-usr-31b1j.s3pd12.sbercloud.ru/b-usr-31b1j-qz9/data/moses_db/dataset_full.db"}
+ @property
+ def data_types(self):
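+ # Override the default dtypes; energies and forces are stored as float32 here
+ # (presumably to keep the preprocessed archive compact).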
+ return {
+ "atomic_inputs": np.float32,
+ "position_idx_range": np.int32,
+ "energies": np.float32,
+ "forces": np.float32,
+ }
+
@requires_package("nablaDFT")
def read_raw_entries(self):
from nablaDFT.dataset import HamiltonianDatabase
diff --git a/openqdc/datasets/potential/orbnet_denali.py b/openqdc/datasets/potential/orbnet_denali.py
index 6a7c3f47..1dd70468 100644
--- a/openqdc/datasets/potential/orbnet_denali.py
+++ b/openqdc/datasets/potential/orbnet_denali.py
@@ -36,10 +36,14 @@ def read_archive(mol_id, conf_dict, base_path, energy_target_names: List[str]) -
class OrbnetDenali(BaseDataset):
"""
- Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. It performs
- DFT (ωB97X-D3/def2-TZVP) calculations on molecules and geometries consisting of organic molecules
- and chemistries, with protonation and tautomeric states, non-covalent interactions, common salts,
- and counterions, spanning the most common elements in bio and organic chemistry.
+ Orbnet Denali is a collection of 2.3 million conformers from 212,905 unique molecules. Molecules include a range
+ of organic molecules with protonation and tautomeric states, non-covalent interactions, common salts, and
+ counterions, spanning the most common elements in bio and organic chemistry. Geometries are generated in 2 steps.
+ First, four energy-minimized conformations are generated for each molecule using the ENTOS BREEZE conformer
+ generator. Second, starting from these conformers, non-equilibrium geometries are generated using normal-mode
+ sampling at 300 K or ab initio molecular dynamics (AIMD) for 200 fs at 500 K, both at the GFN1-xTB level of
+ theory. Energies are calculated with the DFT method ωB97X-D3/def2-TZVP and the semi-empirical method GFN1-xTB.
Usage:
```python
@@ -48,8 +52,8 @@ class OrbnetDenali(BaseDataset):
```
References:
- - https://arxiv.org/pdf/2107.00299.pdf
- - https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
+ https://arxiv.org/abs/2107.00299\n
+ https://figshare.com/articles/dataset/OrbNet_Denali_Training_Data/14883867
"""
__name__ = "orbnet_denali"
@@ -74,13 +78,6 @@ def read_raw_entries(self):
for mol_id, group in df.groupby("mol_id")
}
- # print(df.head())
- # tmp = df.to_dict('index')
- # for i, k in enumerate(tmp):
- # print(k, tmp[k])
- # if i > 10:
- # break
- # exit()
fn = lambda x: read_archive(x[0], x[1], self.root, self.energy_target_names)
res = dm.parallelized(fn, list(labels.items()), scheduler="threads", n_jobs=-1, progress=True)
samples = sum(res, [])
diff --git a/openqdc/datasets/potential/pcqm.py b/openqdc/datasets/potential/pcqm.py
index 535b90dc..cd32b838 100644
--- a/openqdc/datasets/potential/pcqm.py
+++ b/openqdc/datasets/potential/pcqm.py
@@ -66,6 +66,23 @@ def read_preprocessed_archive(path):
class PCQM_PM6(BaseDataset):
+ """
+ PubChemQC PM6 (PCQM_PM6) is an exhaustive dataset containing 221 million organic molecules with optimized
+ molecular geometries and electronic properties. To generate the dataset, only molecules with weights less
+ than 1000 g/mol are considered from the PubChem FTP site. The initial structure is generated using OpenBabel
+ and then optimized with the semi-empirical method PM6, which is also used to compute the energies.
+
+ Usage:
+ ```python
+ from openqdc.datasets import PCQM_PM6
+ dataset = PCQM_PM6()
+ ```
+
+ References:
+ https://pubs.acs.org/doi/abs/10.1021/acs.jcim.0c00740
+ """
+
__name__ = "pubchemqc_pm6"
__energy_methods__ = [PotentialMethod.PM6]
@@ -93,6 +110,15 @@ def collate_list(self, list_entries):
res = None
return res
+ @property
+ def data_types(self):
+ return {
+ "atomic_inputs": np.float32,
+ "position_idx_range": np.int32,
+ "energies": np.float32,
+ "forces": np.float32,
+ }
+
def read_raw_entries(self):
arxiv_paths = glob(p_join(self.root, f"{self.__energy_methods__[0]}", "*.pkl"))
f = lambda x: self.collate_list(read_preprocessed_archive(x))
@@ -150,6 +176,21 @@ def collate_and_save_list(self, list_entries):
class PCQM_B3LYP(PCQM_PM6):
+ """
+ PubChemQC B3LYP/6-31G* (PCQM_B3LYP) comprises 85 million molecules ranging from essential compounds to
+ biomolecules. The geometries are optimized using PM6. Using the optimized geometries,
+ the electronic structure and properties are calculated with the B3LYP/6-31G* method.
+
+ Usage:
+ ```python
+ from openqdc.datasets import PCQM_B3LYP
+ dataset = PCQM_B3LYP()
+ ```
+
+ References:
+ https://arxiv.org/abs/2305.18454
+ """
+
__name__ = "pubchemqc_b3lyp"
__energy_methods__ = ["b3lyp/6-31g*"]
energy_target_names = ["b3lyp"]
diff --git a/openqdc/datasets/potential/proteinfragments.py b/openqdc/datasets/potential/proteinfragments.py
new file mode 100644
index 00000000..d6289750
--- /dev/null
+++ b/openqdc/datasets/potential/proteinfragments.py
@@ -0,0 +1,192 @@
+import os
+from os.path import join as p_join
+
+import numpy as np
+from tqdm import tqdm
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.package_utils import requires_package
+
+
+def convert_entries(r, e, f, z, subset):
+ coordinates = r
+ species = z
+ forces = f
+ energies = e
+ n_atoms = coordinates.shape[0]
+ flattened_coordinates = coordinates[:].reshape((-1, 3))
+ xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+ res = dict(
+ name=np.array([subset]),
+ subset=np.array([subset]),
+ energies=energies[:].reshape((-1, 1)).astype(np.float64),
+ atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+ n_atoms=np.array([n_atoms], dtype=np.int32),
+ forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+ )
+ return res
+
+
+@requires_package("apsw")
+def read_db(path):
+ database = Database(path)
+ subset = os.path.basename(path).split(".")[0]
+ # Iterate over all entries in the database and convert them to openQDC sample dicts.
+ n = len(database)
+ entries = []
+ for entry in tqdm(range(n)):
+ q, s, z, r, e, f, d = database[entry]
+ entries.append(convert_entries(r, e, f, z, subset))
+ return entries
+
+ # Each database entry unpacks to:
+ #   q: total charge, s: number of unpaired electrons, z: atomic numbers,
+ #   r: positions [Å], e: energy [eV], f: forces [eV/Å], d: dipole [e*Å]
+
+
+class Database:
+ @requires_package("apsw")
+ def __init__(self, filename):
+ import apsw
+
+ self.cursor = apsw.Connection(filename, flags=apsw.SQLITE_OPEN_READONLY).cursor()
+
+ def __len__(self):
+ return self.cursor.execute("""SELECT * FROM metadata WHERE id=1""").fetchone()[-1]
+
+ def __getitem__(self, idx):
+ data = self.cursor.execute("""SELECT * FROM data WHERE id=""" + str(idx)).fetchone()
+ return self._unpack_data_tuple(data)
+
+ def _deblob(self, buffer, dtype, shape=None):
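+ # Reconstruct a numpy array from a raw SQLite BLOB; byteswap on big-endian hosts.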
+ array = np.frombuffer(buffer, dtype)
+ if not np.little_endian:
+ array = array.byteswap()
+ array.shape = shape
+ return np.copy(array)
+
+ def _unpack_data_tuple(self, data):
+ n = len(data[3]) // 4 # A single int32 is 4 bytes long.
+ q = np.asarray([0.0 if data[1] is None else data[1]], dtype=np.float32)
+ s = np.asarray([0.0 if data[2] is None else data[2]], dtype=np.float32)
+ z = self._deblob(data[3], dtype=np.int32, shape=(n,))
+ r = self._deblob(data[4], dtype=np.float32, shape=(n, 3))
+ e = np.asarray([0.0 if data[5] is None else data[5]], dtype=np.float32)
+ f = self._deblob(data[6], dtype=np.float32, shape=(n, 3))
+ d = self._deblob(data[7], dtype=np.float32, shape=(1, 3))
+ return q, s, z, r, e, f, d
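+
+ # Illustrative usage (assumes a downloaded fragments .db file):
+ #   db = Database("general_protein_fragments.db")
+ #   q, s, z, r, e, f, d = db[0]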
+
+
+class ProteinFragments(BaseDataset):
+ """
+ ProteinFragments is a dataset of protein and protein-fragment geometries;
+ the data was generated with a combined top-down and bottom-up approach:
+
+ Top-down:
+ Fragments are generated by cutting out a spherical
+ region around an atom (including solvent molecules)
+ and saturating all dangling bonds.
+ Sampling was done with the molecular dynamics (MD) method using
+ a conventional force field (FF) at room temperature.
+
+ Bottom-up:
+ Fragments are generated by constructing chemical graphs
+ of one to eight non-hydrogen atoms.
+ Sampling of multiple conformers per fragment was done with
+ MD simulations at high temperatures or normal-mode sampling.
+
+
+ Usage:
+ ```python
+ from openqdc.datasets import ProteinFragments
+ dataset = ProteinFragments()
+ ```
+
+ References:
+ https://www.science.org/doi/10.1126/sciadv.adn4397
+ """
+
+ __name__ = "proteinfragments"
+ # PBE0/def2-TZVPP+MBD
+ __energy_methods__ = [
+ PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)"
+ ]
+
+ energy_target_names = [
+ "ωB97x:6-31G(d) Energy",
+ ]
+ # PBE0/def2-TZVPP+MBD
+ __energy_unit__ = "ev"
+ __distance_unit__ = "ang"
+ __forces_unit__ = "ev/ang"
+ __links__ = {
+ f"{name}.db": f"https://zenodo.org/records/10720941/files/{name}.db?download=1"
+ for name in ["general_protein_fragments"]
+ }
+
+ @property
+ def root(self):
+ return p_join(get_local_cache(), "proteinfragments")
+
+ @property
+ def config(self):
+ assert len(self.__links__) > 0, "No links provided for fetching"
+ return dict(dataset_name="proteinfragments", links=self.__links__)
+
+ @property
+ def preprocess_path(self):
+ path = p_join(self.root, "preprocessed", self.__name__)
+ os.makedirs(path, exist_ok=True)
+ return path
+
+ def read_raw_entries(self):
+ samples = []
+ for name in self.__links__:
+ raw_path = p_join(self.root, f"{name}")
+ samples.extend(read_db(raw_path))
+ return samples
+
+
+class MDDataset(ProteinFragments):
+ """
+ MDDataset is a subset of the ProteinFragments dataset
+ generated from molecular dynamics simulations with the authors' model.
+ The sampling was done with molecular dynamics
+ at room temperature (300 K) in various solvent phases:
+
+ Subsets:
+ Polyalanine:
+ All polyalanine peptides are sampled in the gas phase. AceAla15Lys is
+ a polyalanine peptide capped with an N-terminal acetyl group
+ and a protonated lysine residue at the C-terminus;
+ AceAla15Nme is a polyalanine peptide capped with an N-terminal acetyl group
+ and a C-terminal N-methyl amide group\n
+ Crambin: 46-residue protein crambin in aqueous solution (25,257 atoms)
+
+ Usage:
+ ```python
+ from openqdc.datasets import MDDataset
+ dataset = MDDataset()
+ ```
+
+ References:
+ https://www.science.org/doi/10.1126/sciadv.adn4397
+ """
+
+ __name__ = "mddataset"
+
+ __links__ = {
+ f"{name}.db": f"https://zenodo.org/records/10720941/files/{name}.db?download=1"
+ for name in ["acala15nme_folding_clusters", "crambin", "minimahopping_acala15lysh", "minimahopping_acala15nme"]
+ }
diff --git a/openqdc/datasets/potential/qm1b.py b/openqdc/datasets/potential/qm1b.py
index 5e10ed23..edccae0d 100644
--- a/openqdc/datasets/potential/qm1b.py
+++ b/openqdc/datasets/potential/qm1b.py
@@ -78,11 +78,11 @@ def extract_from_row(row, file_idx=None):
class QM1B(BaseDataset):
"""
- QM1B is a low-resolution DFT dataset generated using PySCF IPU.
- It is composed of one billion training examples containing 9-11 heavy atoms.
- It was created by taking 1.09M SMILES strings from the GDB-11 database and
- computing molecular properties (e.g. HOMO-LUMO gap) for a set of up to 1000
- conformers per molecule at the B3LYP/STO-3G level of theory.
+ QM1B is a dataset containing 1 billion conformations for 1.09M small molecules generated using a custom
+ PySCF library that incorporates hardware acceleration via IPUs. The molecules contain 9-11 heavy atoms and are
+ subsampled from the GDB-11 database. For each molecule, up to 1000 geometries are generated using RDKit.
+ Electronic properties for each conformation are then calculated using the density functional B3LYP
+ and the STO-3G basis set.
Usage:
```python
@@ -91,8 +91,8 @@ class QM1B(BaseDataset):
```
References:
- - https://arxiv.org/pdf/2311.01135
- - https://github.com/graphcore-research/qm1b-dataset/
+ https://arxiv.org/pdf/2311.01135\n
+ https://github.com/graphcore-research/qm1b-dataset/
"""
__name__ = "qm1b"
@@ -144,8 +144,7 @@ def extract_parallel(df, i):
class QM1B_SMALL(QM1B):
"""
- QM1B_SMALL is a subset of the QM1B dataset containing a
- maximum of 15 random conformers per molecule.
+ QM1B_SMALL is a subset of the QM1B dataset containing a maximum of 15 random conformers per molecule.
Usage:
```python
diff --git a/openqdc/datasets/potential/qm7x.py b/openqdc/datasets/potential/qm7x.py
index 7bf1323c..351162c0 100644
--- a/openqdc/datasets/potential/qm7x.py
+++ b/openqdc/datasets/potential/qm7x.py
@@ -35,12 +35,15 @@ def read_mol(mol_h5, mol_name, energy_target_names, force_target_names):
class QM7X(BaseDataset):
"""
- QM7X is a collection of almost 4.2 million conformers from 6,950 unique molecules. It contains DFT
- energy and force labels at the PBE0+MBD level of theory. It consists of structures for molecules with
- up to seven heavy (C, N, O, S, Cl) atoms from the GDB13 database. For each molecule, (meta-)stable
- equilibrium structures including constitutional/structural isomers and stereoisomers are
- searched using density-functional tight binding (DFTB). Then, for each (meta-)stable structure, 100
- off-equilibrium structures are obtained and labeled with PBE0+MBD.
+ QM7X is a collection of almost 4.2 million conformers from 6,950 unique organic molecules. Molecules with up to
+ seven heavy (C, N, O, S, Cl) atoms are taken from the GDB13 database. To generate conformations, OpenBabel is
+ used to produce an initial structure with the MMFF94 force field. From the initial structure, metastable
+ conformational isomers are generated using the Confab tool, again with the MMFF94 force field. Each structure
+ is then re-optimized with density-functional tight binding (DFTB) supplemented with many-body dispersion (MBD)
+ interactions, and the lowest-energy structure is taken as the final equilibrium conformer. Additionally,
+ non-equilibrium conformations are generated by displacing the equilibrium geometry along a linear combination
+ of normal-mode coordinates computed at the DFTB3+MBD level within the harmonic approximation. The dataset
+ provides energy values for each geometry computed with the PBE0+MBD and DFTB3+MBD methods.
Usage:
```python
@@ -49,8 +52,8 @@ class QM7X(BaseDataset):
```
References:
- - https://arxiv.org/abs/2006.15139
- - https://zenodo.org/records/4288677
+ https://arxiv.org/abs/2006.15139\n
+ https://zenodo.org/records/4288677
"""
__name__ = "qm7x"
@@ -59,9 +62,9 @@ class QM7X(BaseDataset):
energy_target_names = ["ePBE0+MBD", "eDFTB+MBD"]
- __force_mask__ = [True, True]
+ __force_mask__ = [True, False]
- force_target_names = ["pbe0FOR", "vdwFOR"]
+ force_target_names = ["pbe0FOR"]
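+ # Only PBE0+MBD forces are exposed; the DFTB3+MBD method carries no force labels here.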
__energy_unit__ = "ev"
__distance_unit__ = "ang"
@@ -81,6 +84,16 @@ def read_raw_entries(self):
class QM7X_V2(QM7X):
+ """
+ QM7X_V2 is an extension of the QM7X dataset containing PM6 labels for each of the 4.2M geometries.
+
+ Usage:
+ ```python
+ from openqdc.datasets import QM7X_V2
+ dataset = QM7X_V2()
+ ```
+ """
+
__name__ = "qm7x_v2"
__energy_methods__ = QM7X.__energy_methods__ + [PotentialMethod.PM6]
__force_mask__ = QM7X.__force_mask__ + [False]
diff --git a/openqdc/datasets/potential/qmugs.py b/openqdc/datasets/potential/qmugs.py
index 6cc38900..b819b214 100644
--- a/openqdc/datasets/potential/qmugs.py
+++ b/openqdc/datasets/potential/qmugs.py
@@ -38,8 +38,9 @@ def read_mol(mol_dir):
class QMugs(BaseDataset):
"""
The QMugs dataset contains 2 million conformers for 665k biologically and pharmacologically relevant molecules
- extracted from the ChEMBL database. The atomic and molecular properties are calculated using both,
- semi-empirical methods (GFN2-xTB) and DFT method (ωB97X-D/def2-SVP).
+ extracted from the ChEMBL database. Three geometries per molecule are generated and optimized using the GFN2-xTB
+ method. Using the optimized geometries, the atomic and molecular properties are calculated using both the
+ semi-empirical method GFN2-xTB and the DFT method ωB97X-D/def2-SVP.
Usage:
```python
@@ -48,8 +49,9 @@ class QMugs(BaseDataset):
```
References:
- - https://www.nature.com/articles/s41597-022-01390-7#ethics
- - https://www.research-collection.ethz.ch/handle/20.500.11850/482129
+ https://arxiv.org/abs/2107.00367\n
+ https://www.nature.com/articles/s41597-022-01390-7#ethics\n
+ https://www.research-collection.ethz.ch/handle/20.500.11850/482129
"""
__name__ = "qmugs"
@@ -76,6 +78,16 @@ def read_raw_entries(self):
class QMugs_V2(QMugs):
+ """
+ QMugs_V2 is an extension of the QMugs dataset containing PM6 labels for each of the 2M geometries.
+
+ Usage:
+ ```python
+ from openqdc.datasets import QMugs_V2
+ dataset = QMugs_V2()
+ ```
+ """
+
__name__ = "qmugs_v2"
__energy_methods__ = QMugs.__energy_methods__ + [PotentialMethod.PM6]
energy_target_names = QMugs.energy_target_names + ["PM6"]
diff --git a/openqdc/datasets/potential/qmx.py b/openqdc/datasets/potential/qmx.py
new file mode 100644
index 00000000..2dfb8443
--- /dev/null
+++ b/openqdc/datasets/potential/qmx.py
@@ -0,0 +1,402 @@
+import os
+from abc import ABC
+from os.path import join as p_join
+
+import datamol as dm
+import numpy as np
+import pandas as pd
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils import read_qc_archive_h5
+from openqdc.utils.io import get_local_cache
+from openqdc.utils.molecule import get_atomic_number_and_charge
+
+
+def extract_ani2_entries(properties):
+ coordinates = properties["coordinates"]
+ species = properties["species"]
+ forces = properties["forces"]
+ energies = properties["energies"]
+ n_atoms = coordinates.shape[1]
+ n_entries = coordinates.shape[0]
+ flattened_coordinates = coordinates[:].reshape((-1, 3))
+ xs = np.stack((species[:].flatten(), np.zeros(flattened_coordinates.shape[0])), axis=-1)
+ res = dict(
+ name=np.array(["ANI2"] * n_entries),
+ subset=np.array([str(n_atoms)] * n_entries),
+ energies=energies[:].reshape((-1, 1)).astype(np.float64),
+ atomic_inputs=np.concatenate((xs, flattened_coordinates), axis=-1, dtype=np.float32),
+ n_atoms=np.array([n_atoms] * n_entries, dtype=np.int32),
+ forces=forces[:].reshape(-1, 3, 1).astype(np.float32),
+ )
+ return res
+
+
+class QMX(ABC, BaseDataset):
+ """
+ Abstract base class shared by the QM7, QM7b, QM8 and QM9 datasets.
+ """
+
+ __name__ = "qm9"
+
+ __energy_methods__ = [
+ PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)"
+ ]
+
+ energy_target_names = [
+ "ωB97x:6-31G(d) Energy",
+ ]
+
+ __energy_unit__ = "hartree"
+ __distance_unit__ = "bohr"
+ __forces_unit__ = "hartree/bohr"
+ __links__ = {}
+
+ @property
+ def root(self):
+ return p_join(get_local_cache(), "qmx")
+
+ @property
+ def preprocess_path(self):
+ path = p_join(self.root, "preprocessed", self.__name__)
+ os.makedirs(path, exist_ok=True)
+ return path
+
+ @property
+ def config(self):
+ assert len(self.__links__) > 0, "No links provided for fetching"
+ return dict(dataset_name="qmx", links=self.__links__)
+
+ def read_raw_entries(self):
+ raw_path = p_join(self.root, f"{self.__name__}.h5.gz")
+ samples = read_qc_archive_h5(raw_path, self.__name__, self.energy_target_names, None)
+ return samples
+
+
+# ['smiles', 'E1-CC2', 'E2-CC2', 'f1-CC2', 'f2-CC2', 'E1-PBE0', 'E2-PBE0', 'f1-PBE0', 'f2-PBE0',
+# 'E1-PBE0.1', 'E2-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1', 'E1-CAM', 'E2-CAM', 'f1-CAM', 'f2-CAM']
+class QM7(QMX):
+ """
+ QM7 is a dataset constructed from a subset of the GDB-13 database
+ (stable and synthetically accessible organic molecules)
+ containing up to seven “heavy” atoms.
+ The molecular conformations are optimized using DFT at the
+ PBE0/def2-TZVP level of theory.
+
+ Chemical species:
+ [C, N, O, S, H]
+
+ Usage:
+ ```python
+ from openqdc.datasets import QM7
+ dataset = QM7()
+ ```
+
+ References:
+ https://arxiv.org/pdf/1703.00564
+ """
+
+ __links__ = {"qm7.hdf5.gz": "https://zenodo.org/record/3588337/files/150.hdf5.gz?download=1"}
+ __name__ = "qm7"
+
+ energy_target_names = [
+ "B2PLYP-D3(BJ):aug-cc-pvdz",
+ "B2PLYP-D3(BJ):aug-cc-pvtz",
+ "B2PLYP-D3(BJ):def2-svp",
+ "B2PLYP-D3(BJ):def2-tzvp",
+ "B2PLYP-D3(BJ):sto-3g",
+ "B2PLYP-D3:aug-cc-pvdz",
+ "B2PLYP-D3:aug-cc-pvtz",
+ "B2PLYP-D3:def2-svp",
+ "B2PLYP-D3:def2-tzvp",
+ "B2PLYP-D3:sto-3g",
+ "B2PLYP-D3M(BJ):aug-cc-pvdz",
+ "B2PLYP-D3M(BJ):aug-cc-pvtz",
+ "B2PLYP-D3M(BJ):def2-svp",
+ "B2PLYP-D3M(BJ):def2-tzvp",
+ "B2PLYP-D3M(BJ):sto-3g",
+ "B2PLYP-D3M:aug-cc-pvdz",
+ "B2PLYP-D3M:aug-cc-pvtz",
+ "B2PLYP-D3M:def2-svp",
+ "B2PLYP-D3M:def2-tzvp",
+ "B2PLYP-D3M:sto-3g",
+ "B2PLYP:aug-cc-pvdz",
+ "B2PLYP:aug-cc-pvtz",
+ "B2PLYP:def2-svp",
+ "B2PLYP:def2-tzvp",
+ "B2PLYP:sto-3g",
+ "B3LYP-D3(BJ):aug-cc-pvdz",
+ "B3LYP-D3(BJ):aug-cc-pvtz",
+ "B3LYP-D3(BJ):def2-svp",
+ "B3LYP-D3(BJ):def2-tzvp",
+ "B3LYP-D3(BJ):sto-3g",
+ "B3LYP-D3:aug-cc-pvdz",
+ "B3LYP-D3:aug-cc-pvtz",
+ "B3LYP-D3:def2-svp",
+ "B3LYP-D3:def2-tzvp",
+ "B3LYP-D3:sto-3g",
+ "B3LYP-D3M(BJ):aug-cc-pvdz",
+ "B3LYP-D3M(BJ):aug-cc-pvtz",
+ "B3LYP-D3M(BJ):def2-svp",
+ "B3LYP-D3M(BJ):def2-tzvp",
+ "B3LYP-D3M(BJ):sto-3g",
+ "B3LYP-D3M:aug-cc-pvdz",
+ "B3LYP-D3M:aug-cc-pvtz",
+ "B3LYP-D3M:def2-svp",
+ "B3LYP-D3M:def2-tzvp",
+ "B3LYP-D3M:sto-3g",
+ "B3LYP:aug-cc-pvdz",
+ "B3LYP:aug-cc-pvtz",
+ "B3LYP:def2-svp",
+ "B3LYP:def2-tzvp",
+ "B3LYP:sto-3g",
+ "HF:aug-cc-pvdz",
+ "HF:aug-cc-pvtz",
+ "HF:def2-svp",
+ "HF:def2-tzvp",
+ "HF:sto-3g",
+ "MP2:aug-cc-pvdz",
+ "MP2:aug-cc-pvtz",
+ "MP2:def2-svp",
+ "MP2:def2-tzvp",
+ "MP2:sto-3g",
+ "PBE0:aug-cc-pvdz",
+ "PBE0:aug-cc-pvtz",
+ "PBE0:def2-svp",
+ "PBE0:def2-tzvp",
+ "PBE0:sto-3g",
+ "PBE:aug-cc-pvdz",
+ "PBE:aug-cc-pvtz",
+ "PBE:def2-svp",
+ "PBE:def2-tzvp",
+ "PBE:sto-3g",
+ "WB97M-V:aug-cc-pvdz",
+ "WB97M-V:aug-cc-pvtz",
+ "WB97M-V:def2-svp",
+ "WB97M-V:def2-tzvp",
+ "WB97M-V:sto-3g",
+ "WB97X-D:aug-cc-pvdz",
+ "WB97X-D:aug-cc-pvtz",
+ "WB97X-D:def2-svp",
+ "WB97X-D:def2-tzvp",
+ "WB97X-D:sto-3g",
+ ]
+
+ __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]
+
+
+class QM7b(QMX):
+ """
+ QM7b is a dataset constructed from a subset of the GDB-13 database
+ (stable and synthetically accessible organic molecules)
+ containing up to seven “heavy” atoms.
+ The molecular conformations are optimized using DFT at the
+ PBE0/def2-TZVP level of theory.
+
+ Chemical species:
+ [C, N, O, S, Cl, H]
+
+ Usage:
+ ```python
+ from openqdc.datasets import QM7b
+ dataset = QM7b()
+ ```
+
+ References:
+ https://arxiv.org/pdf/1703.00564
+ """
+
+ __links__ = {"qm7b.hdf5.gz": "https://zenodo.org/record/3588335/files/200.hdf5.gz?download=1"}
+ __name__ = "qm7b"
+ energy_target_names = [
+ "CCSD(T0):cc-pVDZ",
+ "HF:cc-pVDZ",
+ "HF:cc-pVTZ",
+ "MP2:cc-pVTZ",
+ "B2PLYP-D3:aug-cc-pvdz",
+ "B2PLYP-D3:aug-cc-pvtz",
+ "B2PLYP-D3:def2-svp",
+ "B2PLYP-D3:def2-tzvp",
+ "B2PLYP-D3:sto-3g",
+ "B2PLYP-D3M(BJ):aug-cc-pvdz",
+ "B2PLYP-D3M(BJ):aug-cc-pvtz",
+ "B2PLYP-D3M(BJ):def2-svp",
+ "B2PLYP-D3M(BJ):def2-tzvp",
+ "B2PLYP-D3M(BJ):sto-3g",
+ "B2PLYP-D3M:aug-cc-pvdz",
+ "B2PLYP-D3M:aug-cc-pvtz",
+ "B2PLYP-D3M:def2-svp",
+ "B2PLYP-D3M:def2-tzvp",
+ "B2PLYP-D3M:sto-3g",
+ "B2PLYP:aug-cc-pvdz",
+ "B2PLYP:aug-cc-pvtz",
+ "B2PLYP:def2-svp",
+ "B2PLYP:def2-tzvp",
+ "B2PLYP:sto-3g",
+ "B3LYP-D3(BJ):aug-cc-pvdz",
+ "B3LYP-D3(BJ):aug-cc-pvtz",
+ "B3LYP-D3(BJ):def2-svp",
+ "B3LYP-D3(BJ):def2-tzvp",
+ "B3LYP-D3(BJ):sto-3g",
+ "B3LYP-D3:aug-cc-pvdz",
+ "B3LYP-D3:aug-cc-pvtz",
+ "B3LYP-D3:def2-svp",
+ "B3LYP-D3:def2-tzvp",
+ "B3LYP-D3:sto-3g",
+ "B3LYP-D3M(BJ):aug-cc-pvdz",
+ "B3LYP-D3M(BJ):aug-cc-pvtz",
+ "B3LYP-D3M(BJ):def2-svp",
+ "B3LYP-D3M(BJ):def2-tzvp",
+ "B3LYP-D3M(BJ):sto-3g",
+ "B3LYP-D3M:aug-cc-pvdz",
+ "B3LYP-D3M:aug-cc-pvtz",
+ "B3LYP-D3M:def2-svp",
+ "B3LYP-D3M:def2-tzvp",
+ "B3LYP-D3M:sto-3g",
+ "B3LYP:aug-cc-pvdz",
+ "B3LYP:aug-cc-pvtz",
+ "B3LYP:def2-svp",
+ "B3LYP:def2-tzvp",
+ "B3LYP:sto-3g",
+ "HF:aug-cc-pvdz",
+ "HF:aug-cc-pvtz",
+ "HF:cc-pvtz",
+ "HF:def2-svp",
+ "HF:def2-tzvp",
+ "HF:sto-3g",
+ "PBE0:aug-cc-pvdz",
+ "PBE0:aug-cc-pvtz",
+ "PBE0:def2-svp",
+ "PBE0:def2-tzvp",
+ "PBE0:sto-3g",
+ "PBE:aug-cc-pvdz",
+ "PBE:aug-cc-pvtz",
+ "PBE:def2-svp",
+ "PBE:def2-tzvp",
+ "PBE:sto-3g",
+ "SVWN:sto-3g",
+ "WB97M-V:aug-cc-pvdz",
+ "WB97M-V:aug-cc-pvtz",
+ "WB97M-V:def2-svp",
+ "WB97M-V:def2-tzvp",
+ "WB97M-V:sto-3g",
+ "WB97X-D:aug-cc-pvdz",
+ "WB97X-D:aug-cc-pvtz",
+ "WB97X-D:def2-svp",
+ "WB97X-D:def2-tzvp",
+ "WB97X-D:sto-3g",
+ ]
+ __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]
+
+
+class QM8(QMX):
+ """QM8 is the subset of QM9 used in a study on modeling quantum
+ mechanical calculations of electronic spectra and excited
+ state energy (a increase of energy from the ground states) of small molecules
+ up to eight heavy atoms.
+ Multiple methods were used, including
+ time-dependent density functional theories (TDDFT) and
+ second-order approximate coupled-cluster (CC2).
+ The molecules conformations are relaxed geometries computed using
+ the DFT B3LYP with basis set 6-31G(2df,p).
+ For more information about the sampling, check QM9 dataset.
+
+ Usage:
+ ```python
+ from openqdc.datasets import QM8
+ dataset = QM8()
+ ```
+
+ References:
+ https://arxiv.org/pdf/1504.01966
+ """
+
+ __name__ = "qm8"
+
+ __energy_methods__ = [
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ ]
+
+ __links__ = {
+ "qm8.csv": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm8.csv",
+ "qm8.tar.gz": "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/gdb8.tar.gz",
+ }
+
+ def read_raw_entries(self):
+ df = pd.read_csv(p_join(self.root, "qm8.csv"))
+ mols = dm.read_sdf(p_join(self.root, "qm8.sdf"), sanitize=False, remove_hs=False)
+ samples = []
+ for idx_row, mol in zip(df.iterrows(), mols):
+ _, row = idx_row
+ positions = mol.GetConformer().GetPositions()
+ x = get_atomic_number_and_charge(mol)
+ n_atoms = positions.shape[0]
+ samples.append(
+ dict(
+ atomic_inputs=np.concatenate((x, positions), axis=-1, dtype=np.float32).reshape(-1, 5),
+ name=np.array([row["smiles"]]),
+ energies=np.array(
+ [
+ row[
+ ["E1-CC2", "E2-CC2", "E1-PBE0", "E2-PBE0", "E1-PBE0.1", "E2-PBE0.1", "E1-CAM", "E2-CAM"]
+ ].tolist()
+ ],
+ dtype=np.float64,
+ ).reshape(1, -1),
+ n_atoms=np.array([n_atoms], dtype=np.int32),
+ subset=np.array([f"{self.__name__}"]),
+ )
+ )
+ return samples
+
+
+class QM9(QMX):
+ """
+ QM9 is a dataset containing 134k molecules from subsets of the GDB-17 database,
+ containing up to nine “heavy” atoms. All molecular properties are calculated at the B3LYP/6-31G(2df,p)
+ level of quantum chemistry. For each of the 134k molecules, equilibrium geometries are computed
+ by relaxing geometries with the quantum mechanical method B3LYP.
+
+ Usage:
+ ```python
+ from openqdc.datasets import QM9
+ dataset = QM9()
+ ```
+
+ Reference:
+ https://www.nature.com/articles/sdata201422
+ """
+
+ __links__ = {"qm9.hdf5.gz": "https://zenodo.org/record/3588339/files/155.hdf5.gz?download=1"}
+ __name__ = "qm9"
+ energy_target_names = [
+ "Internal energy at 0 K",
+ "B3LYP:def2-svp",
+ "HF:cc-pvtz",
+ "HF:sto-3g",
+ "PBE:sto-3g",
+ "SVWN:sto-3g",
+ "WB97X-D:aug-cc-pvtz",
+ "WB97X-D:def2-svp",
+ "WB97X-D:def2-tzvp",
+ ]
+
+ __energy_methods__ = [
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ PotentialMethod.NONE,
+ ]
diff --git a/openqdc/datasets/potential/revmd17.py b/openqdc/datasets/potential/revmd17.py
index 613ce91f..425f9784 100644
--- a/openqdc/datasets/potential/revmd17.py
+++ b/openqdc/datasets/potential/revmd17.py
@@ -54,23 +54,27 @@ def create_path(filename, root):
class RevMD17(BaseDataset):
"""
- - Benzene: 627000 samples
- - Uracil: 133000 samples
- - Naptalene: 326000 samples
- - Aspirin: 211000 samples
- - Salicylic Acid: 320000 samples
- - Malonaldehyde: 993000 samples
- - Ethanol: 555000 samples
- - Toluene: 100000 samples
-
- Usage
+ Revised MD (RevMD17) improves upon the MD17 dataset by removing all the numerical noise present in the original
+ dataset. The data is generated from an ab initio molecular dynamics (AIMD) simulation where forces and energies
+ are computed at the PBE/def2-SVP level of theory using very tight SCF convergence and a very dense DFT
+ integration grid. The dataset contains the following molecules:
+ Benzene: 627000 samples\n
+ Uracil: 133000 samples\n
+ Naphthalene: 326000 samples\n
+ Aspirin: 211000 samples\n
+ Salicylic Acid: 320000 samples\n
+ Malonaldehyde: 993000 samples\n
+ Ethanol: 555000 samples\n
+ Toluene: 100000 samples\n
+
+ Usage:
```python
from openqdc.datasets import RevMD17
dataset = RevMD17()
```
References:
- - https://arxiv.org/abs/2007.09593
+ https://arxiv.org/abs/2007.09593
"""
__name__ = "revmd17"
diff --git a/openqdc/datasets/potential/sn2_rxn.py b/openqdc/datasets/potential/sn2_rxn.py
index 29337573..2194775b 100644
--- a/openqdc/datasets/potential/sn2_rxn.py
+++ b/openqdc/datasets/potential/sn2_rxn.py
@@ -39,10 +39,12 @@ def extract_npz_entry(data):
class SN2RXN(BaseDataset):
"""
- This dataset probes chemical reactions of methyl halides with halide anions, i.e.
- X- + CH3Y -> CH3X + Y-, and contains structures for all possible combinations of
- X,Y = F, Cl, Br, I. It contains energy and forces for 452709 conformations calculated
- at the DSD-BLYP-D3(BJ)/def2-TZVP level of theory.
+ This dataset probes chemical reactions of methyl halides with halide anions, i.e. X- + CH3Y -> CH3X + Y-, and
+ contains structures for all possible combinations of X,Y = F, Cl, Br, I. The conformations are generated by
+ running MD simulations at a temperature of 5000 K with a time step of 0.1 fs using the Atomic Simulation
+ Environment (ASE). The forces are derived using the semi-empirical method PM7, structures are saved every 10
+ steps, and for each saved structure, energies and forces are recalculated at the DSD-BLYP-D3(BJ)/def2-TZVP
+ level of theory. The dataset contains 452,709 structures along with energies, forces and dipole moments.
Usage:
```python
@@ -51,8 +53,8 @@ class SN2RXN(BaseDataset):
```
References:
- - https://doi.org/10.1021/acs.jctc.9b00181
- - https://zenodo.org/records/2605341
+ https://doi.org/10.1021/acs.jctc.9b00181\n
+ https://zenodo.org/records/2605341
"""
__name__ = "sn2_rxn"
diff --git a/openqdc/datasets/potential/solvated_peptides.py b/openqdc/datasets/potential/solvated_peptides.py
index 4fead36f..f00e1a05 100644
--- a/openqdc/datasets/potential/solvated_peptides.py
+++ b/openqdc/datasets/potential/solvated_peptides.py
@@ -7,10 +7,10 @@
class SolvatedPeptides(BaseDataset):
"""
- The solvated protein fragments dataset probes many-body intermolecular
- interactions between "protein fragments" and water molecules.
- It contains energy and forces for 2731180 structures calculated
- at the revPBE-D3(BJ)/def2-TZVP level of theory.
+ The solvated protein fragments dataset probes many-body intermolecular interactions between "protein fragments"
+ and water molecules. Geometries are first optimized with the semi-empirical method PM7, and then MD simulations
+ are run at 1000 K with a time step of 0.1 fs using the Atomic Simulation Environment (ASE). Structures are saved
+ every 10 steps, and energies, forces and dipole moments are calculated at the revPBE-D3(BJ)/def2-TZVP level of
+ theory.
Usage:
```python
@@ -19,8 +19,8 @@ class SolvatedPeptides(BaseDataset):
```
References:
- - https://doi.org/10.1021/acs.jctc.9b00181
- - https://zenodo.org/records/2605372
+ https://doi.org/10.1021/acs.jctc.9b00181\n
+ https://zenodo.org/records/2605372
"""
__name__ = "solvated_peptides"
diff --git a/openqdc/datasets/potential/spice.py b/openqdc/datasets/potential/spice.py
index 27525bb4..2f8cc36f 100644
--- a/openqdc/datasets/potential/spice.py
+++ b/openqdc/datasets/potential/spice.py
@@ -40,9 +40,12 @@ def read_record(r, obj):
class Spice(BaseDataset):
"""
- The Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
- small molecules, dimers, dipeptides, and solvated amino acids. It consists of both forces and energies calculated
- at the {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory.
+ Spice dataset consists of 1.1 million conformations for a diverse set of 19k unique molecules consisting of
+ small molecules, dimers, dipeptides, and solvated amino acids. Conformations are first generated with RDKit;
+ molecular dynamics simulations for 100 ps at 500 K using OpenMM and the Amber force field are then used to
+ generate 100 high-energy conformations. Low-energy conformations are subsequently generated by L-BFGS energy
+ minimization and molecular dynamics for 1 ps at 100 K. Forces and energies for the conformations are calculated
+ at the wB97M-D3(BJ)/def2-TZVPPD level of theory.
Usage:
```python
@@ -51,8 +54,8 @@ class Spice(BaseDataset):
```
References:
- - https://arxiv.org/abs/2209.10702
- - https://github.com/openmm/spice-dataset
+ https://arxiv.org/abs/2209.10702\n
+ https://github.com/openmm/spice-dataset
"""
__name__ = "spice"
@@ -96,10 +99,11 @@ def read_raw_entries(self):
class SpiceV2(Spice):
"""
- SpiceV2 dataset augmented with amino acids complexes, water boxes,
- pubchem solvated molecules.
- It consists of both forces and energies calculated
- at the {\omega}B97M-D3(BJ)/def2-TZVPPD level of theory.
+ SpiceV2 dataset augments the Spice data with amino acid complexes, water boxes, and PubChem solvated molecules.
+ The main changes include (1) over 13,000 new PubChem molecules, of which 1,500 contain boron and 1,900 contain
+ silicon, (2) 194,000 conformations of dimers containing amino acids and ligands, (3) 1,000 water clusters to
+ improve sampling of interactions in bulk water, (4) 1,397 PubChem molecules solvated with a shell of water
+ molecules, and (5) fixes for bad calculations from the Spice dataset. The data generation process otherwise
+ follows the Spice dataset.
Usage:
```python
@@ -108,8 +112,8 @@ class SpiceV2(Spice):
```
References:
- - https://github.com/openmm/spice-dataset/releases/tag/2.0.0
- - https://github.com/openmm/spice-dataset
+ https://github.com/openmm/spice-dataset/releases/tag/2.0.0\n
+ https://github.com/openmm/spice-dataset
"""
__name__ = "spicev2"
@@ -150,6 +154,20 @@ def read_raw_entries(self):
class SpiceVL2(SpiceV2):
+ """
+ SpiceVL2 is an extension of the SpiceV2 dataset with additional semi-empirical GFN2-xTB and PM6 energy methods.
+
+ Usage:
+ ```python
+ from openqdc.datasets import SpiceVL2
+ dataset = SpiceVL2()
+ ```
+
+ References:
+ https://github.com/openmm/spice-dataset/releases/tag/2.0.0\n
+ https://github.com/openmm/spice-dataset
+ """
+
__name__ = "spice_vl2"
__energy_methods__ = SpiceV2.__energy_methods__ + [PotentialMethod.GFN2_XTB, PotentialMethod.PM6]
diff --git a/openqdc/datasets/potential/tmqm.py b/openqdc/datasets/potential/tmqm.py
index 1da6901a..987fa10f 100644
--- a/openqdc/datasets/potential/tmqm.py
+++ b/openqdc/datasets/potential/tmqm.py
@@ -47,10 +47,10 @@ def read_xyz(fname, e_map):
class TMQM(BaseDataset):
"""
- The tmQM dataset contains the geometries of a large transition metal-organic
- compound space with a large variety of organic ligands and 30 transition metals.
- It contains energy labels for 86,665 mononuclear complexe calculated
- at the TPSSh-D3BJ/def2-SV DFT level of theory.
+ tmQM dataset contains the geometries of a large transition metal-organic compound space with a large variety of
+ organic ligands and 30 transition metals. It contains energy labels for 86,665 mononuclear complexes calculated
+ at the TPSSh-D3BJ/def2-SVP DFT level of theory. Structures are first extracted from the Cambridge Structural
+ Database and then optimized in the gas phase with the extended tight-binding GFN2-xTB method.
Usage:
```python
@@ -59,8 +59,8 @@ class TMQM(BaseDataset):
```
References:
- - https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041
- - https://github.com/bbskjelstad/tmqm
+ https://pubs.acs.org/doi/10.1021/acs.jcim.0c01041\n
+ https://github.com/bbskjelstad/tmqm
"""
__name__ = "tmqm"
diff --git a/openqdc/datasets/potential/transition1x.py b/openqdc/datasets/potential/transition1x.py
index 8b5b4bc1..d15d71c1 100644
--- a/openqdc/datasets/potential/transition1x.py
+++ b/openqdc/datasets/potential/transition1x.py
@@ -39,9 +39,9 @@ def read_record(r, group):
class Transition1X(BaseDataset):
"""
- The Transition1x dataset contains structures from 10k organic reaction pathways of various types.
- It contains DFT energy and force labels for 9.6 mio. conformers calculated at the
- wB97x/6-31-G(d) level of theory.
+ Transition1x dataset contains structures from 10k organic reaction pathways of various types. It contains energy
+ and force labels for 9.6 million conformers calculated at the wB97x/6-31G(d) level of theory. The geometries and
+ transition states are generated by running Nudged Elastic Band (NEB) calculations with DFT.
Usage:
```python
@@ -50,8 +50,8 @@ class Transition1X(BaseDataset):
```
References:
- - https://www.nature.com/articles/s41597-022-01870-w
- - https://gitlab.com/matschreiner/Transition1x
+ https://www.nature.com/articles/s41597-022-01870-w\n
+ https://gitlab.com/matschreiner/Transition1x
"""
__name__ = "transition1x"
diff --git a/openqdc/datasets/potential/vqm24.py b/openqdc/datasets/potential/vqm24.py
new file mode 100644
index 00000000..1710e1dd
--- /dev/null
+++ b/openqdc/datasets/potential/vqm24.py
@@ -0,0 +1,82 @@
+import os
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+
+
+def shape_atom_inputs(coords, atom_species):
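+ # Stack [atomic_number, charge (zero here)] per atom, then append xyz: the openQDC atomic_inputs layout.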
+ xs = np.stack((atom_species, np.zeros_like(atom_species)), axis=-1)
+ return np.concatenate((xs, coords), axis=-1, dtype=np.float32)
+
+
+def read_npz_entry(raw_path):
+ samples = np.load(raw_path, allow_pickle=True)
+ # get name of file without extension
+ subset = os.path.basename(raw_path).split(".")[0]
+
+ # npz keys read below: "coordinates", "atoms", "compounds", "Etot"
+ coordinates = np.concatenate(samples["coordinates"])
+ atom_species = np.concatenate(samples["atoms"]).ravel()
+ names = list(map(lambda x: x.split("_")[0], samples["compounds"]))
+ n_comps = len(names)
+
+ # other keys in the archive: "graphs" (SMILES), "inchi", "Eatomization"
+ res = dict(
+ name=np.array(names),
+ subset=np.array([subset] * n_comps),
+ energies=samples["Etot"][:, None].astype(np.float64),
+ atomic_inputs=shape_atom_inputs(coordinates, atom_species),
+ n_atoms=np.array(list(map(lambda x: len(x), samples["coordinates"])), dtype=np.int32),
+ )
+ return res
+
+
+class VQM24(BaseDataset):
+ """
+ Vector-QM24 (VQM24) dataset consists of small organic and inorganic molecules with quantum mechanical
+ properties calculated at the wB97x-D3/cc-pVDZ level of theory, covering 258,242 unique constitutional
+ isomers and 577,705 conformers of varying stoichiometries. Geometries are generated with GFN2-xTB and
+ relaxed with the DFT method wB97x-D3/cc-pVDZ, which is also used to compute the energy values.
+
+ Usage:
+ ```python
+ from openqdc.datasets import VQM24
+ dataset = VQM24()
+ ```
+
+ Reference:
+ https://arxiv.org/abs/2405.05961
+ """
+
+ __name__ = "vqm24"
+
+ __energy_methods__ = [
+ PotentialMethod.WB97X_6_31G_D, # "wb97x/6-31g(d)"
+ ]
+
+ energy_target_names = [
+ "ωB97x:6-31G(d) Energy",
+ ]
+ # ωB97X-D3/cc-pVDZ
+ __energy_unit__ = "hartree"
+ __distance_unit__ = "ang"
+ __forces_unit__ = "hartree/ang"
+ __links__ = {
+ f"{name}.npz": f"https://zenodo.org/records/11164951/files/{name}.npz?download=1"
+ for name in ["DFT_all", "DFT_saddles", "DFT_uniques", "DMC"]
+ }
+
+ def read_raw_entries(self):
+ samples = []
+ for name in self.__links__:
+ raw_path = p_join(self.root, f"{name}")
+ samples.append(read_npz_entry(raw_path))
+ return samples
diff --git a/openqdc/datasets/potential/waterclusters.py b/openqdc/datasets/potential/waterclusters.py
new file mode 100644
index 00000000..8c791474
--- /dev/null
+++ b/openqdc/datasets/potential/waterclusters.py
@@ -0,0 +1,175 @@
+from collections import defaultdict
+from os.path import join as p_join
+
+import numpy as np
+
+from openqdc.datasets.base import BaseDataset
+from openqdc.methods import PotentialMethod
+from openqdc.utils.package_utils import requires_package
+
+_default_basis_sets = {
+ "BEGDB_H2O": "aug-cc-pVQZ",
+ "WATER27": "aug-cc-pVQZ",
+ "H2O_alkali_clusters": "def2-QZVPPD",
+ "H2O_halide_clusters": "def2-QZVPPD",
+}
+
+
+@requires_package("monty")
+@requires_package("pymatgen")
+def read_geometries(fname, dataset):
+ from monty.serialization import loadfn
+
+ geometries = {k: v.to_ase_atoms() for k, v in loadfn(fname)[dataset].items()}
+ return geometries
+
+
+@requires_package("monty")
+def read_energies(fname, dataset):
+ from monty.serialization import loadfn
+
+ # fname
+ _energies = loadfn(fname)[dataset]
+ metadata_restrictions = {"basis_set": _default_basis_sets.get(dataset)}
+
+ functionals_to_return = []
+ for dfa, at_dfa_d in _energies.items():
+ functionals_to_return += [f"{dfa}" if dfa == at_dfa else f"{dfa}@{at_dfa}" for at_dfa in at_dfa_d]
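+ # e.g. dfa="SCAN" with at_dfa_d keys {"SCAN", "HF"} yields ["SCAN", "SCAN@HF"]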
+
+ energies = defaultdict(dict)
+ for f in functionals_to_return:
+ if "-FLOSIC" in f and "@" not in f:
+ func = f.split("-FLOSIC")[0]
+ at_f = "-FLOSIC"
+ else:
+ func = f.split("@")[0]
+ at_f = f.split("@")[-1]
+
+ if func not in _energies:
+ print(f"No functional {func} included in dataset" f"- available options:\n{', '.join(_energies.keys())}")
+ elif at_f not in _energies[func]:
+ print(
+ f"No @functional {at_f} included in {func} dataset"
+ f"- available options:\n{', '.join(_energies[func].keys())}"
+ )
+ else:
+ if isinstance(_energies[func][at_f], list):
+ for entry in _energies[func][at_f]:
+ if all(entry["metadata"].get(k) == v for k, v in metadata_restrictions.items()):
+ energies[f] = entry
+ break
+ else:
+ energies[f] = _energies[func][at_f]
+ return dict(energies)
+
+
+def extract_desc(atom):
+ # pull positions, atomic numbers, initial charges and the chemical formula from an ASE Atoms object
+ pos = atom.get_positions()
+ z = atom.get_atomic_numbers()
+ charges = atom.get_initial_charges()
+ formula = atom.get_chemical_formula()
+ return pos, z, charges, formula
+
+
+def format_geometry_and_entries(geometries, energies, subset):
+ entries_list = []
+ for entry, atoms in geometries.items():
+ pos, z, charges, formula = extract_desc(atoms)
+ energies_list = []
+ for level_of_theory, entry_en_dict in energies.items():
+ en = entry_en_dict.get(entry, np.nan)
+ energies_list.append(en)
+ energy_array = np.array(energies_list)
+ if subset in ["WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"]:
+ # these subsets report only some of the 19 target methods; pad to 19 and mark missing (zero) entries as NaN
+ energy_array.resize(19)
+ energy_array[energy_array == 0] = np.nan
+ res = dict(
+ atomic_inputs=np.concatenate(
+ (np.hstack((z[:, None], charges[:, None])), pos), axis=-1, dtype=np.float32
+ ).reshape(-1, 5),
+ name=np.array([formula]),
+ energies=np.array(energy_array, dtype=np.float64).reshape(1, -1),
+ n_atoms=np.array([pos.shape[0]], dtype=np.int32),
+ subset=np.array([subset]),
+ )
+ entries_list.append(res)
+ return entries_list
+
+
+class SCANWaterClusters(BaseDataset):
+ """
+ The SCAN Water Clusters dataset contains conformations of
+ neutral water clusters containing up to 20 monomers, charged water clusters,
+ and alkali- and halide-water clusters. This dataset consists of four data sets of water clusters:
+ the benchmark energy and geometry database (BEGDB) neutral water cluster subset; the WATER27 set of 14
+ neutral, 5 protonated, 7 deprotonated, and one auto-ionized water cluster; and two sets of
+ ion-water clusters M...(H2O)n, where M = Li+, Na+, K+, F−, Cl−, or Br−.
+ Water clusters were obtained from 10 nanosecond gas-phase molecular dynamics
+ simulations using AMBER 9 and optimized; the lowest-energy isomers were
+ determined using MP2/aug-cc-pVDZ//MP2/6-31G* Gibbs free energies.
+
+
+ Chemical Species:
+ [H, O, Li, Na, K, F, Cl, Br]
+
+ Usage:
+ ```python
+ from openqdc.datasets import SCANWaterClusters
+ dataset = SCANWaterClusters()
+ ```
+
+ References:
+ https://chemrxiv.org/engage/chemrxiv/article-details/662aaff021291e5d1db7d8ec\n
+ https://github.com/esoteric-ephemera/water_cluster_density_errors
+ """
+
+ __name__ = "scanwaterclusters"
+
+ __energy_unit__ = "hartree"
+ __distance_unit__ = "ang"
+ __forces_unit__ = "hartree/ang"
+ energy_target_names = [
+ "HF",
+ "HF-r2SCAN-DC4",
+ "SCAN",
+ "SCAN@HF",
+ "SCAN@r2SCAN50",
+ "r2SCAN",
+ "r2SCAN@HF",
+ "r2SCAN@r2SCAN50",
+ "r2SCAN50",
+ "r2SCAN100",
+ "r2SCAN10",
+ "r2SCAN20",
+ "r2SCAN25",
+ "r2SCAN30",
+ "r2SCAN40",
+ "r2SCAN60",
+ "r2SCAN70",
+ "r2SCAN80",
+ "r2SCAN90",
+ ]
+ __energy_methods__ = [PotentialMethod.NONE for _ in range(len(energy_target_names))]
+ force_target_names = []
+ subsets = ["BEGDB_H2O", "WATER27", "H2O_alkali_clusters", "H2O_halide_clusters"]
+ __links__ = {
+ "geometries.json.gz": "https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/geometries.json.gz?raw=True", # noqa
+ "total_energies.json.gz": "https://github.com/esoteric-ephemera/water_cluster_density_errors/blob/main/data_files/total_energies.json.gz?raw=True", # noqa
+ }
+
+ def read_raw_entries(self):
+ entries = [] # noqa
+ for i, subset in enumerate(self.subsets):
+ geometries = read_geometries(p_join(self.root, "geometries.json.gz"), subset)
+ energies = read_energies(p_join(self.root, "total_energies.json.gz"), subset)
+ datum = {}
+ for k in energies:
+ _ = energies[k].pop("metadata")
+ datum[k] = energies[k]["total_energies"]
+ entries.extend(format_geometry_and_entries(geometries, datum, subset))
+ return entries
diff --git a/openqdc/datasets/potential/waterclusters3_30.py b/openqdc/datasets/potential/waterclusters3_30.py
index f4c7d88e..a52b9e17 100644
--- a/openqdc/datasets/potential/waterclusters3_30.py
+++ b/openqdc/datasets/potential/waterclusters3_30.py
@@ -53,6 +53,10 @@ class WaterClusters(BaseDataset):
clusters of sizes n = 3 - 30. The cluster structures are derived and labeled with
the TTM2.1-F ab-initio based interaction potential for water.
It contains approximately 4.5 mil. structures.
+ Sampling was done with the Monte Carlo Temperature Basin Paving (MCTBP) method.
+
+ Chemical Species:
+ ["H", "O"]
Usage:
```python
@@ -61,8 +65,8 @@ class WaterClusters(BaseDataset):
```
References:
- - https://doi.org/10.1063/1.5128378
- - https://sites.uw.edu/wdbase/database-of-water-clusters/
+ https://doi.org/10.1063/1.5128378\n
+ https://sites.uw.edu/wdbase/database-of-water-clusters/
"""
__name__ = "waterclusters3_30"
diff --git a/openqdc/datasets/statistics.py b/openqdc/datasets/statistics.py
index d471387b..6b1adeb5 100644
--- a/openqdc/datasets/statistics.py
+++ b/openqdc/datasets/statistics.py
@@ -2,7 +2,7 @@
from copy import deepcopy
from dataclasses import asdict, dataclass
from os.path import join as p_join
-from typing import Optional
+from typing import Callable, Dict, Optional
import numpy as np
from loguru import logger
@@ -17,9 +17,15 @@ class StatisticsResults:
"""
def to_dict(self):
+ """
+ Convert the class to a dictionary
+ """
return asdict(self)
- def transform(self, func):
+ def transform(self, func: Callable):
+ """
+ Apply a function to all the attributes of the class
+ """
for k, v in self.to_dict().items():
if v is not None:
setattr(self, k, func(v))
@@ -55,6 +61,14 @@ class StatisticManager:
"""
def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "AbstractStatsCalculator"):
+ """
+ dataset : openqdc.datasets.base.BaseDataset
+ The dataset object to compute the statistics
+ recompute : bool, default = False
+ Flag to recompute the statistics
+ *statistic_calculators : AbstractStatsCalculator
+ statistic calculators to run
+ """
self._state = {}
self._results = {}
self._statistic_calculators = [
@@ -63,7 +77,7 @@ def __init__(self, dataset, recompute: bool = False, *statistic_calculators: "Ab
]
@property
- def state(self) -> dict:
+ def state(self) -> Dict:
"""
Return the dictionary state of the manager
"""
@@ -120,7 +134,7 @@ class AbstractStatsCalculator(ABC):
"""
Abstract class that defines the interface for all
the calculators object and the methods to
- compute the statistics
+ compute the statistics.
"""
# State Dependencies of the calculator to skip part of the calculation
@@ -140,6 +154,28 @@ def __init__(
atom_charges: Optional[np.ndarray] = None,
forces: Optional[np.ndarray] = None,
):
+ """
+ name : str
+ Name of the dataset for saving and loading.
+ energy_type : str, default = None
+ Type of the energy for the computation of the statistics. Used for loading and saving.
+ force_recompute : bool, default = False
+ Flag to force the recomputation of the statistics
+ energies : np.ndarray, default = None
+ Energies of the dataset
+ n_atoms : np.ndarray, default = None
+ Number of atoms in the dataset
+ atom_species : np.ndarray, default = None
+ Atomic species of the dataset
+ position_idx_range : np.ndarray, default = None
+ Position index range of the dataset
+ e0_matrix : np.ndarray, default = None
+ Isolated atom energies matrix of the dataset
+ atom_charges : np.ndarray, default = None
+ Atomic charges of the dataset
+ forces : np.ndarray, default = None
+ Forces of the dataset
+ """
self.name = name
self.energy_type = energy_type
self.force_recompute = force_recompute
@@ -149,6 +185,7 @@ def __init__(
self.e0_matrix = e0_matrix
self.n_atoms = n_atoms
self.atom_species_charges_tuple = (atom_species, atom_charges)
+ self._root = p_join(get_local_cache(), self.name)
if atom_species is not None and atom_charges is not None:
# by value not reference
self.atom_species_charges_tuple = np.concatenate((atom_species[:, None], atom_charges[:, None]), axis=-1)
@@ -159,7 +196,7 @@ def has_forces(self) -> bool:
@property
def preprocess_path(self):
- path = p_join(self.root, "preprocessed", str(self) + ".pkl")
+ path = p_join(self.root, "statistics", self.name + f"_{str(self)}" + ".pkl")
return path
@property
@@ -167,14 +204,14 @@ def root(self):
"""
Path to the dataset folder
"""
- return p_join(get_local_cache(), self.name)
+ return self._root
@classmethod
def from_openqdc_dataset(cls, dataset, recompute: bool = False):
"""
- Create a calculator object from a dataset object
+ Create a calculator object from a dataset object.
"""
- return cls(
+ obj = cls(
name=dataset.__name__,
force_recompute=recompute,
energy_type=dataset.energy_type,
@@ -186,6 +223,8 @@ def from_openqdc_dataset(cls, dataset, recompute: bool = False):
atom_charges=dataset.data["atomic_inputs"][:, 1].ravel(),
e0_matrix=dataset.__isolated_atom_energies__,
)
+ obj._root = dataset.root # set to the dataset root in case of multiple datasets
+ return obj
@abstractmethod
def compute(self) -> StatisticsResults:
@@ -214,7 +253,7 @@ def attempt_load(self) -> bool:
logger.warning(f"Statistics for {str(self)} not found. Computing...")
return False
- def _setup_deps(self, state: dict) -> None:
+ def _setup_deps(self, state: Dict) -> None:
"""
Check if the dependencies of calculators are satisfied
from the state object and set the attributes of the calculator
@@ -226,7 +265,7 @@ def _setup_deps(self, state: dict) -> None:
for dep in self.state_dependency:
setattr(self, dep, state[dep])
- def write_state(self, update: dict) -> None:
+ def write_state(self, update: Dict) -> None:
"""
Write/update the state dictionary with the update dictionary
@@ -235,7 +274,7 @@ def write_state(self, update: dict) -> None:
"""
self.state.update(update)
- def run(self, state: dict) -> None:
+ def run(self, state: Dict) -> None:
"""
Main method to run the calculator.
Setup the dependencies from the state dictionary
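Taken together, the manager now instantiates each calculator from the dataset and caches results per dataset root. A minimal sketch of that wiring, assuming `dataset` is an already-instantiated `BaseDataset`; the `MeanEnergy` calculator is a hypothetical placeholder, not a class shipped by the package:

```python
import numpy as np

from openqdc.datasets.statistics import AbstractStatsCalculator, StatisticManager


class MeanEnergy(AbstractStatsCalculator):  # hypothetical example calculator
    def compute(self):
        # energies, n_atoms, ... are filled in by from_openqdc_dataset;
        # the shipped calculators wrap this in a StatisticsResults object
        return np.nanmean(self.energies, axis=0)


# each calculator class is instantiated via from_openqdc_dataset(dataset, recompute)
# and its result is cached under <dataset.root>/statistics/<name>_<calculator>.pkl
manager = StatisticManager(dataset, True, MeanEnergy)  # recompute=True
```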
diff --git a/openqdc/datasets/structure.py b/openqdc/datasets/structure.py
new file mode 100644
index 00000000..f6dc077e
--- /dev/null
+++ b/openqdc/datasets/structure.py
@@ -0,0 +1,276 @@
+import pickle as pkl
+from abc import ABC, abstractmethod
+from os import PathLike
+from os.path import join as p_join
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+import zarr
+
+from openqdc.utils.io import pull_locally
+
+
+class GeneralStructure(ABC):
+ """
+    Abstract factory class for dataset storage formats in the openQDC package.
+ """
+
+ _ext: Optional[str] = None
+ _extra_files: Optional[List[str]] = None
+
+ @property
+ def ext(self):
+ return self._ext
+
+ @property
+ @abstractmethod
+ def load_fn(self) -> Callable:
+ """
+ Function to use for loading the data.
+ Must be implemented by the child class.
+
+ Returns:
+ the function to use for loading the data
+ """
+ raise NotImplementedError
+
+ def add_extension(self, filename: str) -> str:
+ """
+ Add the correct extension to a filename
+
+ Parameters:
+ filename: the filename to add the extension to
+
+ Returns:
+ the filename with the extension
+ """
+ return filename + self.ext
+
+ @abstractmethod
+ def save_preprocess(
+ self,
+ preprocess_path: Union[str, PathLike],
+ data_keys: List[str],
+ data_dict: Dict[str, np.ndarray],
+ extra_data_keys: List[str],
+ extra_data_types: Dict[str, type],
+ ) -> List[str]:
+ """
+ Save the preprocessed data to the cache directory and optionally upload it to the remote storage.
+ Must be implemented by the child class.
+
+ Parameters:
+ preprocess_path: path to the preprocessed data file
+ data_keys: list of keys to load from the data file
+ data_dict: dictionary of data to save
+ extra_data_keys: list of keys to load from the extra data file
+ extra_data_types: dictionary of data types for each key
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def load_extra_files(
+ self,
+ data: Dict[str, np.ndarray],
+ preprocess_path: Union[str, PathLike],
+ data_keys: List[str],
+ pkl_data_keys: List[str],
+ overwrite: bool,
+ ):
+ """
+ Load extra files required to define other types of data.
+ Must be implemented by the child class.
+
+ Parameters:
+ data: dictionary of data to load
+ preprocess_path: path to the preprocessed data file
+ data_keys: list of keys to load from the data file
+ pkl_data_keys: list of keys to load from the extra files
+ overwrite: whether to overwrite the local cache
+ """
+ raise NotImplementedError
+
+ def join_and_ext(self, path: Union[str, PathLike], filename: str) -> Union[str, PathLike]:
+ """
+ Join a path and a filename and add the correct extension.
+
+ Parameters:
+ path: the path to join
+ filename: the filename to join
+
+ Returns:
+ the joined path with the correct extension
+ """
+ return p_join(path, self.add_extension(filename))
+
+ def load_data(
+ self,
+ preprocess_path: Union[str, PathLike],
+ data_keys: List[str],
+ data_types: Dict[str, np.dtype],
+ data_shapes: Dict[str, Tuple[int, int]],
+ extra_data_keys: List[str],
+ overwrite: bool,
+ ):
+ """
+ Main method to load the data from a filetype structure like memmap or zarr.
+
+ Parameters:
+ preprocess_path: path to the preprocessed data file
+ data_keys: list of keys to load from the data file
+ data_types: dictionary of data types for each key
+ data_shapes: dictionary of shapes for each key
+ extra_data_keys: list of keys to load from the extra data file
+ overwrite: whether to overwrite the local cache
+ """
+ data = {}
+ for key in data_keys:
+ filename = self.join_and_ext(preprocess_path, key)
+ pull_locally(filename, overwrite=overwrite)
+ data[key] = self.load_fn(filename, mode="r", dtype=data_types[key])
+ data[key] = self.unpack(data[key])
+ data[key] = data[key].reshape(*data_shapes[key])
+
+ data = self.load_extra_files(data, preprocess_path, data_keys, extra_data_keys, overwrite)
+ return data
+
+    def unpack(self, data: Any) -> Any:
+ """
+ Unpack the data from the loaded file.
+
+ Parameters:
+ data: the data to unpack
+
+ Returns:
+ the unpacked data
+ """
+ return data
+
+
+class MemMapDataset(GeneralStructure):
+ """
+ Dataset structure for memory-mapped numpy arrays and props.pkl files.
+ """
+
+ _ext = ".mmap"
+ _extra_files = ["props.pkl"]
+
+ @property
+ def load_fn(self):
+ return np.memmap
+
+ def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:
+ local_paths = []
+ for key in data_keys:
+ local_path = self.join_and_ext(preprocess_path, key)
+ out = np.memmap(local_path, mode="w+", dtype=data_dict[key].dtype, shape=data_dict[key].shape)
+ out[:] = data_dict.pop(key)[:]
+ out.flush()
+ local_paths.append(local_path)
+
+ # save smiles and subset
+ local_path = p_join(preprocess_path, "props.pkl")
+
+ # assert that (required) pkl keys are present in data_dict
+ assert all([key in data_dict.keys() for key in extra_data_keys])
+
+ # store unique and inverse indices for str-based pkl keys
+ for key in extra_data_keys:
+ if extra_data_types[key] == str:
+ data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+ with open(local_path, "wb") as f:
+ pkl.dump(data_dict, f)
+
+ local_paths.append(local_path)
+ return local_paths
+
+ def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):
+ filename = p_join(preprocess_path, "props.pkl")
+ pull_locally(filename, overwrite=overwrite)
+ with open(filename, "rb") as f:
+ tmp = pkl.load(f)
+ all_pkl_keys = set(tmp.keys()) - set(data_keys)
+ # assert required pkl_keys are present in all_pkl_keys
+ assert all([key in all_pkl_keys for key in pkl_data_keys])
+ for key in all_pkl_keys:
+ x = tmp.pop(key)
+ if len(x) == 2:
+ data[key] = x[0][x[1]]
+ else:
+ data[key] = x
+ return data
+
+
+class ZarrDataset(GeneralStructure):
+ """
+ Dataset structure for zarr files.
+ """
+
+ _ext = ".zip"
+ _extra_files = ["metadata.zip"]
+ _zarr_version = 2
+
+ @property
+ def load_fn(self):
+ return zarr.open
+
+ def unpack(self, data):
+ return data[:]
+
+ def save_preprocess(self, preprocess_path, data_keys, data_dict, extra_data_keys, extra_data_types) -> List[str]:
+ # os.makedirs(p_join(ds.root, "zips", ds.__name__), exist_ok=True)
+ local_paths = []
+        # iterate over a copy of the keys: popping from data_dict while
+        # iterating data_dict.items() directly would raise a RuntimeError
+        for key in list(data_keys):
+            if key not in data_dict:
+                continue
+            zarr_path = self.join_and_ext(preprocess_path, key)
+            value = data_dict.pop(key)
+ z = zarr.open(
+ zarr.storage.ZipStore(zarr_path),
+ "w",
+ zarr_version=self._zarr_version,
+ shape=value.shape,
+ dtype=value.dtype,
+ )
+ z[:] = value[:]
+ local_paths.append(zarr_path)
+ # if key in attrs:
+ # z.attrs.update(attrs[key])
+
+ metadata = p_join(preprocess_path, "metadata.zip")
+
+ group = zarr.group(zarr.storage.ZipStore(metadata))
+
+ for key in extra_data_keys:
+ if extra_data_types[key] == str:
+ data_dict[key] = np.unique(data_dict[key], return_inverse=True)
+
+ for key, value in data_dict.items():
+ # sub=group.create_group(key)
+ if key in ["name", "subset"]:
+ data = group.create_dataset(key, shape=value[0].shape, dtype=value[0].dtype)
+ data[:] = value[0][:]
+ data2 = group.create_dataset(key + "_ptr", shape=value[1].shape, dtype=np.int32)
+ data2[:] = value[1][:]
+ else:
+ data = group.create_dataset(key, shape=value.shape, dtype=value.dtype)
+ data[:] = value[:]
+ local_paths.append(metadata)
+ return local_paths
+
+ def load_extra_files(self, data, preprocess_path, data_keys, pkl_data_keys, overwrite):
+ filename = self.join_and_ext(preprocess_path, "metadata")
+ pull_locally(filename, overwrite=overwrite)
+ tmp = self.load_fn(filename)
+ all_pkl_keys = set(tmp.keys()) - set(data_keys)
+ # assert required pkl_keys are present in all_pkl_keys
+ assert all([key in all_pkl_keys for key in pkl_data_keys])
+        for key in all_pkl_keys:
+            if key.endswith("_ptr"):
+                continue  # pointer arrays are consumed when rebuilding their parent key
+            if key + "_ptr" in all_pkl_keys:
+                # str-based keys were saved as unique values + inverse indices
+                data[key] = tmp[key][:][tmp[key + "_ptr"][:]]
+            else:
+                data[key] = tmp[key][:]
+ return data
+
+    # TODO: checksum; maybe convert to a single archive instead of zips
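To make the factory contract concrete, here is a minimal write-path sketch for `MemMapDataset`; the scratch directory and keys are invented for illustration, and a real round trip would read back through `load_data`/`pull_locally`:

```python
import os

import numpy as np

from openqdc.datasets.structure import MemMapDataset

scratch = "/tmp/openqdc_demo"  # hypothetical local preprocess dir
os.makedirs(scratch, exist_ok=True)

structure = MemMapDataset()
data_dict = {
    "energies": np.random.rand(10, 1).astype(np.float32),
    "name": np.array([f"mol_{i % 2}" for i in range(10)]),
}
paths = structure.save_preprocess(
    preprocess_path=scratch,
    data_keys=["energies"],         # written as energies.mmap
    data_dict=data_dict,
    extra_data_keys=["name"],       # pickled into props.pkl as (unique, inverse)
    extra_data_types={"name": str},
)
print(paths)  # ['/tmp/openqdc_demo/energies.mmap', '/tmp/openqdc_demo/props.pkl']
```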
diff --git a/openqdc/methods/atom_energies.py b/openqdc/methods/atom_energies.py
index fed41dc4..523ff171 100644
--- a/openqdc/methods/atom_energies.py
+++ b/openqdc/methods/atom_energies.py
@@ -1,6 +1,6 @@
import ast
import pkgutil
-from typing import Tuple
+from typing import Dict, Tuple
import numpy as np
from loguru import logger
@@ -18,19 +18,15 @@
atom_energy_collection = {k.lower(): v for k, v in atom_energy_collection.items()}
-def to_e_matrix(atom_energies: dict) -> np.ndarray:
+def to_e_matrix(atom_energies: Dict) -> np.ndarray:
"""
    Get the matrix of isolated atom energies for a dict of non-null computed values
- Parameters
- ----------
- atom_energies: dict
- Dict of energies computed for a given QM method.
- Keys are pairs of (atom, charge) and values are energy values
+ Parameters:
+ atom_energies: Dict of energies computed for a given QM method.
+ Keys are pairs of (atom, charge) and values are energy values
- Returns
- -------
- np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
+    Returns:
+        np.ndarray of shape (MAX_ATOMIC_NUMBER, 2 * MAX_CHARGE + 1)
Matrix containing the isolated atom energies for each atom and charge written in the form:
| | -2 | -1 | 0 | +1 | +2 | <- charges
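To make the indexing convention above concrete, a short usage sketch; the key format follows the docstring, while the constant value and the exact charge offset are assumptions:

```python
from openqdc.methods.atom_energies import to_e_matrix

MAX_CHARGE = 2  # placeholder for the package constant referenced above

e0 = to_e_matrix({("H", 0): -0.5, ("O", -1): -75.1})  # toy energy values
print(e0[1, 0 + MAX_CHARGE])   # expected -0.5: H (Z=1), neutral column
print(e0[8, -1 + MAX_CHARGE])  # expected -75.1: O (Z=8), anion column
```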
diff --git a/openqdc/utils/download_api.py b/openqdc/utils/download_api.py
index c96f3d91..c73c752d 100644
--- a/openqdc/utils/download_api.py
+++ b/openqdc/utils/download_api.py
@@ -14,7 +14,9 @@
import gdown
import requests
import tqdm
-from aiohttp import ClientTimeout
+
+# from aiohttp import ClientTimeout
+from dotenv import load_dotenv
from fsspec import AbstractFileSystem
from fsspec.callbacks import TqdmCallback
from fsspec.implementations.local import LocalFileSystem
@@ -27,25 +29,39 @@
@dataclass
class FileSystem:
"""
- A class to handle file system operations
+ A basic class to handle file system operations
"""
public_endpoint: Optional[AbstractFileSystem] = None
private_endpoint: Optional[AbstractFileSystem] = None
local_endpoint: AbstractFileSystem = LocalFileSystem()
+ def __init__(self):
+ load_dotenv() # load environment variables from .env
+ self.KEY = os.getenv("CLOUDFARE_KEY", None)
+ self.SECRET = os.getenv("CLOUDFARE_SECRET", None)
+
@property
def public(self):
+ """
+ Return the public remote filesystem with read permission
+ """
self.connect()
return self.public_endpoint
@property
def private(self):
+ """
+ Return the private remote filesystem with write permission
+ """
self.connect()
return self.private_endpoint
@property
def local(self):
+ """
+ Return the local filesystem
+ """
return self.local_endpoint
@property
@@ -57,23 +73,29 @@ def is_connected(self):
def connect(self):
"""
- Attempt connection to the public and private endpoints
+ Attempt connection to the public and private remote endpoints
"""
if not self.is_connected:
with warnings.catch_warnings():
warnings.simplefilter("ignore") # No quota warning
self.public_endpoint = self.get_default_endpoint("public")
self.private_endpoint = self.get_default_endpoint("private")
- self.public_endpoint.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)}
+ # self.public_endpoint.client_kwargs = {"timeout": ClientTimeout(total=3600, connect=1000)}
def get_default_endpoint(self, endpoint: str) -> AbstractFileSystem:
"""
Return a default endpoint for the given str [public, private]
"""
if endpoint == "private":
- return fsspec.filesystem("gs")
+ return fsspec.filesystem(
+ "s3",
+ key=self.KEY,
+ secret=self.SECRET,
+ endpoint_url=ioqdc.request_s3fs_config()["endpoint_url"],
+ )
elif endpoint == "public":
- return fsspec.filesystem("https")
+ # return fsspec.filesystem("https")
+ return fsspec.filesystem("s3", **ioqdc.request_s3fs_config())
else:
return self.local_endpoint
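Both endpoints are now s3fs filesystems: the public one configured entirely from the hosted `config.json`, the private one additionally authenticated with the credentials loaded from `.env`. A sketch of the expected usage, assuming the config endpoint is reachable; the `ls` call is illustrative:

```python
from openqdc.utils.download_api import FileSystem

fs = FileSystem()  # loads CLOUDFARE_KEY / CLOUDFARE_SECRET from .env if present
print(fs.public.ls("openqdc/v1"))  # read-only listing through the S3 gateway
# fs.private is the write-capable endpoint used for dataset uploads
```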
diff --git a/openqdc/utils/io.py b/openqdc/utils/io.py
index 5e039960..08f69e1a 100644
--- a/openqdc/utils/io.py
+++ b/openqdc/utils/io.py
@@ -3,6 +3,8 @@
import json
import os
import pickle as pkl
+
+# from os.path import join as p_join
from typing import Dict, List, Optional
import fsspec
@@ -23,6 +25,12 @@
"~/.cache/openqdc" if "OPENQDC_CACHE_DIR" not in os.environ else os.path.normpath(os.environ["OPENQDC_CACHE_DIR"])
)
+_OPENQDC_DOWNLOAD_API = {
+ "s3": "/openqdc/v1",
+ # "https" : "https://storage.openqdc.org/v1",
+ "gs": "https://storage.googleapis.com/qmdata-public/openqdc",
+}
+
def set_cache_dir(d):
r"""
@@ -54,9 +62,11 @@ def get_remote_cache(write_access=False) -> str:
Returns the entry point based on the write access.
"""
if write_access:
- remote_cache = "gs://qmdata-public/openqdc"
+ remote_cache = "openqdc/v1" # "gs://qmdata-public/openqdc"
+ # remote_cache = "gs://qmdata-public/openqdc"
else:
- remote_cache = "https://storage.googleapis.com/qmdata-public/openqdc"
+ remote_cache = _OPENQDC_DOWNLOAD_API.get(os.environ.get("OPENQDC_DOWNLOAD_API", "s3"))
+ # remote_cache = "https://storage.googleapis.com/qmdata-public/openqdc"
return remote_cache
@@ -78,6 +88,7 @@ def pull_locally(local_path, overwrite=False):
"""
Retrieve file from remote gs path or local cache
"""
+
remote_path = local_path.replace(get_local_cache(), get_remote_cache())
os.makedirs(os.path.dirname(local_path), exist_ok=True)
if not os.path.exists(local_path) or overwrite:
@@ -85,6 +96,15 @@ def pull_locally(local_path, overwrite=False):
return local_path
+def request_s3fs_config():
+ import httpx
+
+ response = httpx.get("https://storage.openqdc.org/config.json")
+ response.raise_for_status()
+ config = response.json()
+ return config
+
+
def copy_exists(local_path):
remote_path = local_path.replace(get_local_cache(), get_remote_cache())
return os.path.exists(local_path) or API.exists(remote_path)
@@ -150,8 +170,8 @@ def load_hdf5_file(hdf5_file_path: str):
# inorder to enable multiprocessing:
# https://github.com/fsspec/gcsfs/issues/379#issuecomment-839929801
- fsspec.asyn.iothread[0] = None
- fsspec.asyn.loop[0] = None
+ # fsspec.asyn.iothread[0] = None
+ # fsspec.asyn.loop[0] = None
return file
@@ -177,7 +197,7 @@ def load_xyz(path):
return MolFromXYZFile(path)
-def dict_to_atoms(d: dict, ext: bool = False, energy_method: int = 0) -> Atoms:
+def dict_to_atoms(d: Dict, ext: bool = False, energy_method: int = 0) -> Atoms:
"""
Converts dictionary to ase atoms object
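Since the read endpoint is resolved through `_OPENQDC_DOWNLOAD_API`, the legacy bucket stays one environment variable away; note the variable must be set before the remote cache is first resolved:

```python
import os

os.environ["OPENQDC_DOWNLOAD_API"] = "gs"  # default is "s3"

from openqdc.utils.io import get_remote_cache

print(get_remote_cache())
# -> https://storage.googleapis.com/qmdata-public/openqdc
```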
diff --git a/openqdc/utils/package_utils.py b/openqdc/utils/package_utils.py
index 990f6cb3..e1381dad 100644
--- a/openqdc/utils/package_utils.py
+++ b/openqdc/utils/package_utils.py
@@ -1,3 +1,4 @@
+# from openFF package
import importlib
from functools import wraps
from typing import Any, Callable, TypeVar
diff --git a/openqdc/utils/preprocess.py b/openqdc/utils/preprocess.py
deleted file mode 100644
index 1171a68c..00000000
--- a/openqdc/utils/preprocess.py
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Dataset preprocessing."""
-
-import click
-import numpy as np
-from loguru import logger
-
-from openqdc import AVAILABLE_DATASETS
-
-options = list(AVAILABLE_DATASETS.values())
-options_map = {d.__name__.lower(): d for d in options}
-
-
-@click.command()
-@click.option("--dataset", "-d", type=str, default="ani1", help="Dataset name or index.")
-@click.option("--upload", "-u", type=bool, default=False, help="Try to upload it to the remote storage.")
-def preprocess(dataset, upload):
- if dataset not in options_map:
- dataset_id = int(dataset)
- data_class = options[dataset_id]
- else:
- data_class = options_map[dataset]
-
- data_class.no_init().preprocess(upload=upload, overwrite=True)
- data = data_class()
- logger.info(f"Preprocessing {data.__name__}")
-
- n = len(data)
- for i in np.random.choice(n, 3, replace=False):
- x = data[i]
- print(x.name, x.subset, end=" ")
- for k in x:
- if isinstance(x[k], np.ndarray):
- print(k, x[k].shape, end=" ")
- print()
-
-
-if __name__ == "__main__":
- preprocess()
diff --git a/openqdc/utils/regressor.py b/openqdc/utils/regressor.py
index 1d3e50ad..0c23d9b4 100644
--- a/openqdc/utils/regressor.py
+++ b/openqdc/utils/regressor.py
@@ -7,8 +7,6 @@
import pandas as pd
from loguru import logger
-SubSampleFrac = Union[float, int]
-
def non_nan_idxs(array):
"""
@@ -24,7 +22,18 @@ class Solver(ABC):
@staticmethod
@abstractmethod
- def solve(X, Y):
+ def solve(X: np.ndarray, Y: np.ndarray) -> Tuple[np.ndarray, Optional[np.ndarray]]:
+ """
+ Main method to solve the regression problem.
+ Must be implemented in all the subclasses.
+
+ Parameters:
+ X: Input features of shape (n_samples, n_species)
+ Y: Target values of shape (n_samples,) (energy values for the regression)
+
+ Returns:
+ Tuple of predicted values and the estimated uncertainty.
+ """
pass
def __call__(self, X, Y):
@@ -38,7 +47,26 @@ def __repr__(self):
class Regressor:
- """Regressor class for preparing and solving regression problem for isolated atom energies."""
+    """
+    Regressor class for preparing and solving the regression problem for isolated atom energies.
+    An isolated atom energy regression problem is defined as:\n
+    X = [n_samples, n_species] (number of atoms of each species per sample)\n
+    Y = [n_samples, ] (energies)\n
+    The regression problem is solved by solving the linear system X E0 = Y.
+
+    Example:
+        For a system of 2 samples (H2O, CH4) with species ordered as [H, O, C]\n
+        n_species = 3, n_samples = 2\n
+        H2O = 2H, 1O -> X = [2, 1, 0]\n
+        CH4 = 4H, 1C -> X = [4, 0, 1]\n
+        X = [[2, 1, 0],
+            [4, 0, 1]]\n
+        Y = [10, 20]\n
+        X E0 = Y\n
+        Linear system to solve:\n
+        [[2 eH + 1 eO + 0 eC],
+        [4 eH + 0 eO + 1 eC]] = [10, 20]
+    """
solver: Solver
@@ -49,27 +77,29 @@ def __init__(
position_idx_range: np.ndarray,
solver_type: str = "linear",
stride: int = 1,
- subsample: Optional[SubSampleFrac] = None,
+ subsample: Optional[Union[float, int]] = None,
remove_nan: bool = True,
- *args,
- **kwargs,
+ *args: any,
+ **kwargs: any,
):
"""
- Parameters
- ----------
- energies
- numpy array of energies in the shape (n_samples, n_energy_methods)
- atomic_numbers
- numpy array of atomic numbers in the shape (n_atoms,)
- position_idx_range
- array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset
- stride
- Stride to use for the regression.
- subsample
- Sumsample the dataset. If a float, it is interpreted as a fraction of the dataset to use.
- If >1 it is interpreted as the number of samples to use.
- remove_nan
- Sanitize the dataset by removing energies samples with NaN values.
+        Regressor class for preparing and solving the regression problem for isolated atom energies.
+
+ Parameters:
+ energies:
+ numpy array of energies in the shape (n_samples, n_energy_methods)
+ atomic_numbers:
+ numpy array of atomic numbers in the shape (n_atoms,)
+ position_idx_range:
+ array of shape (n_samples, 2) containing the start and end indices of the atoms in the dataset
+ solver_type: Type of solver to use. ["linear", "ridge"]
+ stride: Stride to use for the regression.
+            subsample: Subsample the dataset.
+ If a float, it is interpreted as a fraction of the dataset to use.
+ If >1 it is interpreted as the number of samples to use.
+            remove_nan: Sanitize the dataset by removing energy samples with NaN values.
+ *args: Additional arguments to be passed to the regressor.
+ **kwargs: Additional keyword arguments to be passed to the regressor.
"""
self.subsample = subsample
self.stride = stride
@@ -87,7 +117,19 @@ def __init__(
self._post_init()
@classmethod
- def from_openqdc_dataset(cls, dataset, *args, **kwargs):
+ def from_openqdc_dataset(cls, dataset: any, *args: any, **kwargs: any) -> "Regressor":
+ """
+ Initialize the regressor object from an openqdc dataset. This is the default method.
+        *args and **kwargs are passed to the __init__ method and depend on the specific regressor.
+
+ Parameters:
+ dataset: openqdc dataset object.
+ *args: Additional arguments to be passed to the regressor.
+ **kwargs: Additional keyword arguments to be passed to the regressor.
+
+ Returns:
+ Instance of the regressor class.
+ """
energies = dataset.data["energies"]
position_idx_range = dataset.data["position_idx_range"]
atomic_numbers = dataset.data["atomic_inputs"][:, 0].astype("int32")
@@ -116,12 +158,11 @@ def _downsample(self):
self.update_hparams({"idxs": idxs})
def _get_solver(self):
- if self.solver_type == "linear":
+ try:
+ return AVAILABLE_SOLVERS[self.solver_type]()
+ except KeyError:
+ logger.warning(f"Unknown solver type {self.solver_type}, defaulting to linear regression.")
return LinearSolver()
- elif self.solver_type == "ridge":
- return RidgeSolver()
- logger.warning(f"Unknown solver type {self.solver_type}, defaulting to linear regression.")
- return LinearSolver()
def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:
logger.info("Preparing inputs for regression.")
@@ -137,6 +178,9 @@ def _prepare_inputs(self) -> Tuple[np.ndarray, np.ndarray]:
self.y = B
def solve(self):
+ """
+ Solve the regression problem and return the predicted isolated energies and the estimated uncertainty.
+ """
logger.info(f"Solving regression with {self.solver}.")
E0_list, cov_list = [], []
for energy_idx in range(self.y.shape[1]):
@@ -157,6 +201,11 @@ def __call__(self):
def atom_standardization(X, y):
+ """
+ Standardize the energies and the atom counts.
+ This will make the calculated uncertainty more
+ meaningful.
+ """
X_norm = X.sum()
X = X / X_norm
y = y / X_norm
@@ -165,7 +214,14 @@ def atom_standardization(X, y):
class LinearSolver(Solver):
- _regr_str = "LinearRegression"
+ """
+ Linear regression solver.
+
+ Note:
+        No uncertainty is returned, as it is typically negligible for this solver.
+ """
+
+ _regr_str = "linear"
@staticmethod
def solve(X, y):
@@ -175,7 +231,11 @@ def solve(X, y):
class RidgeSolver(Solver):
- _regr_str = "RidgeRegression"
+ """
+ Ridge regression solver.
+ """
+
+ _regr_str = "ridge"
@staticmethod
def solve(X, y):
@@ -189,3 +249,10 @@ def solve(X, y):
cov = np.sqrt(sigma2 * np.einsum("ij,kj,kl,li->i", Ainv, X, X, Ainv))
mean = mean + y_mean.reshape([-1])
return mean, cov
+
+
+AVAILABLE_SOLVERS = {
+ cls._regr_str: cls
+ for str_name, cls in globals().items()
+ if isinstance(cls, type) and issubclass(cls, Solver) and str_name != "Solver" # Exclude the base class
+}
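Since the registry keys now match the `solver_type` strings, `Regressor(..., solver_type="ridge")` resolves through `AVAILABLE_SOLVERS`, and unknown names fall back to linear regression with a warning. The docstring example can also be checked with plain numpy (a sketch; `LinearSolver.solve` itself is not shown in this diff):

```python
import numpy as np

X = np.array([[2.0, 1.0, 0.0],   # H2O: 2 H, 1 O, 0 C
              [4.0, 0.0, 1.0]])  # CH4: 4 H, 0 O, 1 C
Y = np.array([10.0, 20.0])       # toy energies
# 2 samples, 3 species: underdetermined, so lstsq returns the minimum-norm E0
E0, *_ = np.linalg.lstsq(X, Y, rcond=None)
print(E0)  # [eH, eO, eC]
```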
diff --git a/openqdc/utils/units.py b/openqdc/utils/units.py
index d8613a58..898faf8a 100644
--- a/openqdc/utils/units.py
+++ b/openqdc/utils/units.py
@@ -1,11 +1,14 @@
"""
-Unit conversion utils.
+Units conversion utilities module.
-Energy units:
- ["kcal/mol", "kj/mol", "hartree", "ev"]
+Available Energy units:
+    ["kcal/mol", "kj/mol", "hartree", "ev", "mev", "ryd"]
-Distance units:
+Available Distance units:
["ang", "nm", "bohr"]
+
+Available Force units:
+ Combinations between Energy and Distance units
"""
from enum import Enum, unique
@@ -40,7 +43,16 @@ class EnergyTypeConversion(ConversionEnum, StrEnum):
MEV = "mev"
RYD = "ryd"
- def to(self, energy: "EnergyTypeConversion"):
+ def to(self, energy: "EnergyTypeConversion") -> Callable[[float], float]:
+ """
+ Get the conversion function to convert the energy to the desired units.
+
+ Parameters:
+ energy: energy unit to convert to
+
+ Returns:
+        Callable to convert the energy to the desired units
+ """
return get_conversion(str(self), str(energy))
@@ -54,7 +66,17 @@ class DistanceTypeConversion(ConversionEnum, StrEnum):
NM = "nm"
BOHR = "bohr"
- def to(self, distance: "DistanceTypeConversion", fraction: bool = False):
+ def to(self, distance: "DistanceTypeConversion", fraction: bool = False) -> Callable[[float], float]:
+ """
+ Get the conversion function to convert the distance to the desired units.
+
+ Parameters:
+ distance: distance unit to convert to
+ fraction: whether it is distance^1 or distance^-1
+
+ Returns:
+        Callable to convert the distance to the desired units
+ """
return get_conversion(str(self), str(distance)) if not fraction else get_conversion(str(distance), str(self))
@@ -91,33 +113,32 @@ def __init__(self, energy: EnergyTypeConversion, distance: DistanceTypeConversio
def __str__(self):
return f"{self.energy}/{self.distance}"
- def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion):
+ def to(self, energy: EnergyTypeConversion, distance: DistanceTypeConversion) -> Callable[[float], float]:
+ """
+ Get the conversion function to convert the force to the desired units.
+
+ Parameters:
+ energy: energy unit to convert to
+ distance: distance unit to convert to
+
+ Returns:
+        Callable to convert the force to the desired units
+ """
return lambda x: self.distance.to(distance, fraction=True)(self.energy.to(energy)(x))
class Conversion:
"""
- Conversion from one unit system to another.
-
- Attributes
- ----------
- name
- A human-readable name for the conversion
- fn:
- The callable to compute the conversion
+ Conversion from one unit system to another defined by a name and a callable
"""
def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
"""
- Parameters
- ----------
- in_unit
- String defining the units of the current values
- out_unit
- String defining the target units
- func
- The callable to compute the conversion
+ Parameters:
+ in_unit: String defining the units of the current values
+ out_unit: String defining the target units
+ func: The callable to compute the conversion
"""
name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
@@ -129,11 +150,20 @@ def __init__(self, in_unit: str, out_unit: str, func: Callable[[float], float]):
self.fn = func
def __call__(self, x):
- """Convert measure"""
return self.fn(x)
-def get_conversion(in_unit: str, out_unit: str):
+def get_conversion(in_unit: str, out_unit: str) -> Callable[[float], float]:
+ """
+ Utility function to get the conversion function between two units.
+
+ Parameters:
+ in_unit : The input unit
+ out_unit : The output unit
+
+ Returns:
+ The conversion function
+ """
name = "convert_" + in_unit.lower().strip() + "_to_" + out_unit.lower().strip()
if in_unit.lower().strip() == out_unit.lower().strip():
return lambda x: x
@@ -142,6 +172,8 @@ def get_conversion(in_unit: str, out_unit: str):
return CONVERSION_REGISTRY[name]
+# Conversion definitions
+
# ev conversion
Conversion("ev", "kcal/mol", lambda x: x * 23.0605)
Conversion("ev", "hartree", lambda x: x * 0.0367493)
diff --git a/pyproject.toml b/pyproject.toml
index 43f414b8..d5e6a002 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,10 @@ dependencies = [
"ase" ,
"gdown",
"h5py >= 3.8.0" ,
- "dscribe"
+ "dscribe",
+ "zarr",
+ "python-dotenv",
+    "s3fs",
+    "httpx",  # used by request_s3fs_config in openqdc/utils/io.py
]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..0718b5fb
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,8 @@
+[pytest]
+tmp_path_retention_policy = none
+filterwarnings =
+ ignore::DeprecationWarning
+ ignore::UserWarning
+
+markers =
+ download: tests for datasets downloading
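The `download` marker registered here gates the network-bound test added below, so it can be selected or excluded at collection time; the equivalent of `pytest -m download` from Python:

```python
import pytest

pytest.main(["-m", "download"])      # run only the download-marked tests
pytest.main(["-m", "not download"])  # run everything else
```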
diff --git a/tests/test_download.py b/tests/test_download.py
new file mode 100644
index 00000000..dd53d405
--- /dev/null
+++ b/tests/test_download.py
@@ -0,0 +1,15 @@
+from os.path import join as p_join
+from pathlib import Path
+
+import pytest
+
+from openqdc.datasets import QM7
+
+
+@pytest.mark.download
+def test_API_download(tmp_path, monkeypatch):
+ monkeypatch.chdir(tmp_path)
+ ds = QM7(cache_dir=tmp_path)
+ for filename in ["energies.mmap", "position_idx_range.mmap", "atomic_inputs.mmap", "props.pkl"]:
+ assert (Path(p_join(tmp_path, ds.preprocess_path, filename))).exists()
+ monkeypatch.undo()